diff --git a/.appveyor.yml b/.appveyor.yml index ba3f8921..7dccb660 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,26 +1,27 @@ -image: Visual Studio 2017 +environment: + TREE_SITTER_TEST: true + +build: false + +install: + - git submodule update --init --recursive + + - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe + - rustup-init -yv --default-toolchain stable + - set PATH=%PATH%;%USERPROFILE%\.cargo\bin + - rustc -vV + - cargo -vV + + - script\fetch-test-fixtures.cmd + +test_script: + - cargo build + - cargo test branches: only: - master -platform: - - x86 - - x64 - -init: - - git config --global core.autocrlf false - -install: - - IF "%PLATFORM%" == "x86" (call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars32.bat") - - IF "%PLATFORM%" == "x64" (call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat") - - script\configure.cmd - - script\fetch-fixtures.cmd - -test_script: - - script\test.cmd - -build: off - cache: - - test\fixtures\grammars + - test\fixtures + - C:\Users\appveyor\.cargo diff --git a/.clang-format b/.clang-format deleted file mode 100644 index 633dfd06..00000000 --- a/.clang-format +++ /dev/null @@ -1,65 +0,0 @@ ---- -Language: Cpp -AccessModifierOffset: -1 -AlignAfterOpenBracket: true -AlignConsecutiveAssignments: false -AlignEscapedNewlinesLeft: true -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 2 -Cpp11BracedListStyle: false -DerivePointerAlignment: true -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -IndentCaseLabels: true -IndentWidth: 2 -IndentWrappedFunctionNames: true -KeepEmptyLinesAtTheStartOfBlocks: true -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 60 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 20 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -SpaceAfterCStyleCast: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -TabWidth: 8 -UseTab: Never -... 
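Both CI configurations above export a TREE_SITTER_TEST environment variable (AppVeyor sets `TREE_SITTER_TEST: true`, Travis sets `TREE_SITTER_TEST=1`) before running `cargo test`. A minimal, hypothetical sketch of how a test could consume that variable — the function name and skip-and-return behavior are illustrative assumptions, not code from this change:

    use std::env;

    // Hypothetical illustration (not part of this diff): gate expensive
    // fixture-based tests on the TREE_SITTER_TEST variable set by CI.
    fn fixture_tests_enabled() -> bool {
        // AppVeyor sets TREE_SITTER_TEST=true; Travis sets TREE_SITTER_TEST=1.
        env::var("TREE_SITTER_TEST").map_or(false, |v| v == "true" || v == "1")
    }

    #[test]
    fn parses_fixture_grammars() {
        if !fixture_tests_enabled() {
            eprintln!("skipping: TREE_SITTER_TEST is not set");
            return;
        }
        // ... exercise grammars fetched into test/fixtures by the fetch scripts ...
    }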
diff --git a/.clang_complete b/.clang_complete deleted file mode 100644 index 91ab8360..00000000 --- a/.clang_complete +++ /dev/null @@ -1,8 +0,0 @@ --std=c++14 --Isrc --Itest --Iinclude --Iexternals/utf8proc --Iexternals/json-parser --Iexternals/bandit --Iexternals/crypto-algorithms diff --git a/.gitignore b/.gitignore index 7cee7e33..23c82fe6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,22 +1,11 @@ -# Compiled binaries -out -*.a -*.o - -fuzz-results log.html - -# Generated build config files -gyp-mac-tool -Makefile -*.Makefile -*.target.mk - -# IDE files .idea *.xcodeproj - -# Dev dependencies +*.a +*.o +fuzz-results test/fixtures/grammars/* !test/fixtures/grammars/.gitkeep -externals/cpplint.py + +/target +**/*.rs.bk diff --git a/.gitmodules b/.gitmodules index bdbfaaf8..6e45ee19 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,15 +1,3 @@ -[submodule "externals/bandit"] - path = externals/bandit - url = https://github.com/joakimkarlsson/bandit.git -[submodule "externals/gyp"] - path = externals/gyp - url = https://github.com/svn2github/gyp.git [submodule "externals/utf8proc"] - path = externals/utf8proc + path = lib/utf8proc url = https://github.com/julialang/utf8proc -[submodule "externals/json-parser"] - path = externals/json-parser - url = https://github.com/udp/json-parser.git -[submodule "externals/crypto-algorithms"] - path = externals/crypto-algorithms - url = https://github.com/maxbrunsfeld/crypto-algorithms.git diff --git a/.travis.yml b/.travis.yml index b556a5e9..65c021cf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,27 +1,13 @@ -sudo: false -dist: trusty -language: cpp -compiler: -- gcc +language: rust -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-5 - - clang +rust: + - stable -install: -- export CXX="g++-5" -- scan-build script/configure +env: + - TREE_SITTER_TEST=1 -script: -- script/ci - -cache: - directories: - - test/fixtures/grammars +before_install: + - ./script/fetch-test-fixtures.sh branches: only: diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..758dcad7 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,837 @@ +[[package]] +name = "aho-corasick" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "argon2rs" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)", + "scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "arrayvec" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "atty" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+dependencies = [ + "backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "backtrace-sys" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "blake2-rfc" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "byteorder" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cc" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cfg-if" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clap" +version = "2.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-channel" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 
(registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "crossbeam-utils" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "dirs" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure_derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fnv" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "globset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "hashbrown" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ignore" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + 
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "indexmap" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "itoa" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lazy_static" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.44" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libloading" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libsqlite3-sys" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "linked-hash-map" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lock_api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lru-cache" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memchr" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memoffset" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "owning_ref" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 
(registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_syscall" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "redox_users" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rusqlite" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", + "lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ryu" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "same-file" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scopeguard" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "smallbitvec" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "smallvec" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "stable_deref_trait" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "syn" +version = "0.15.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 
(registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "synstructure" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termion" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "textwrap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "time" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tree-sitter" +version = "0.3.5" +dependencies = [ + "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tree-sitter-cli" +version = "0.1.0" +dependencies = [ + "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.5", +] + +[[package]] +name = "ucd-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = 
"unicode-width" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vcpkg" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "walkdir" +version = "2.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +"checksum argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3f67b0b6a86dae6e67ff4ca2b6201396074996379fba2b92ff649126f37cb392" +"checksum arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "f405cc4c21cd8b784f6c8fc2adf9bc00f59558f0049b5ec21517f875963040cc" +"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" +"checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" +"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" +"checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = 
"5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "94f88df23a25417badc922ab0f5716cc1330e87f71ddd9203b3a3ccd9cedf75d" +"checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" +"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" +"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e" +"checksum crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b85741761b7f160bc5e7e0c14986ef685b7f8bf9b7ad081c60c604bb4649827" +"checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8" +"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015" +"checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816" +"checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a" +"checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7" +"checksum failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596" +"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" +"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +"checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e" +"checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" +"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" +"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = 
"10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" +"checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" +"checksum libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d3711dfd91a1081d2458ad2d06ea30a8755256e74038be2ad927d94e1c955ca8" +"checksum linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7860ec297f7008ff7a1e3382d7f7e1dcd69efc94751a2284bafc3d013c2aa939" +"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" +"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" +"checksum lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d06ff7ff06f729ce5f4e227876cb88d10bc59cd4ae1e09fbb2bde15c850dc21" +"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" +"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" +"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" +"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" +"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" +"checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" +"checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" +"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" +"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" +"checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372" +"checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db" +"checksum redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)" = "679da7508e9a6390aeaf7fbd02a800fdc64b73fe2204dd2c8ae66d22d9d5ad5d" +"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +"checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" +"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c" +"checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" +"checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" +"checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" +"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +"checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" +"checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" +"checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" +"checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" +"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" +"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" +"checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" +"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" +"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" +"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = 
"d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b" +"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" +"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..75d3b403 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,6 @@ +[workspace] + +members = [ + "cli", + "lib", +] diff --git a/LICENSE b/LICENSE index b6d8763b..971b81f9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,21 @@ -Copyright 2014 Max Brunsfeld +The MIT License (MIT) -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +Copyright (c) 2018 Max Brunsfeld -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
new file mode 100644
index 00000000..6a9c253d
--- /dev/null
+++ b/cli/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "tree-sitter-cli"
+version = "0.1.0"
+authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
+edition = "2018"
+
+[[bin]]
+name = "tree-sitter"
+path = "src/main.rs"
+
+[dependencies]
+lazy_static = "1.2.0"
+smallbitvec = "2.3.0"
+clap = "2.32"
+dirs = "1.0.2"
+hashbrown = "0.1"
+ignore = "0.4.4"
+libloading = "0.5"
+rusqlite = "0.14.0"
+serde = "1.0"
+serde_derive = "1.0"
+regex-syntax = "0.6.4"
+
+[dependencies.tree-sitter]
+path = "../lib"
+
+[dependencies.serde_json]
+version = "1.0"
+features = ["preserve_order"]
+
+[dependencies.log]
+version = "0.4.6"
+features = ["std"]
diff --git a/cli/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs
new file mode 100644
index 00000000..bcc1bf3d
--- /dev/null
+++ b/cli/src/build_tables/build_lex_table.rs
@@ -0,0 +1,278 @@
+use super::item::LookaheadSet;
+use super::token_conflicts::TokenConflictMap;
+use crate::grammars::{LexicalGrammar, SyntaxGrammar};
+use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
+use crate::rules::Symbol;
+use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
+use std::collections::hash_map::Entry;
+use std::collections::{BTreeMap, HashMap, VecDeque};
+
+pub(crate) fn build_lex_table(
+    parse_table: &mut ParseTable,
+    syntax_grammar: &SyntaxGrammar,
+    lexical_grammar: &LexicalGrammar,
+    keywords: &LookaheadSet,
+    minimize: bool,
+) -> (LexTable, LexTable) {
+    let keyword_lex_table;
+    if syntax_grammar.word_token.is_some() {
+        let mut builder = LexTableBuilder::new(lexical_grammar);
+        builder.add_state_for_tokens(keywords);
+        keyword_lex_table = builder.table;
+    } else {
+        keyword_lex_table = LexTable::default();
+    }
+
+    let mut builder = LexTableBuilder::new(lexical_grammar);
+    for state in parse_table.states.iter_mut() {
+        let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| {
+            if token.is_terminal() {
+                if keywords.contains(&token) {
+                    syntax_grammar.word_token
+                } else {
+                    Some(*token)
+                }
+            } else if token.is_eof() {
+                Some(*token)
+            } else {
+                None
+            }
+        }));
+        state.lex_state_id = builder.add_state_for_tokens(&tokens);
+    }
+
+    let mut table = builder.table;
+
+    if minimize {
+        minimize_lex_table(&mut table, parse_table);
+    }
+
+    (table, keyword_lex_table)
+}
+
+struct QueueEntry {
+    state_id: usize,
+    nfa_states: Vec<u32>,
+    eof_valid: bool,
+}
+
+struct LexTableBuilder<'a> {
+    lexical_grammar: &'a LexicalGrammar,
+    cursor: NfaCursor<'a>,
+    table: LexTable,
+    state_queue: VecDeque<QueueEntry>,
+    state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
+}
+
+impl<'a> LexTableBuilder<'a> {
+    fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
+        Self {
+            lexical_grammar,
+            cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
+            table: LexTable::default(),
+            state_queue: VecDeque::new(),
+            state_ids_by_nfa_state_set: HashMap::new(),
+        }
+    }
+
+    fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize {
+        let mut eof_valid = false;
+        let nfa_states = tokens
+            .iter()
+            .filter_map(|token| {
+                if token.is_terminal() {
+                    Some(self.lexical_grammar.variables[token.index].start_state)
+                } else {
+                    eof_valid = true;
+                    None
+                }
+            })
+            .collect();
+        let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
+
+        if is_new {
+            info!(
+                "entry point state: {}, tokens: {:?}",
+                state_id,
+                tokens
+                    .iter()
+                    .map(|t| &self.lexical_grammar.variables[t.index].name)
+                    .collect::<Vec<_>>()
+            );
+        }
+
+        while let Some(QueueEntry {
+            state_id,
+            nfa_states,
+            eof_valid,
+        }) = self.state_queue.pop_front()
+        {
+            self.populate_state(state_id, nfa_states, eof_valid);
+        }
+        state_id
+    }
+
+    fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
+        self.cursor.reset(nfa_states);
+        match self
+            .state_ids_by_nfa_state_set
+            .entry((self.cursor.state_ids.clone(), eof_valid))
+        {
+            Entry::Occupied(o) => (*o.get(), false),
+            Entry::Vacant(v) => {
+                let state_id = self.table.states.len();
+                self.table.states.push(LexState::default());
+                self.state_queue.push_back(QueueEntry {
+                    state_id,
+                    nfa_states: v.key().0.clone(),
+                    eof_valid,
+                });
+                v.insert(state_id);
+                (state_id, true)
+            }
+        }
+    }
+
+    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
+        self.cursor.force_reset(nfa_states);
+
+        // The EOF state is represented as an empty list of NFA states.
+        let mut completion = None;
+        for (id, prec) in self.cursor.completions() {
+            if let Some((prev_id, prev_precedence)) = completion {
+                if TokenConflictMap::prefer_token(
+                    self.lexical_grammar,
+                    (prev_precedence, prev_id),
+                    (prec, id),
+                ) {
+                    continue;
+                }
+            }
+            completion = Some((id, prec));
+        }
+
+        info!(
+            "lex state: {}, completion: {:?}",
+            state_id,
+            completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec))
+        );
+
+        let transitions = self.cursor.transitions();
+        info!("lex state: {}, transitions: {:?}", state_id, transitions);
+
+        // If EOF is a valid lookahead token, add a transition predicated on the null
+        // character that leads to the empty set of NFA states.
+        if eof_valid {
+            let (next_state_id, _) = self.add_state(Vec::new(), false);
+            info!("lex state: {}, successor: EOF", state_id);
+            self.table.states[state_id].advance_actions.push((
+                CharacterSet::empty().add_char('\0'),
+                AdvanceAction {
+                    state: Some(next_state_id),
+                    in_main_token: true,
+                },
+            ));
+        }
+
+        for NfaTransition {
+            characters,
+            precedence,
+            states,
+            is_separator,
+        } in transitions
+        {
+            if let Some((_, completed_precedence)) = completion {
+                if precedence < completed_precedence
+                    || (precedence == completed_precedence && is_separator)
+                {
+                    continue;
+                }
+            }
+            let (next_state_id, _) = self.add_state(states, eof_valid && is_separator);
+            let next_state = if next_state_id == state_id {
+                None
+            } else {
+                Some(next_state_id)
+            };
+            self.table.states[state_id].advance_actions.push((
+                characters,
+                AdvanceAction {
+                    state: next_state,
+                    in_main_token: !is_separator,
+                },
+            ));
+        }
+
+        if let Some((complete_id, _)) = completion {
+            self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
+        } else if self.cursor.state_ids.is_empty() {
+            self.table.states[state_id].accept_action = Some(Symbol::end());
+        }
+    }
+}
+
+fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
+    let mut state_replacements = BTreeMap::new();
+    let mut done = false;
+    while !done {
+        done = true;
+        for (i, state_i) in table.states.iter().enumerate() {
+            if state_replacements.contains_key(&i) {
+                continue;
+            }
+            for (j, state_j) in table.states.iter().enumerate() {
+                if j == i {
+                    break;
+                }
+                if state_replacements.contains_key(&j) {
+                    continue;
+                }
+                if state_i == state_j {
+                    info!("replace state {} with state {}", i, j);
+                    state_replacements.insert(i, j);
+                    done = false;
+                    break;
+                }
+            }
+        }
+        for state in table.states.iter_mut() {
+            for (_, advance_action) in state.advance_actions.iter_mut() {
+                advance_action.state = advance_action
+                    .state
+                    .map(|s| state_replacements.get(&s).cloned().unwrap_or(s))
+            }
+        }
+    }
+
+    let final_state_replacements = (0..table.states.len())
+        .into_iter()
+        .map(|state_id| {
+            let replacement = state_replacements
+                .get(&state_id)
+                .cloned()
+                .unwrap_or(state_id);
+            let prior_removed = state_replacements
+                .iter()
+                .take_while(|i| *i.0 < replacement)
+                .count();
+            replacement - prior_removed
+        })
+        .collect::<Vec<_>>();
+
+    for state in parse_table.states.iter_mut() {
+        state.lex_state_id = final_state_replacements[state.lex_state_id];
+    }
+
+    for state in table.states.iter_mut() {
+        for (_, advance_action) in state.advance_actions.iter_mut() {
+            advance_action.state = advance_action.state.map(|s| final_state_replacements[s]);
+        }
+    }
+
+    let mut i = 0;
+    table.states.retain(|_| {
+        let result = !state_replacements.contains_key(&i);
+        i += 1;
+        result
+    });
+}
diff --git a/cli/src/build_tables/build_parse_table.rs b/cli/src/build_tables/build_parse_table.rs
new file mode 100644
index 00000000..cda1d7ea
--- /dev/null
+++ b/cli/src/build_tables/build_parse_table.rs
@@ -0,0 +1,735 @@
+use super::item::{LookaheadSet, ParseItem, ParseItemSet};
+use super::item_set_builder::ParseItemSetBuilder;
+use crate::error::{Error, Result};
+use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
+use crate::rules::{Alias, Associativity, Symbol, SymbolType};
+use crate::tables::{
+    AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+};
+use core::ops::Range;
+use hashbrown::hash_map::Entry;
+use hashbrown::{HashMap, HashSet};
+use std::collections::hash_map::DefaultHasher;
+use std::collections::VecDeque;
+
+use std::fmt::Write;
+use std::hash::Hasher;
+
+#[derive(Clone)]
+struct AuxiliarySymbolInfo {
+    auxiliary_symbol: Symbol,
+    parent_symbols: Vec<Symbol>,
+}
+
+type SymbolSequence = Vec<Symbol>;
+type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
+
+struct ParseStateQueueEntry {
+    preceding_symbols: SymbolSequence,
+    preceding_auxiliary_symbols: AuxiliarySymbolSequence,
+    state_id: ParseStateId,
+}
+
+struct ParseTableBuilder<'a> {
+    item_set_builder: ParseItemSetBuilder<'a>,
+    syntax_grammar: &'a SyntaxGrammar,
+    lexical_grammar: &'a LexicalGrammar,
+    state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
+    item_sets_by_state_id: Vec<ParseItemSet<'a>>,
+    parse_state_queue: VecDeque<ParseStateQueueEntry>,
+    parse_table: ParseTable,
+    following_tokens: Vec<LookaheadSet>,
+    state_ids_to_log: Vec<usize>,
+}
+
+impl<'a> ParseTableBuilder<'a> {
+    fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> {
+        // Ensure that the empty alias sequence has index 0.
+        self.parse_table.alias_sequences.push(Vec::new());
+
+        // Add the error state at index 0.
+        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
+
+        // Add the starting state at index 1.
+        self.add_parse_state(
+            &Vec::new(),
+            &Vec::new(),
+            ParseItemSet::with(
+                [(
+                    ParseItem::start(),
+                    LookaheadSet::with([Symbol::end()].iter().cloned()),
+                )]
+                .iter()
+                .cloned(),
+            ),
+        );
+
+        while let Some(entry) = self.parse_state_queue.pop_front() {
+            let item_set = self
+                .item_set_builder
+                .transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
+
+            if self.state_ids_to_log.contains(&entry.state_id) {
+                eprintln!(
+                    "state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}",
+                    entry.state_id,
+                    super::item::ParseItemSetDisplay(
+                        &self.item_sets_by_state_id[entry.state_id],
+                        self.syntax_grammar,
+                        self.lexical_grammar,
+                    ),
+                    super::item::ParseItemSetDisplay(
+                        &item_set,
+                        self.syntax_grammar,
+                        self.lexical_grammar,
+                    )
+                );
+            }
+
+            self.add_actions(
+                entry.preceding_symbols,
+                entry.preceding_auxiliary_symbols,
+                entry.state_id,
+                item_set,
+            )?;
+        }
+
+        self.populate_used_symbols();
+        self.remove_precedences();
+
+        Ok((self.parse_table, self.following_tokens))
+    }
+
+    fn add_parse_state(
+        &mut self,
+        preceding_symbols: &SymbolSequence,
+        preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
+        item_set: ParseItemSet<'a>,
+    ) -> ParseStateId {
+        if preceding_symbols.len() > 1 {
+            let left_tokens = self
+                .item_set_builder
+                .last_set(&preceding_symbols[preceding_symbols.len() - 2]);
+            let right_tokens = self
+                .item_set_builder
+                .first_set(&preceding_symbols[preceding_symbols.len() - 1]);
+            for left_token in left_tokens.iter() {
+                if left_token.is_terminal() {
+                    self.following_tokens[left_token.index].insert_all(right_tokens);
+                }
+            }
+        }
+
+        let mut hasher = DefaultHasher::new();
+        item_set.hash_unfinished_items(&mut hasher);
+        let unfinished_item_signature = hasher.finish();
+
+        match self.state_ids_by_item_set.entry(item_set) {
+            Entry::Occupied(o) => *o.get(),
+            Entry::Vacant(v) => {
+                let state_id = self.parse_table.states.len();
+                self.item_sets_by_state_id.push(v.key().clone());
+                self.parse_table.states.push(ParseState {
+                    lex_state_id: 0,
+                    terminal_entries: HashMap::new(),
+                    nonterminal_entries: HashMap::new(),
+                    unfinished_item_signature,
+                });
+                self.parse_state_queue.push_back(ParseStateQueueEntry {
+                    state_id,
+                    preceding_symbols: preceding_symbols.clone(),
+                    preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
+                });
+                v.insert(state_id);
+                state_id
+            }
+        }
+    }
+
+    fn add_actions(
+        &mut self,
+        mut preceding_symbols: SymbolSequence,
mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>, + state_id: ParseStateId, + item_set: ParseItemSet<'a>, + ) -> Result<()> { + let mut terminal_successors = HashMap::new(); + let mut non_terminal_successors = HashMap::new(); + let mut lookaheads_with_conflicts = HashSet::new(); + + for (item, lookaheads) in &item_set.entries { + if let Some(next_symbol) = item.symbol() { + let successor = item.successor(); + if next_symbol.is_non_terminal() { + // Keep track of where auxiliary non-terminals (repeat symbols) are + // used within visible symbols. This information may be needed later + // for conflict resolution. + if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() { + preceding_auxiliary_symbols + .push(self.get_auxiliary_node_info(&item_set, next_symbol)); + } + + non_terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } else { + terminal_successors + .entry(next_symbol) + .or_insert_with(|| ParseItemSet::default()) + .entries + .entry(successor) + .or_insert_with(|| LookaheadSet::new()) + .insert_all(lookaheads); + } + } else { + let action = if item.is_augmented() { + ParseAction::Accept + } else { + ParseAction::Reduce { + symbol: Symbol::non_terminal(item.variable_index as usize), + child_count: item.step_index as usize, + precedence: item.precedence(), + associativity: item.associativity(), + dynamic_precedence: item.production.dynamic_precedence, + alias_sequence_id: self.get_alias_sequence_id(item), + } + }; + + for lookahead in lookaheads.iter() { + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(lookahead); + let entry = entry.or_insert_with(|| ParseTableEntry::new()); + if entry.actions.is_empty() { + entry.actions.push(action); + } else if action.precedence() > entry.actions[0].precedence() { + entry.actions.clear(); + entry.actions.push(action); + lookaheads_with_conflicts.remove(&lookahead); + } else if action.precedence() == entry.actions[0].precedence() { + entry.actions.push(action); + lookaheads_with_conflicts.insert(lookahead); + } + } + } + } + + for (symbol, next_item_set) in terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + + let entry = self.parse_table.states[state_id] + .terminal_entries + .entry(symbol); + if let Entry::Occupied(e) = &entry { + if !e.get().actions.is_empty() { + lookaheads_with_conflicts.insert(symbol); + } + } + + entry + .or_insert_with(|| ParseTableEntry::new()) + .actions + .push(ParseAction::Shift { + state: next_state_id, + is_repetition: false, + }); + } + + for (symbol, next_item_set) in non_terminal_successors { + preceding_symbols.push(symbol); + let next_state_id = self.add_parse_state( + &preceding_symbols, + &preceding_auxiliary_symbols, + next_item_set, + ); + preceding_symbols.pop(); + self.parse_table.states[state_id] + .nonterminal_entries + .insert(symbol, next_state_id); + } + + for symbol in lookaheads_with_conflicts { + self.handle_conflict( + &item_set, + state_id, + &preceding_symbols, + &preceding_auxiliary_symbols, + symbol, + )?; + } + + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + + Ok(()) + }
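The arbitration in the reduce loop above can be summarized with a minimal standalone sketch. The `push_action` helper below is hypothetical (not part of this patch) and uses bare `i32` precedences in place of `ParseAction`: a higher-precedence action replaces the existing entry, an equal-precedence action is appended and the lookahead becomes a conflict for `handle_conflict` to resolve, and a lower-precedence action is dropped.

fn push_action(actions: &mut Vec<i32>, precedence: i32) -> bool {
    // Returns true when the lookahead must be marked as conflicting.
    if actions.is_empty() {
        actions.push(precedence);
        false
    } else if precedence > actions[0] {
        // A strictly higher-precedence action evicts everything else.
        actions.clear();
        actions.push(precedence);
        false
    } else if precedence == actions[0] {
        // Equal precedence: keep both actions; this is a genuine conflict.
        actions.push(precedence);
        true
    } else {
        // Strictly lower precedence: drop the new action.
        false
    }
}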
+ + fn handle_conflict( + &mut self, + item_set: &ParseItemSet, + state_id: ParseStateId, + preceding_symbols: &SymbolSequence, + preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>, + conflicting_lookahead: Symbol, + ) -> Result<()> { + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + + // Determine which items in the set conflict with each other, and the + // precedences associated with SHIFT vs REDUCE actions. There won't + // be multiple REDUCE actions with different precedences; that is + // sorted out ahead of time in `add_actions`. But there can still be + // REDUCE-REDUCE conflicts where all actions have the *same* + // precedence, and there can still be SHIFT/REDUCE conflicts. + let reduce_precedence = entry.actions[0].precedence(); + let mut considered_associativity = false; + let mut shift_precedence: Option<Range<i32>> = None; + let mut conflicting_items = HashSet::new(); + for (item, lookaheads) in &item_set.entries { + if let Some(step) = item.step() { + if item.step_index > 0 { + if self + .item_set_builder + .first_set(&step.symbol) + .contains(&conflicting_lookahead) + { + conflicting_items.insert(item); + let precedence = item.precedence(); + if let Some(range) = &mut shift_precedence { + if precedence < range.start { + range.start = precedence; + } else if precedence > range.end { + range.end = precedence; + } + } else { + shift_precedence = Some(precedence..precedence); + } + } + } + } else if lookaheads.contains(&conflicting_lookahead) { + conflicting_items.insert(item); + } + } + + if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() { + let shift_precedence = shift_precedence.unwrap_or(0..0); + + // If all of the items in the conflict have the same parent symbol, + // and that parent symbol is auxiliary, then this is just the intentional + // ambiguity associated with a repeat rule. Resolve that class of ambiguity + // by leaving it in the parse table, but marking the SHIFT action with + // an `is_repetition` flag. + let conflicting_variable_index = + conflicting_items.iter().next().unwrap().variable_index; + if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() { + if conflicting_items + .iter() + .all(|item| item.variable_index == conflicting_variable_index) + { + *is_repetition = true; + return Ok(()); + } + } + + // If the SHIFT action has higher precedence, remove all the REDUCE actions. + if shift_precedence.start > reduce_precedence + || (shift_precedence.start == reduce_precedence + && shift_precedence.end > reduce_precedence) + { + entry.actions.drain(0..entry.actions.len() - 1); + } + // If the REDUCE actions have higher precedence, remove the SHIFT action. + else if shift_precedence.end < reduce_precedence + || (shift_precedence.end == reduce_precedence + && shift_precedence.start < reduce_precedence) + { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + // If the SHIFT and REDUCE actions have the same precedence, consider + // the REDUCE actions' associativity. + else if shift_precedence == (reduce_precedence..reduce_precedence) { + considered_associativity = true; + let mut has_left = false; + let mut has_right = false; + let mut has_non = false; + for action in &entry.actions { + if let ParseAction::Reduce { associativity, ..
} = action { + match associativity { + Some(Associativity::Left) => has_left = true, + Some(Associativity::Right) => has_right = true, + None => has_non = true, + } + } + } + + // If all reduce actions are left associative, remove the SHIFT action. + // If all reduce actions are right associative, remove the REDUCE actions. + match (has_left, has_non, has_right) { + (true, false, false) => { + entry.actions.pop(); + conflicting_items.retain(|item| item.is_done()); + } + (false, false, true) => { + entry.actions.drain(0..entry.actions.len() - 1); + } + _ => {} + } + } + } + + // If all of the actions but one have been eliminated, then there's no problem. + let entry = self.parse_table.states[state_id] + .terminal_entries + .get_mut(&conflicting_lookahead) + .unwrap(); + if entry.actions.len() == 1 { + return Ok(()); + } + + // Determine the set of parent symbols involved in this conflict. + let mut actual_conflict = Vec::new(); + for item in &conflicting_items { + let symbol = Symbol::non_terminal(item.variable_index as usize); + if self.syntax_grammar.variables[symbol.index].is_auxiliary() { + actual_conflict.extend( + preceding_auxiliary_symbols + .iter() + .rev() + .find_map(|info| { + if info.auxiliary_symbol == symbol { + Some(&info.parent_symbols) + } else { + None + } + }) + .unwrap() + .iter(), + ); + } else { + actual_conflict.push(symbol); + } + } + actual_conflict.sort_unstable(); + actual_conflict.dedup(); + + // If this set of symbols has been whitelisted, then there's no error. + if self + .syntax_grammar + .expected_conflicts + .contains(&actual_conflict) + { + return Ok(()); + } + + let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string(); + for symbol in preceding_symbols { + write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap(); + } + + write!( + &mut msg, + " • {} …\n\n", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + write!(&mut msg, "Possible interpretations:\n\n").unwrap(); + for (i, item) in conflicting_items.iter().enumerate() { + write!(&mut msg, " {}:", i + 1).unwrap(); + + for preceding_symbol in preceding_symbols + .iter() + .take(preceding_symbols.len() - item.step_index as usize) + { + write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap(); + } + + write!( + &mut msg, + " ({}", + &self.syntax_grammar.variables[item.variable_index as usize].name + ) + .unwrap(); + + for (j, step) in item.production.steps.iter().enumerate() { + if j as u32 == item.step_index { + write!(&mut msg, " •").unwrap(); + } + write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap(); + } + + write!(&mut msg, ")").unwrap(); + + if item.is_done() { + write!( + &mut msg, + " • {}", + self.symbol_name(&conflicting_lookahead) + ) + .unwrap(); + } + + let precedence = item.precedence(); + let associativity = item.associativity(); + if precedence != 0 || associativity.is_some() { + write!( + &mut msg, + "(precedence: {}, associativity: {:?})", + precedence, associativity + ) + .unwrap(); + } + + write!(&mut msg, "\n").unwrap(); + } + + let mut resolution_count = 0; + write!(&mut msg, "\nPossible resolutions:\n\n").unwrap(); + let shift_items = conflicting_items + .iter() + .filter(|i| !i.is_done()) + .cloned() + .collect::>(); + if shift_items.len() > 0 { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in", + resolution_count + ) + .unwrap(); + for (i, item) in shift_items.iter().enumerate() { + if i > 0 { + write!(&mut msg, " and").unwrap(); + } + write!( + &mut msg, + " `{}`", + 
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, " than in the other rules.\n").unwrap(); + } + + if considered_associativity { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a left or right associativity in ", + resolution_count + ) + .unwrap(); + for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() { + if i > 0 { + write!(&mut msg, " and ").unwrap(); + } + write!( + &mut msg, + "{}", + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + write!(&mut msg, "\n").unwrap(); + } + + for item in &conflicting_items { + if item.is_done() { + resolution_count += 1; + write!( + &mut msg, + " {}: Specify a higher precedence in `{}` than in the other rules.\n", + resolution_count, + self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)) + ) + .unwrap(); + } + } + + resolution_count += 1; + write!( + &mut msg, + " {}: Add a conflict for these rules: ", + resolution_count + ) + .unwrap(); + for (i, symbol) in actual_conflict.iter().enumerate() { + if i > 0 { + write!(&mut msg, ", ").unwrap(); + } + write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap(); + } + write!(&mut msg, "\n").unwrap(); + + Err(Error(msg)) + } + + fn get_auxiliary_node_info( + &self, + item_set: &ParseItemSet, + symbol: Symbol, + ) -> AuxiliarySymbolInfo { + let parent_symbols = item_set + .entries + .keys() + .filter_map(|item| { + let variable_index = item.variable_index as usize; + if item.symbol() == Some(symbol) + && !self.syntax_grammar.variables[variable_index].is_auxiliary() + { + Some(Symbol::non_terminal(variable_index)) + } else { + None + } + }) + .collect(); + AuxiliarySymbolInfo { + auxiliary_symbol: symbol, + parent_symbols, + } + } + + fn populate_used_symbols(&mut self) { + let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; + let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; + let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; + for state in &self.parse_table.states { + for symbol in state.terminal_entries.keys() { + match symbol.kind { + SymbolType::Terminal => terminal_usages[symbol.index] = true, + SymbolType::External => external_usages[symbol.index] = true, + _ => {} + } + } + for symbol in state.nonterminal_entries.keys() { + non_terminal_usages[symbol.index] = true; + } + } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } + self.parse_table.symbols.push(Symbol::end()); + for (i, value) in terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::terminal(i)); + } + } + for (i, value) in non_terminal_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::non_terminal(i)); + } + } + } + + fn remove_precedences(&mut self) { + for state in self.parse_table.states.iter_mut() { + for (_, entry) in state.terminal_entries.iter_mut() { + for action in entry.actions.iter_mut() { + match action { + ParseAction::Reduce { + precedence, + associativity, + .. 
+ } => { + *precedence = 0; + *associativity = None; + } + _ => {} + } + } + } + } + } + + fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { + let mut alias_sequence: Vec<Option<Alias>> = item + .production + .steps + .iter() + .map(|s| s.alias.clone()) + .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } + if item.production.steps.len() > self.parse_table.max_aliased_production_length { + self.parse_table.max_aliased_production_length = item.production.steps.len() + } + if let Some(index) = self + .parse_table + .alias_sequences + .iter() + .position(|seq| *seq == alias_sequence) + { + index + } else { + self.parse_table.alias_sequences.push(alias_sequence); + self.parse_table.alias_sequences.len() - 1 + } + } + + fn symbol_name(&self, symbol: &Symbol) -> String { + match symbol.kind { + SymbolType::End => "EOF".to_string(), + SymbolType::External => self.syntax_grammar.external_tokens[symbol.index] + .name + .clone(), + SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(), + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + if variable.kind == VariableType::Named { + variable.name.clone() + } else { + format!("\"{}\"", &variable.name) + } + } + } + } +} + +pub(crate) fn build_parse_table( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + inlines: &InlinedProductionMap, + state_ids_to_log: Vec<usize>, +) -> Result<(ParseTable, Vec<LookaheadSet>)> { + ParseTableBuilder { + syntax_grammar, + lexical_grammar, + state_ids_to_log, + item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines), + state_ids_by_item_set: HashMap::new(), + item_sets_by_state_id: Vec::new(), + parse_state_queue: VecDeque::new(), + parse_table: ParseTable { + states: Vec::new(), + symbols: Vec::new(), + alias_sequences: Vec::new(), + max_aliased_production_length: 0, + }, + following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], + } + .build() +} diff --git a/cli/src/build_tables/coincident_tokens.rs b/cli/src/build_tables/coincident_tokens.rs new file mode 100644 index 00000000..62295073 --- /dev/null +++ b/cli/src/build_tables/coincident_tokens.rs @@ -0,0 +1,71 @@ +use crate::grammars::LexicalGrammar; +use crate::rules::Symbol; +use crate::tables::{ParseStateId, ParseTable}; +use std::fmt; + +pub(crate) struct CoincidentTokenIndex<'a> { + entries: Vec<Vec<ParseStateId>>, + grammar: &'a LexicalGrammar, + n: usize, +} + +impl<'a> CoincidentTokenIndex<'a> { + pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self { + let n = lexical_grammar.variables.len(); + let mut result = Self { + n, + grammar: lexical_grammar, + entries: vec![Vec::new(); n * n], + }; + for (i, state) in table.states.iter().enumerate() { + for symbol in state.terminal_entries.keys() { + for other_symbol in state.terminal_entries.keys() { + let index = result.index(symbol.index, other_symbol.index); + if result.entries[index].last().cloned() != Some(i) { + result.entries[index].push(i); + } + } + } + } + result + } + + pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> { + &self.entries[self.index(a.index, b.index)] + } + + pub fn contains(&self, a: Symbol, b: Symbol) -> bool { + !self.entries[self.index(a.index, b.index)].is_empty() + } + + fn index(&self, a: usize, b: usize) -> usize { + if a < b { + a * self.n + b + } else { + b * self.n + a + } + } +} + +impl<'a> fmt::Debug for CoincidentTokenIndex<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f,
"CoincidentTokenIndex {{\n")?; + + write!(f, " entries: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", self.grammar.variables[i].name)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + self.grammar.variables[j].name, + self.entries[self.index(i, j)].len() + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} diff --git a/cli/src/build_tables/item.rs b/cli/src/build_tables/item.rs new file mode 100644 index 00000000..bbd5bbfa --- /dev/null +++ b/cli/src/build_tables/item.rs @@ -0,0 +1,446 @@ +use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; +use crate::rules::Associativity; +use crate::rules::{Symbol, SymbolType}; +use smallbitvec::SmallBitVec; +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::u32; + +lazy_static! { + static ref START_PRODUCTION: Production = Production { + dynamic_precedence: 0, + steps: vec![ProductionStep { + symbol: Symbol { + index: 0, + kind: SymbolType::NonTerminal, + }, + precedence: 0, + associativity: None, + alias: None, + }], + }; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LookaheadSet { + terminal_bits: SmallBitVec, + external_bits: SmallBitVec, + eof: bool, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct ParseItem<'a> { + pub variable_index: u32, + pub step_index: u32, + pub production: &'a Production, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ParseItemSet<'a> { + pub entries: BTreeMap, LookaheadSet>, +} + +pub(crate) struct ParseItemDisplay<'a>( + pub &'a ParseItem<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar, +); + +pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); + +#[allow(dead_code)] +pub(crate) struct ParseItemSetDisplay<'a>( + pub &'a ParseItemSet<'a>, + pub &'a SyntaxGrammar, + pub &'a LexicalGrammar, +); + +impl LookaheadSet { + pub fn new() -> Self { + Self { + terminal_bits: SmallBitVec::new(), + external_bits: SmallBitVec::new(), + eof: false, + } + } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + .chain( + self.external_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::external(i)) + } else { + None + } + }), + ) + .chain(if self.eof { Some(Symbol::end()) } else { None }) + } + + pub fn with(symbols: impl IntoIterator) -> Self { + let mut result = Self::new(); + for symbol in symbols { + result.insert(symbol); + } + result + } + + pub fn contains(&self, symbol: &Symbol) -> bool { + match symbol.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), + SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), + SymbolType::End => self.eof, + } + } + + pub fn insert(&mut self, other: Symbol) { + let vec = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut self.external_bits, + SymbolType::End => { + self.eof = true; + return; + } + }; + if other.index >= vec.len() { + vec.resize(other.index + 1, false); + } + vec.set(other.index, true); + } + + pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + let mut result = 
false; + if other.terminal_bits.len() > self.terminal_bits.len() { + self.terminal_bits.resize(other.terminal_bits.len(), false); + } + if other.external_bits.len() > self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } + for (i, element) in other.terminal_bits.iter().enumerate() { + if element { + result |= !self.terminal_bits[i]; + self.terminal_bits.set(i, element); + } + } + for (i, element) in other.external_bits.iter().enumerate() { + if element { + result |= !self.external_bits[i]; + self.external_bits.set(i, element); + } + } + if other.eof { + result |= !self.eof; + self.eof = true; + } + result + } +} + +impl<'a> ParseItem<'a> { + pub fn start() -> Self { + ParseItem { + variable_index: u32::MAX, + production: &START_PRODUCTION, + step_index: 0, + } + } + + pub fn step(&self) -> Option<&'a ProductionStep> { + self.production.steps.get(self.step_index as usize) + } + + pub fn symbol(&self) -> Option { + self.step().map(|step| step.symbol) + } + + pub fn associativity(&self) -> Option { + self.prev_step().and_then(|step| step.associativity) + } + + pub fn precedence(&self) -> i32 { + self.prev_step().map_or(0, |step| step.precedence) + } + + pub fn prev_step(&self) -> Option<&'a ProductionStep> { + if self.step_index > 0 { + Some(&self.production.steps[self.step_index as usize - 1]) + } else { + None + } + } + + pub fn is_done(&self) -> bool { + self.step_index as usize == self.production.steps.len() + } + + pub fn is_augmented(&self) -> bool { + self.variable_index == u32::MAX + } + + pub fn successor(&self) -> ParseItem<'a> { + ParseItem { + variable_index: self.variable_index, + production: self.production, + step_index: self.step_index + 1, + } + } +} + +impl<'a> ParseItemSet<'a> { + pub fn with(elements: impl IntoIterator, LookaheadSet)>) -> Self { + let mut result = Self::default(); + for (item, lookaheads) in elements { + result.entries.insert(item, lookaheads); + } + result + } + + pub fn hash_unfinished_items(&self, h: &mut impl Hasher) { + let mut previous_variable_index = u32::MAX; + let mut previous_step_index = u32::MAX; + for item in self.entries.keys() { + if item.step().is_none() && item.variable_index != previous_variable_index + || item.step_index != previous_step_index + { + h.write_u32(item.variable_index); + h.write_u32(item.step_index); + previous_variable_index = item.variable_index; + previous_step_index = item.step_index; + } + } + } +} + +impl<'a> Default for ParseItemSet<'a> { + fn default() -> Self { + Self { + entries: BTreeMap::new(), + } + } +} + +#[allow(dead_code)] +impl<'a> fmt::Display for ParseItemDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + if self.0.is_augmented() { + write!(f, "START →")?; + } else { + write!( + f, + "{} →", + &self.1.variables[self.0.variable_index as usize].name + )?; + } + + for (i, step) in self.0.production.steps.iter().enumerate() { + if i == self.0.step_index as usize { + write!(f, " •")?; + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } + } + + write!(f, " ")?; + if step.symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(step.symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", step.symbol.index)?; + } + } else if step.symbol.is_external() { + write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?; + } else { + write!(f, "{}", 
&self.1.variables[step.symbol.index].name)?; + } + + if let Some(alias) = &step.alias { + write!(f, " (alias {})", alias.value)?; + } + } + + if self.0.is_done() { + write!(f, " •")?; + if let Some(step) = self.0.production.steps.last() { + if step.precedence != 0 || step.associativity.is_some() { + write!( + f, + " (prec {:?} assoc {:?})", + step.precedence, step.associativity + )?; + } + } + } + + Ok(()) + } +} + +impl<'a> fmt::Display for LookaheadSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "[")?; + for (i, symbol) in self.0.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + + if symbol.is_terminal() { + if let Some(variable) = self.2.variables.get(symbol.index) { + write!(f, "{}", &variable.name)?; + } else { + write!(f, "{}-{}", "terminal", symbol.index)?; + } + } else if symbol.is_external() { + write!(f, "{}", &self.1.external_tokens[symbol.index].name)?; + } else { + write!(f, "{}", &self.1.variables[symbol.index].name)?; + } + } + write!(f, "]")?; + Ok(()) + } +} + +impl<'a> fmt::Display for ParseItemSetDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + for (item, lookaheads) in self.0.entries.iter() { + writeln!( + f, + "{}\t{}", + ParseItemDisplay(item, self.1, self.2), + LookaheadSetDisplay(lookaheads, self.1, self.2) + )?; + } + Ok(()) + } +} + +impl<'a> Hash for ParseItem<'a> { + fn hash(&self, hasher: &mut H) { + hasher.write_u32(self.variable_index); + hasher.write_u32(self.step_index); + hasher.write_i32(self.production.dynamic_precedence); + hasher.write_usize(self.production.steps.len()); + hasher.write_i32(self.precedence()); + self.associativity().hash(hasher); + for step in &self.production.steps[0..self.step_index as usize] { + step.alias.hash(hasher); + } + for step in &self.production.steps[self.step_index as usize..] 
{ + step.hash(hasher); + } + } +} + +impl<'a> PartialEq for ParseItem<'a> { + fn eq(&self, other: &Self) -> bool { + if self.variable_index != other.variable_index + || self.step_index != other.step_index + || self.production.dynamic_precedence != other.production.dynamic_precedence + || self.production.steps.len() != other.production.steps.len() + || self.precedence() != other.precedence() + || self.associativity() != other.associativity() + { + return false; + } + + for (i, step) in self.production.steps.iter().enumerate() { + if i < self.step_index as usize { + if step.alias != other.production.steps[i].alias { + return false; + } + } else { + if *step != other.production.steps[i] { + return false; + } + } + } + + return true; + } +} + +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + let o = self.variable_index.cmp(&other.variable_index); + if o != Ordering::Equal { + return o; + } + let o = self.step_index.cmp(&other.step_index); + if o != Ordering::Equal { + return o; + } + let o = self + .production + .dynamic_precedence + .cmp(&other.production.dynamic_precedence); + if o != Ordering::Equal { + return o; + } + let o = self + .production + .steps + .len() + .cmp(&other.production.steps.len()); + if o != Ordering::Equal { + return o; + } + let o = self.precedence().cmp(&other.precedence()); + if o != Ordering::Equal { + return o; + } + let o = self.associativity().cmp(&other.associativity()); + if o != Ordering::Equal { + return o; + } + for (i, step) in self.production.steps.iter().enumerate() { + let o = if i < self.step_index as usize { + step.alias.cmp(&other.production.steps[i].alias) + } else { + step.cmp(&other.production.steps[i]) + }; + if o != Ordering::Equal { + return o; + } + } + return Ordering::Equal; + } +} + +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl<'a> Eq for ParseItem<'a> {} + +impl<'a> Hash for ParseItemSet<'a> { + fn hash<H: Hasher>(&self, hasher: &mut H) { + hasher.write_usize(self.entries.len()); + for (item, lookaheads) in self.entries.iter() { + item.hash(hasher); + lookaheads.hash(hasher); + } + } +} diff --git a/cli/src/build_tables/item_set_builder.rs b/cli/src/build_tables/item_set_builder.rs new file mode 100644 index 00000000..939d700c --- /dev/null +++ b/cli/src/build_tables/item_set_builder.rs @@ -0,0 +1,330 @@ +use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet}; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::rules::Symbol; +use hashbrown::{HashMap, HashSet}; +use std::fmt; + +#[derive(Clone, Debug, PartialEq, Eq)] +struct TransitiveClosureAddition<'a> { + item: ParseItem<'a>, + info: FollowSetInfo, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +struct FollowSetInfo { + lookaheads: LookaheadSet, + propagates_lookaheads: bool, +} + +pub(crate) struct ParseItemSetBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + first_sets: HashMap<Symbol, LookaheadSet>, + last_sets: HashMap<Symbol, LookaheadSet>, + inlines: &'a InlinedProductionMap, + transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>, +} + +fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) { + if !vector.contains(&value) { + vector.push(value); + } +} + +impl<'a> ParseItemSetBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + ) -> Self { + let mut result = Self { + syntax_grammar, + lexical_grammar, + first_sets: HashMap::new(), + last_sets: HashMap::new(),
inlines, + transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], + }; + + // For each grammar symbol, populate the FIRST and LAST sets: the set of + // terminals that appear at the beginning and end of that symbol's productions, + // respectively. + // + // For a terminal symbol, the FIRST and LAST sets just consist of the + // terminal itself. + for i in 0..lexical_grammar.variables.len() { + let symbol = Symbol::terminal(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + for i in 0..syntax_grammar.external_tokens.len() { + let symbol = Symbol::external(i); + let mut set = LookaheadSet::new(); + set.insert(symbol); + result.first_sets.insert(symbol, set.clone()); + result.last_sets.insert(symbol, set); + } + + // The FIRST set of a non-terminal `i` is the union of the following sets: + // * the set of all terminals that appear at the beginnings of i's productions + // * the FIRST sets of all the non-terminals that appear at the beginnings + // of i's productions + // + // Rather than computing these sets using recursion, we use an explicit stack + // called `symbols_to_process`. + let mut symbols_to_process = Vec::new(); + let mut processed_non_terminals = HashSet::new(); + for i in 0..syntax_grammar.variables.len() { + let symbol = Symbol::non_terminal(i); + + let first_set = &mut result + .first_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + first_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.first() { + symbols_to_process.push(step.symbol); + } + } + } + } + + // The LAST set is defined in a similar way to the FIRST set. + let last_set = &mut result + .last_sets + .entry(symbol) + .or_insert(LookaheadSet::new()); + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push(symbol); + while let Some(current_symbol) = symbols_to_process.pop() { + if current_symbol.is_terminal() || current_symbol.is_external() { + last_set.insert(current_symbol); + } else if processed_non_terminals.insert(current_symbol) { + for production in syntax_grammar.variables[current_symbol.index] + .productions + .iter() + { + if let Some(step) = production.steps.last() { + symbols_to_process.push(step.symbol); + } + } + } + } + } + + // To compute an item set's transitive closure, we find each item in the set + // whose next symbol is a non-terminal, and we add new items to the set for + // each of that symbol's productions. These productions might themselves begin + // with non-terminals, so the process continues recursively. In this process, + // the total set of entries that get added depends only on two things: + // * the set of non-terminal symbols that occur at each item's current position + // * the set of terminals that occur after each of these non-terminal symbols + // + // So we can avoid a lot of duplicated recursive work by precomputing, for each + // non-terminal symbol `i`, a final list of *additions* that must be made to an + // item set when `i` occurs as the next symbol in one of its core items.
The + // structure of an *addition* is as follows: + // * `item` - the new item that must be added as part of the expansion of `i` + // * `lookaheads` - lookahead tokens that can always come after that item in + // the expansion of `i` + // * `propagates_lookaheads` - a boolean indicating whether or not `item` can + // occur at the *end* of the expansion of `i`, so that i's own current + // lookahead tokens can occur after `item`. + // + // Again, rather than computing these additions recursively, we use an explicit + // stack called `entries_to_process`. + for i in 0..syntax_grammar.variables.len() { + let empty_lookaheads = LookaheadSet::new(); + let mut entries_to_process = vec![(i, &empty_lookaheads, true)]; + + // First, build up a map whose keys are all of the non-terminals that can + // appear at the beginning of non-terminal `i`, and whose values store + // information about the tokens that can follow each non-terminal. + let mut follow_set_info_by_non_terminal = HashMap::new(); + while let Some(entry) = entries_to_process.pop() { + let (variable_index, lookaheads, propagates_lookaheads) = entry; + let existing_info = follow_set_info_by_non_terminal + .entry(variable_index) + .or_insert_with(|| FollowSetInfo { + lookaheads: LookaheadSet::new(), + propagates_lookaheads: false, + }); + + let did_add_follow_set_info; + if propagates_lookaheads { + did_add_follow_set_info = !existing_info.propagates_lookaheads; + existing_info.propagates_lookaheads = true; + } else { + did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads); + } + + if did_add_follow_set_info { + for production in &syntax_grammar.variables[variable_index].productions { + if let Some(symbol) = production.first_symbol() { + if symbol.is_non_terminal() { + if production.steps.len() == 1 { + entries_to_process.push(( + symbol.index, + lookaheads, + propagates_lookaheads, + )); + } else { + entries_to_process.push(( + symbol.index, + &result.first_sets[&production.steps[1].symbol], + false, + )); + } + } + } + } + } + } + + // Store all of those non-terminals' productions, along with their associated + // lookahead info, as *additions* associated with non-terminal `i`. 
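+        // As a hypothetical illustration: given the rules
+        //   expr -> term '+' expr
+        //   expr -> term
+        //   term -> NUMBER
+        // the additions for `expr` include the item `term -> • NUMBER`, whose
+        // fixed lookaheads contain '+' (inside the first production, `term` is
+        // followed by '+'), and whose `propagates_lookaheads` flag is set,
+        // because via the second production `term` can end an expansion of
+        // `expr`.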
+ let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; + for (variable_index, follow_set_info) in follow_set_info_by_non_terminal { + let variable = &syntax_grammar.variables[variable_index]; + let non_terminal = Symbol::non_terminal(variable_index); + let variable_index = variable_index as u32; + if syntax_grammar.variables_to_inline.contains(&non_terminal) { + continue; + } + for production in &variable.productions { + let item = ParseItem { + variable_index, + production, + step_index: 0, + }; + + if let Some(inlined_productions) = + inlines.inlined_productions(item.production, item.step_index) + { + for production in inlined_productions { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item: ParseItem { + variable_index, + production, + step_index: item.step_index, + }, + info: follow_set_info.clone(), + }, + ); + } + } else { + find_or_push( + additions_for_non_terminal, + TransitiveClosureAddition { + item, + info: follow_set_info.clone(), + }, + ); + } + } + } + } + + result + } + + pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> { + let mut result = ParseItemSet::default(); + for (item, lookaheads) in &item_set.entries { + if let Some(productions) = self + .inlines + .inlined_productions(item.production, item.step_index) + { + for production in productions { + self.add_item( + &mut result, + ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, + lookaheads, + ); + } + } else { + self.add_item(&mut result, *item, lookaheads); + } + } + result + } + + pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.first_sets[symbol] + } + + pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet { + &self.last_sets[symbol] + } + + fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) { + if let Some(step) = item.step() { + if step.symbol.is_non_terminal() { + let next_step = item.successor().step(); + + // Determine which tokens can follow this non-terminal. + let following_tokens = if let Some(next_step) = next_step { + self.first_sets.get(&next_step.symbol).unwrap() + } else { + lookaheads + }; + + // Use the pre-computed *additions* to expand the non-terminal.
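+                // Each addition contributes its item with its fixed lookaheads;
+                // when the addition can also end the expansion of this
+                // non-terminal (`propagates_lookaheads`), the tokens that can
+                // follow the non-terminal in this particular item are merged in
+                // as well.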
+ for addition in &self.transitive_closure_additions[step.symbol.index] { + let lookaheads = set + .entries + .entry(addition.item) + .or_insert_with(|| LookaheadSet::new()); + lookaheads.insert_all(&addition.info.lookaheads); + if addition.info.propagates_lookaheads { + lookaheads.insert_all(following_tokens); + } + } + } + } + set.entries.insert(item, lookaheads.clone()); + } +} + +impl<'a> fmt::Debug for ParseItemSetBuilder<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "ParseItemSetBuilder {{\n")?; + + write!(f, " additions: {{\n")?; + for (i, variable) in self.syntax_grammar.variables.iter().enumerate() { + write!(f, " {}: {{\n", variable.name)?; + for addition in &self.transitive_closure_additions[i] { + write!( + f, + " {}\n", + ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar) + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + + write!(f, "}}")?; + Ok(()) + } +} diff --git a/cli/src/build_tables/minimize_parse_table.rs b/cli/src/build_tables/minimize_parse_table.rs new file mode 100644 index 00000000..573bf974 --- /dev/null +++ b/cli/src/build_tables/minimize_parse_table.rs @@ -0,0 +1,281 @@ +use super::item::LookaheadSet; +use super::token_conflicts::TokenConflictMap; +use crate::grammars::{SyntaxGrammar, VariableType}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; +use hashbrown::{HashMap, HashSet}; + +pub(crate) fn minimize_parse_table( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + simple_aliases: &AliasMap, + token_conflict_map: &TokenConflictMap, + keywords: &LookaheadSet, +) { + let mut minimizer = Minimizer { + parse_table, + syntax_grammar, + token_conflict_map, + keywords, + simple_aliases, + }; + minimizer.remove_unit_reductions(); + minimizer.merge_compatible_states(); + minimizer.remove_unused_states(); +} + +struct Minimizer<'a> { + parse_table: &'a mut ParseTable, + syntax_grammar: &'a SyntaxGrammar, + token_conflict_map: &'a TokenConflictMap<'a>, + keywords: &'a LookaheadSet, + simple_aliases: &'a AliasMap, +} + +impl<'a> Minimizer<'a> { + fn remove_unit_reductions(&mut self) { + let mut aliased_symbols = HashSet::new(); + for variable in &self.syntax_grammar.variables { + for production in &variable.productions { + for step in &production.steps { + if step.alias.is_some() { + aliased_symbols.insert(step.symbol); + } + } + } + } + + let mut unit_reduction_symbols_by_state = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + let mut only_unit_reductions = true; + let mut unit_reduction_symbol = None; + for (_, entry) in &state.terminal_entries { + for action in &entry.actions { + match action { + ParseAction::ShiftExtra => continue, + ParseAction::Reduce { + child_count: 1, + alias_sequence_id: 0, + symbol, + .. 
+ } => { + if !self.simple_aliases.contains_key(&symbol) + && !aliased_symbols.contains(&symbol) + && self.syntax_grammar.variables[symbol.index].kind + != VariableType::Named + && (unit_reduction_symbol.is_none() + || unit_reduction_symbol == Some(symbol)) + { + unit_reduction_symbol = Some(symbol); + continue; + } + } + _ => {} + } + only_unit_reductions = false; + break; + } + + if !only_unit_reductions { + break; + } + } + + if let Some(symbol) = unit_reduction_symbol { + if only_unit_reductions { + unit_reduction_symbols_by_state.insert(i, *symbol); + } + } + } + + for state in self.parse_table.states.iter_mut() { + let mut done = false; + while !done { + done = true; + state.update_referenced_states(|other_state_id, state| { + if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) { + done = false; + state.nonterminal_entries[symbol] + } else { + other_state_id + } + }) + } + } + } + + fn merge_compatible_states(&mut self) { + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in self.parse_table.states.iter().enumerate() { + state_ids_by_signature + .entry(state.unfinished_item_signature) + .or_insert(Vec::new()) + .push(i); + } + + let mut deleted_states = HashSet::new(); + loop { + let mut state_replacements = HashMap::new(); + for (_, state_ids) in &state_ids_by_signature { + for i in state_ids { + for j in state_ids { + if j == i { + break; + } + if deleted_states.contains(j) || deleted_states.contains(i) { + continue; + } + if self.merge_parse_state(*j, *i) { + deleted_states.insert(*i); + state_replacements.insert(*i, *j); + } + } + } + } + + if state_replacements.is_empty() { + break; + } + + for state in self.parse_table.states.iter_mut() { + state.update_referenced_states(|other_state_id, _| { + *state_replacements + .get(&other_state_id) + .unwrap_or(&other_state_id) + }); + } + } + } + + fn merge_parse_state(&mut self, left: usize, right: usize) -> bool { + let left_state = &self.parse_table.states[left]; + let right_state = &self.parse_table.states[right]; + + if left_state.nonterminal_entries != right_state.nonterminal_entries { + return false; + } + + for (symbol, left_entry) in &left_state.terminal_entries { + if let Some(right_entry) = right_state.terminal_entries.get(symbol) { + if right_entry.actions != left_entry.actions { + return false; + } + } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) { + return false; + } + } + + let mut symbols_to_add = Vec::new(); + for (symbol, right_entry) in &right_state.terminal_entries { + if !left_state.terminal_entries.contains_key(&symbol) { + if !self.can_add_entry_to_state(left_state, *symbol, right_entry) { + return false; + } + symbols_to_add.push(*symbol); + } + } + + for symbol in symbols_to_add { + let entry = self.parse_table.states[right].terminal_entries[&symbol].clone(); + self.parse_table.states[left] + .terminal_entries + .insert(symbol, entry); + } + + true + } + + fn can_add_entry_to_state( + &self, + state: &ParseState, + token: Symbol, + entry: &ParseTableEntry, + ) -> bool { + // Do not add external tokens; they could conflict lexically with any of the state's + // existing lookahead tokens. + if token.is_external() { + return false; + } + + // Only merge_compatible_states parse states by allowing existing reductions to happen + // with additional lookahead tokens. Do not alter parse states in ways + // that allow entirely new types of actions to happen. 
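+        // Concretely, as checked below: the candidate entry must be identical
+        // to an entry that already exists in this state, and its final action
+        // must be a REDUCE; anything else would give the merged state an
+        // entirely new behavior for this lookahead.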
+ if state.terminal_entries.iter().all(|(_, e)| e != entry) { + return false; + } + match entry.actions.last() { + Some(ParseAction::Reduce { .. }) => {} + _ => return false, + } + + // Do not add tokens which are both internal and external. Their validity could + // influence the behavior of the external scanner. + if self + .syntax_grammar + .external_tokens + .iter() + .any(|t| t.corresponding_internal_token == Some(token)) + { + return false; + } + + let is_word_token = self.syntax_grammar.word_token == Some(token); + let is_keyword = self.keywords.contains(&token); + + // Do not add a token if it conflicts with an existing token. + if token.is_terminal() { + for existing_token in state.terminal_entries.keys() { + if (is_word_token && self.keywords.contains(existing_token)) + || is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token) + { + continue; + } + if self + .token_conflict_map + .does_conflict(token.index, existing_token.index) + || self + .token_conflict_map + .does_match_same_string(token.index, existing_token.index) + { + return false; + } + } + } + + true + } + + fn remove_unused_states(&mut self) { + let mut state_usage_map = vec![false; self.parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + + for state in &self.parse_table.states { + for referenced_state in state.referenced_states() { + state_usage_map[referenced_state] = true; + } + } + let mut removed_predecessor_count = 0; + let mut state_replacement_map = vec![0; self.parse_table.states.len()]; + for state_id in 0..self.parse_table.states.len() { + state_replacement_map[state_id] = state_id - removed_predecessor_count; + if !state_usage_map[state_id] { + removed_predecessor_count += 1; + } + } + let mut state_id = 0; + let mut original_state_id = 0; + while state_id < self.parse_table.states.len() { + if state_usage_map[original_state_id] { + self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| { + state_replacement_map[other_state_id] + }); + state_id += 1; + } else { + self.parse_table.states.remove(state_id); + } + original_state_id += 1; + } + } +} diff --git a/cli/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs new file mode 100644 index 00000000..04b750e3 --- /dev/null +++ b/cli/src/build_tables/mod.rs @@ -0,0 +1,285 @@ +mod build_lex_table; +mod build_parse_table; +mod coincident_tokens; +mod item; +mod item_set_builder; +mod minimize_parse_table; +mod token_conflicts; + +use self::build_lex_table::build_lex_table; +use self::build_parse_table::build_parse_table; +use self::coincident_tokens::CoincidentTokenIndex; +use self::item::LookaheadSet; +use self::minimize_parse_table::minimize_parse_table; +use self::token_conflicts::TokenConflictMap; +use crate::error::Result; +use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; +use crate::nfa::{CharacterSet, NfaCursor}; +use crate::rules::{AliasMap, Symbol}; +use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; + +pub(crate) fn build_tables( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + simple_aliases: &AliasMap, + inlines: &InlinedProductionMap, + minimize: bool, + state_ids_to_log: Vec<usize>, +) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> { + let (mut parse_table, following_tokens) = + build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?; + let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); + let coincident_token_index =
CoincidentTokenIndex::new(&parse_table, lexical_grammar); + let keywords = identify_keywords( + lexical_grammar, + &parse_table, + syntax_grammar.word_token, + &token_conflict_map, + &coincident_token_index, + ); + populate_error_state( + &mut parse_table, + syntax_grammar, + lexical_grammar, + &coincident_token_index, + &token_conflict_map, + ); + mark_fragile_tokens( + &mut parse_table, + lexical_grammar, + &token_conflict_map, + ); + if minimize { + minimize_parse_table( + &mut parse_table, + syntax_grammar, + simple_aliases, + &token_conflict_map, + &keywords, + ); + } + let (main_lex_table, keyword_lex_table) = build_lex_table( + &mut parse_table, + syntax_grammar, + lexical_grammar, + &keywords, + minimize, + ); + Ok(( + parse_table, + main_lex_table, + keyword_lex_table, + syntax_grammar.word_token, + )) +} + +fn populate_error_state( + parse_table: &mut ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + coincident_token_index: &CoincidentTokenIndex, + token_conflict_map: &TokenConflictMap, +) { + let state = &mut parse_table.states[0]; + let n = lexical_grammar.variables.len(); + + // First identify the *conflict-free tokens*: tokens that do not overlap with + // any other token in any way. + let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { + let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { + j != i + && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + && token_conflict_map.does_conflict(i, j) + }); + if conflicts_with_other_tokens { + None + } else { + info!( + "error recovery - token {} has no conflicts", + lexical_grammar.variables[i].name + ); + Some(Symbol::terminal(i)) + } + })); + + let recover_entry = ParseTableEntry { + reusable: false, + actions: vec![ParseAction::Recover], + }; + + // Exclude from the error-recovery state any token that conflicts with one of + // the *conflict-free tokens* identified above. + for i in 0..n { + let symbol = Symbol::terminal(i); + if !conflict_free_tokens.contains(&symbol) { + if syntax_grammar.word_token != Some(symbol) { + if let Some(t) = conflict_free_tokens.iter().find(|t| { + !coincident_token_index.contains(symbol, *t) + && token_conflict_map.does_conflict(symbol.index, t.index) + }) { + info!( + "error recovery - exclude token {} because of conflict with {}", + lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name + ); + continue; + } + } + } + info!( + "error recovery - include token {}", + lexical_grammar.variables[i].name + ); + state + .terminal_entries + .entry(symbol) + .or_insert_with(|| recover_entry.clone()); + } + + for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { + if external_token.corresponding_internal_token.is_none() { + state + .terminal_entries + .entry(Symbol::external(i)) + .or_insert_with(|| recover_entry.clone()); + } + } + + state.terminal_entries.insert(Symbol::end(), recover_entry); +} + +fn identify_keywords( + lexical_grammar: &LexicalGrammar, + parse_table: &ParseTable, + word_token: Option, + token_conflict_map: &TokenConflictMap, + coincident_token_index: &CoincidentTokenIndex, +) -> LookaheadSet { + if word_token.is_none() { + return LookaheadSet::new(); + } + + let word_token = word_token.unwrap(); + let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); + + // First find all of the candidate keyword tokens: tokens that start with + // letters or underscore and can match the same string as a word token. 
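+    // As a hypothetical example: with `identifier` as the word token matching
+    // \w+, string tokens like "in" or "instanceof" match strings that
+    // `identifier` could also match, so they become keyword candidates, while
+    // a token starting with a non-alphabetic, non-underscore character fails
+    // the `all_chars_are_alphabetical` check.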
+ let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map( + |(i, variable)| { + cursor.reset(vec![variable.start_state]); + if all_chars_are_alphabetical(&cursor) + && token_conflict_map.does_match_same_string(i, word_token.index) + { + info!( + "Keywords - add candidate {}", + lexical_grammar.variables[i].name + ); + Some(Symbol::terminal(i)) + } else { + None + } + }, + )); + + // Exclude keyword candidates that shadow another keyword candidate. + let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_token in keywords.iter() { + if other_token != *token + && token_conflict_map.does_match_same_string(token.index, other_token.index) + { + info!( + "Keywords - exclude {} because it matches the same string as {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_token.index].name + ); + return false; + } + } + true + })); + + // Exclude keyword candidates for which substituting the keyword capture + // token would introduce new lexical conflicts with other tokens. + let keywords = LookaheadSet::with(keywords.iter().filter(|token| { + for other_index in 0..lexical_grammar.variables.len() { + if keywords.contains(&Symbol::terminal(other_index)) { + continue; + } + + // If the word token was already valid in every state containing + // this keyword candidate, then substituting the word token won't + // introduce any new lexical conflicts. + if coincident_token_index + .states_with(*token, Symbol::terminal(other_index)) + .iter() + .all(|state_id| { + parse_table.states[*state_id] + .terminal_entries + .contains_key(&word_token) + }) + { + continue; + } + + if !token_conflict_map.has_same_conflict_status( + token.index, + word_token.index, + other_index, + ) { + info!( + "Keywords - exclude {} because of conflict with {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_index].name + ); + return false; + } + } + + info!( + "Keywords - include {}", + lexical_grammar.variables[token.index].name, + ); + true + })); + + keywords +} + +fn mark_fragile_tokens( + parse_table: &mut ParseTable, + lexical_grammar: &LexicalGrammar, + token_conflict_map: &TokenConflictMap, +) { + let n = lexical_grammar.variables.len(); + let mut valid_tokens_mask = Vec::with_capacity(n); + for state in parse_table.states.iter_mut() { + valid_tokens_mask.clear(); + valid_tokens_mask.resize(n, false); + for token in state.terminal_entries.keys() { + if token.is_terminal() { + valid_tokens_mask[token.index] = true; + } + } + for (token, entry) in state.terminal_entries.iter_mut() { + for i in 0..n { + if token_conflict_map.does_overlap(i, token.index) { + if valid_tokens_mask[i] { + entry.reusable = false; + break; + } + } + } + } + } +} + +fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { + cursor.transition_chars().all(|(chars, is_sep)| { + if is_sep { + true + } else if let CharacterSet::Include(chars) = chars { + chars.iter().all(|c| c.is_alphabetic() || *c == '_') + } else { + false + } + }) +} diff --git a/cli/src/build_tables/token_conflicts.rs b/cli/src/build_tables/token_conflicts.rs new file mode 100644 index 00000000..cb2b6efe --- /dev/null +++ b/cli/src/build_tables/token_conflicts.rs @@ -0,0 +1,382 @@ +use crate::build_tables::item::LookaheadSet; +use crate::grammars::LexicalGrammar; +use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; +use hashbrown::HashSet; +use std::cmp::Ordering; +use std::fmt; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +struct 
TokenConflictStatus { + does_overlap: bool, + does_match_valid_continuation: bool, + does_match_separators: bool, + matches_same_string: bool, +} + +pub(crate) struct TokenConflictMap<'a> { + n: usize, + status_matrix: Vec, + starting_chars_by_index: Vec, + following_chars_by_index: Vec, + grammar: &'a LexicalGrammar, +} + +impl<'a> TokenConflictMap<'a> { + pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { + let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); + let starting_chars = get_starting_chars(&mut cursor, grammar); + let following_chars = get_following_chars(&starting_chars, following_tokens); + + let n = grammar.variables.len(); + let mut status_matrix = vec![TokenConflictStatus::default(); n * n]; + for i in 0..grammar.variables.len() { + for j in 0..i { + let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j); + status_matrix[matrix_index(n, i, j)] = status.0; + status_matrix[matrix_index(n, j, i)] = status.1; + } + } + + TokenConflictMap { + n, + status_matrix, + starting_chars_by_index: starting_chars, + following_chars_by_index: following_chars, + grammar, + } + } + + pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool { + let left = &self.status_matrix[matrix_index(self.n, a, other)]; + let right = &self.status_matrix[matrix_index(self.n, b, other)]; + left == right + } + + pub fn does_match_same_string(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].matches_same_string + } + + pub fn does_conflict(&self, i: usize, j: usize) -> bool { + let entry = &self.status_matrix[matrix_index(self.n, i, j)]; + entry.does_match_valid_continuation || entry.does_match_separators + } + + pub fn does_overlap(&self, i: usize, j: usize) -> bool { + self.status_matrix[matrix_index(self.n, i, j)].does_overlap + } + + pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { + if left.0 > right.0 { + return true; + } else if left.0 < right.0 { + return false; + } + + match grammar.variables[left.1] + .implicit_precedence + .cmp(&grammar.variables[right.1].implicit_precedence) + { + Ordering::Less => false, + Ordering::Greater => true, + Ordering::Equal => left.1 < right.1, + } + } +} + +impl<'a> fmt::Debug for TokenConflictMap<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "TokenConflictMap {{\n")?; + + write!(f, " starting_characters: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?; + } + write!(f, " }},\n")?; + + write!(f, " following_characters: {{\n")?; + for i in 0..self.n { + write!( + f, + " {}: {:?},\n", + self.grammar.variables[i].name, self.following_chars_by_index[i] + )?; + } + write!(f, " }},\n")?; + + write!(f, " status_matrix: {{\n")?; + for i in 0..self.n { + write!(f, " {}: {{\n", self.grammar.variables[i].name)?; + for j in 0..self.n { + write!( + f, + " {}: {:?},\n", + self.grammar.variables[j].name, + self.status_matrix[matrix_index(self.n, i, j)] + )?; + } + write!(f, " }},\n")?; + } + write!(f, " }},")?; + write!(f, "}}")?; + Ok(()) + } +} + +fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize { + variable_count * i + j +} + +fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec { + let mut result = Vec::with_capacity(grammar.variables.len()); + for variable in &grammar.variables { + cursor.reset(vec![variable.start_state]); + let mut all_chars = CharacterSet::empty(); + for (chars, _) in 
cursor.transition_chars() { + all_chars = all_chars.add(chars); + } + result.push(all_chars); + } + result +} + +fn get_following_chars( + starting_chars: &Vec, + following_tokens: Vec, +) -> Vec { + following_tokens + .into_iter() + .map(|following_tokens| { + let mut chars = CharacterSet::empty(); + for token in following_tokens.iter() { + if token.is_terminal() { + chars = chars.add(&starting_chars[token.index]); + } + } + chars + }) + .collect() +} + +fn compute_conflict_status( + cursor: &mut NfaCursor, + grammar: &LexicalGrammar, + following_chars: &Vec, + i: usize, + j: usize, +) -> (TokenConflictStatus, TokenConflictStatus) { + let mut visited_state_sets = HashSet::new(); + let mut state_set_queue = vec![vec![ + grammar.variables[i].start_state, + grammar.variables[j].start_state, + ]]; + let mut result = ( + TokenConflictStatus::default(), + TokenConflictStatus::default(), + ); + + while let Some(state_set) = state_set_queue.pop() { + // Don't pursue states where there's no potential for conflict. + if variable_ids_for_states(&state_set, grammar).count() > 1 { + cursor.reset(state_set); + } else { + continue; + } + + let mut completion = None; + for (id, precedence) in cursor.completions() { + if let Some((prev_id, prev_precedence)) = completion { + if id == prev_id { + continue; + } + + // Prefer tokens with higher precedence. For tokens with equal precedence, + // prefer those listed earlier in the grammar. + let winning_id; + if TokenConflictMap::prefer_token( + grammar, + (prev_precedence, prev_id), + (precedence, id), + ) { + winning_id = prev_id; + } else { + winning_id = id; + completion = Some((id, precedence)); + } + + if winning_id == i { + result.0.matches_same_string = true; + result.0.does_overlap = true; + } else { + result.1.matches_same_string = true; + result.1.does_overlap = true; + } + } else { + completion = Some((id, precedence)); + } + } + + for NfaTransition { + characters, + precedence, + states, + is_separator, + } in cursor.transitions() + { + let mut can_advance = true; + if let Some((completed_id, completed_precedence)) = completion { + let mut other_id = None; + let mut successor_contains_completed_id = false; + for variable_id in variable_ids_for_states(&states, grammar) { + if variable_id == completed_id { + successor_contains_completed_id = true; + break; + } else { + other_id = Some(variable_id); + } + } + + if let (Some(other_id), false) = (other_id, successor_contains_completed_id) { + let winning_id; + if precedence < completed_precedence { + winning_id = completed_id; + can_advance = false; + } else { + winning_id = other_id; + } + + if winning_id == i { + result.0.does_overlap = true; + if characters.does_intersect(&following_chars[j]) { + result.0.does_match_valid_continuation = true; + } + if is_separator { + result.0.does_match_separators = true; + } + } else { + result.1.does_overlap = true; + if characters.does_intersect(&following_chars[i]) { + result.1.does_match_valid_continuation = true; + } + } + } + } + + if can_advance && visited_state_sets.insert(states.clone()) { + state_set_queue.push(states); + } + } + } + result +} + +fn variable_ids_for_states<'a>( + state_ids: &'a Vec, + grammar: &'a LexicalGrammar, +) -> impl Iterator + 'a { + let mut prev = None; + state_ids.iter().filter_map(move |state_id| { + let variable_id = grammar.variable_index_for_nfa_state(*state_id); + if prev != Some(variable_id) { + prev = Some(variable_id); + prev + } else { + None + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::grammars::{Variable, VariableType}; + use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}; + use crate::rules::{Rule, Symbol}; + + #[test] + fn test_starting_characters() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "token_0".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("[a-f]1|0x\\d"), + }, + Variable { + name: "token_1".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("d*ef"), + }, + ], + }) + .unwrap(); + + let token_map = TokenConflictMap::new(&grammar, Vec::new()); + + assert_eq!( + token_map.starting_chars_by_index[0], + CharacterSet::empty().add_range('a', 'f').add_char('0') + ); + assert_eq!( + token_map.starting_chars_by_index[1], + CharacterSet::empty().add_range('d', 'e') + ); + } + + #[test] + fn test_token_conflicts() { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: Vec::new(), + variables: vec![ + Variable { + name: "in".to_string(), + kind: VariableType::Named, + rule: Rule::string("in"), + }, + Variable { + name: "identifier".to_string(), + kind: VariableType::Named, + rule: Rule::pattern("\\w+"), + }, + Variable { + name: "instanceof".to_string(), + kind: VariableType::Named, + rule: Rule::string("instanceof"), + }, + ], + }) + .unwrap(); + + let var = |name| index_of_var(&grammar, name); + + let token_map = TokenConflictMap::new( + &grammar, + vec![ + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), + LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + ], + ); + + // Given the string "in", the `in` token is preferred over the `identifier` token + assert!(token_map.does_match_same_string(var("in"), var("identifier"))); + assert!(!token_map.does_match_same_string(var("identifier"), var("in"))); + + // Depending on what character follows, the string "in" may be treated as part of an + // `identifier` token. + assert!(token_map.does_conflict(var("identifier"), var("in"))); + + // Depending on what character follows, the string "instanceof" may be treated as part of + // an `identifier` token. 
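+        // A hedged illustration: because `identifier` can keep consuming word
+        // characters past the end of "instanceof", a hypothetical input like
+        // "instanceofx" lexes as a single `identifier` token rather than as
+        // `instanceof` followed by more text.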
+ assert!(token_map.does_conflict(var("identifier"), var("instanceof"))); + assert!(token_map.does_conflict(var("instanceof"), var("in"))); + } + + fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize { + grammar + .variables + .iter() + .position(|v| v.name == name) + .unwrap() + } +} diff --git a/cli/src/error.rs b/cli/src/error.rs new file mode 100644 index 00000000..9a5801f8 --- /dev/null +++ b/cli/src/error.rs @@ -0,0 +1,24 @@ +#[derive(Debug)] +pub struct Error(pub String); + +pub type Result = std::result::Result; + +impl Error { + pub fn grammar(message: &str) -> Self { + Error(format!("Grammar error: {}", message)) + } + + pub fn regex(message: &str) -> Self { + Error(format!("Regex error: {}", message)) + } + + pub fn undefined_symbol(name: &str) -> Self { + Error(format!("Undefined symbol `{}`", name)) + } +} + +impl From for Error { + fn from(error: serde_json::Error) -> Self { + Error(error.to_string()) + } +} diff --git a/cli/src/generate.rs b/cli/src/generate.rs new file mode 100644 index 00000000..aa8f3b5b --- /dev/null +++ b/cli/src/generate.rs @@ -0,0 +1,34 @@ +use crate::build_tables::build_tables; +use crate::error::Result; +use crate::parse_grammar::parse_grammar; +use crate::prepare_grammar::prepare_grammar; +use crate::render::render_c_code; + +pub fn generate_parser_for_grammar( + input: &str, + minimize: bool, + state_ids_to_log: Vec, +) -> Result { + let input_grammar = parse_grammar(input)?; + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(&input_grammar)?; + let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &inlines, + minimize, + state_ids_to_log, + )?; + let c_code = render_c_code( + &input_grammar.name, + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + ); + Ok(c_code) +} diff --git a/src/compiler/grammar-schema.json b/cli/src/grammar-schema.json similarity index 100% rename from src/compiler/grammar-schema.json rename to cli/src/grammar-schema.json diff --git a/cli/src/grammars.rs b/cli/src/grammars.rs new file mode 100644 index 00000000..f82d6b02 --- /dev/null +++ b/cli/src/grammars.rs @@ -0,0 +1,204 @@ +use crate::nfa::Nfa; +use crate::rules::{Alias, Associativity, Rule, Symbol}; +use hashbrown::HashMap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum VariableType { + Hidden, + Auxiliary, + Anonymous, + Named, +} + +// Input grammar + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct Variable { + pub name: String, + pub kind: VariableType, + pub rule: Rule, +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct InputGrammar { + pub name: String, + pub variables: Vec, + pub extra_tokens: Vec, + pub expected_conflicts: Vec>, + pub external_tokens: Vec, + pub variables_to_inline: Vec, + pub word_token: Option, +} + +// Extracted lexical grammar + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct LexicalVariable { + pub name: String, + pub kind: VariableType, + pub implicit_precedence: i32, + pub start_state: u32, +} + +#[derive(Debug, Default, PartialEq, Eq)] +pub(crate) struct LexicalGrammar { + pub nfa: Nfa, + pub variables: Vec, +} + +// Extracted syntax grammar + +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) struct ProductionStep { + pub symbol: Symbol, + pub precedence: i32, + pub associativity: Option, + pub alias: Option, +} + +#[derive(Clone, Debug, PartialEq, Eq)] 
+pub(crate) struct Production { + pub steps: Vec, + pub dynamic_precedence: i32, +} + +pub(crate) struct InlinedProductionMap { + pub productions: Vec, + pub production_map: HashMap<(*const Production, u32), Vec>, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct SyntaxVariable { + pub name: String, + pub kind: VariableType, + pub productions: Vec, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ExternalToken { + pub name: String, + pub kind: VariableType, + pub corresponding_internal_token: Option, +} + +#[derive(Debug)] +pub(crate) struct SyntaxGrammar { + pub variables: Vec, + pub extra_tokens: Vec, + pub expected_conflicts: Vec>, + pub external_tokens: Vec, + pub variables_to_inline: Vec, + pub word_token: Option, +} + +#[cfg(test)] +impl ProductionStep { + pub(crate) fn new(symbol: Symbol) -> Self { + Self { + symbol, + precedence: 0, + associativity: None, + alias: None, + } + } + + pub(crate) fn with_prec(self, precedence: i32, associativity: Option) -> Self { + Self { + symbol: self.symbol, + precedence, + associativity, + alias: self.alias, + } + } + + pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self { + Self { + symbol: self.symbol, + precedence: self.precedence, + associativity: self.associativity, + alias: Some(Alias { + value: value.to_string(), + is_named, + }), + } + } +} + +impl Production { + pub fn first_symbol(&self) -> Option { + self.steps.first().map(|s| s.symbol.clone()) + } +} + +impl Default for Production { + fn default() -> Self { + Production { + dynamic_precedence: 0, + steps: Vec::new(), + } + } +} + +#[cfg(test)] +impl Variable { + pub fn named(name: &str, rule: Rule) -> Self { + Self { + name: name.to_string(), + kind: VariableType::Named, + rule, + } + } + + pub fn auxiliary(name: &str, rule: Rule) -> Self { + Self { + name: name.to_string(), + kind: VariableType::Auxiliary, + rule, + } + } + + pub fn hidden(name: &str, rule: Rule) -> Self { + Self { + name: name.to_string(), + kind: VariableType::Hidden, + rule, + } + } + + pub fn anonymous(name: &str, rule: Rule) -> Self { + Self { + name: name.to_string(), + kind: VariableType::Anonymous, + rule, + } + } +} + +impl LexicalGrammar { + pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize { + self.variables.iter().position(|v| v.start_state >= state_id).unwrap() + } +} + +impl SyntaxVariable { + pub fn is_auxiliary(&self) -> bool { + self.kind == VariableType::Auxiliary + } +} + +impl InlinedProductionMap { + pub fn inlined_productions<'a>( + &'a self, + production: &Production, + step_index: u32, + ) -> Option + 'a> { + self.production_map + .get(&(production as *const Production, step_index)) + .map(|production_indices| { + production_indices + .iter() + .cloned() + .map(move |index| &self.productions[index]) + }) + } +} diff --git a/cli/src/js/dsl.js b/cli/src/js/dsl.js new file mode 100644 index 00000000..ba3962cd --- /dev/null +++ b/cli/src/js/dsl.js @@ -0,0 +1,334 @@ +const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; +const DELIMITER_ESCAPE_PATTERN = /\\\//g; + +function alias(rule, value) { + const result = { + type: "ALIAS", + content: normalize(rule), + named: false, + value: null + }; + + switch (value.constructor) { + case String: + result.named = false; + result.value = value; + return result; + case ReferenceError: + result.named = true; + result.value = value.symbol.name; + return result; + case Object: + if (typeof value.type === 'string' && value.type === 'SYMBOL') { + result.named = true; + result.value = value.name; + return 
result; + } + } + + throw new Error('Invalid alias value ' + value); +} + +function blank() { + return { + type: "BLANK" + }; +} + +function choice(...elements) { + return { + type: "CHOICE", + members: elements.map(normalize) + }; +} + +function optional(value) { + return choice(value, blank()); +} + +function prec(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC", + value: number, + content: normalize(rule) + }; +} + +prec.left = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_LEFT", + value: number, + content: normalize(rule) + }; +} + +prec.right = function(number, rule) { + if (rule == null) { + rule = number; + number = 0; + } + + return { + type: "PREC_RIGHT", + value: number, + content: normalize(rule) + }; +} + +prec.dynamic = function(number, rule) { + return { + type: "PREC_DYNAMIC", + value: number, + content: normalize(rule) + }; +} + +function repeat(rule) { + return { + type: "REPEAT", + content: normalize(rule) + }; +} + +function repeat1(rule) { + return { + type: "REPEAT1", + content: normalize(rule) + }; +} + +function seq(...elements) { + return { + type: "SEQ", + members: elements.map(normalize) + }; +} + +function sym(name) { + return { + type: "SYMBOL", + name: name + }; +} + +function token(value) { + return { + type: "TOKEN", + content: normalize(value) + }; +} + +token.immediate = function(value) { + return { + type: "IMMEDIATE_TOKEN", + content: normalize(value) + }; +} + +function normalize(value) { + + if (typeof value == "undefined") + throw new Error("Undefined symbol"); + + switch (value.constructor) { + case String: + return { + type: 'STRING', + value + }; + case RegExp: + return { + type: 'PATTERN', + value: value.source + .replace( + DELIMITER_ESCAPE_PATTERN, + '/' + ) + .replace( + UNICODE_ESCAPE_PATTERN, + (match, group) => String.fromCharCode(parseInt(group, 16)) + ) + }; + case ReferenceError: + throw value + default: + if (typeof value.type === 'string') { + return value; + } else { + throw new TypeError("Invalid rule: " + value.toString()); + } + } +} + +function RuleBuilder(ruleMap) { + return new Proxy({}, { + get(target, propertyName) { + const symbol = { + type: 'SYMBOL', + name: propertyName + }; + + if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) { + return symbol; + } else { + const error = new ReferenceError(`Undefined symbol '${propertyName}'`); + error.symbol = symbol; + return error; + } + } + }) +} + +function grammar(baseGrammar, options) { + if (!options) { + options = baseGrammar; + baseGrammar = { + name: null, + rules: {}, + extras: [normalize(/\s/)], + conflicts: [], + externals: [], + inline: [] + }; + } + + let externals = baseGrammar.externals; + if (options.externals) { + if (typeof options.externals !== "function") { + throw new Error("Grammar's 'externals' property must be a function."); + } + + const externalsRuleBuilder = RuleBuilder(null) + const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals); + + if (!Array.isArray(externalRules)) { + throw new Error("Grammar's 'externals' property must return an array of rules."); + } + + externals = externalRules.map(normalize); + } + + const ruleMap = {}; + for (const key in options.rules) { + ruleMap[key] = true; + } + for (const key in baseGrammar.rules) { + ruleMap[key] = true; + } + for (const external of externals) { + if (typeof external.name === 'string') { + ruleMap[external.name] = true; + } + } + + 
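+  // With the rule map built, the proxy returned below resolves each known name
+  // to a SYMBOL node (e.g. a hypothetical `$.statement` yields
+  // {type: 'SYMBOL', name: 'statement'}); unknown names yield a ReferenceError
+  // carrying the symbol, which `alias` knows how to unwrap.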
const ruleBuilder = RuleBuilder(ruleMap); + + const name = options.name; + if (typeof name !== "string") { + throw new Error("Grammar's 'name' property must be a string."); + } + + if (!/^[a-zA-Z_]\w*$/.test(name)) { + throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters."); + } + + let rules = Object.assign({}, baseGrammar.rules); + if (options.rules) { + if (typeof options.rules !== "object") { + throw new Error("Grammar's 'rules' property must be an object."); + } + + for (const ruleName in options.rules) { + const ruleFn = options.rules[ruleName]; + if (typeof ruleFn !== "function") { + throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not."); + } + rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName])); + } + } + + let extras = baseGrammar.extras.slice(); + if (options.extras) { + if (typeof options.extras !== "function") { + throw new Error("Grammar's 'extras' property must be a function."); + } + + extras = options.extras + .call(ruleBuilder, ruleBuilder, baseGrammar.extras) + .map(normalize); + } + + let word = baseGrammar.word; + if (options.word) { + word = options.word.call(ruleBuilder, ruleBuilder).name; + if (typeof word != 'string') { + throw new Error("Grammar's 'word' property must be a named rule."); + } + } + + let conflicts = baseGrammar.conflicts; + if (options.conflicts) { + if (typeof options.conflicts !== "function") { + throw new Error("Grammar's 'conflicts' property must be a function."); + } + + const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym)); + const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules); + + if (!Array.isArray(conflictRules)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + conflicts = conflictRules.map(conflictSet => { + if (!Array.isArray(conflictSet)) { + throw new Error("Grammar's conflicts must be an array of arrays of rules."); + } + + return conflictSet.map(symbol => symbol.name); + }); + } + + let inline = baseGrammar.inline; + if (options.inline) { + if (typeof options.inline !== "function") { + throw new Error("Grammar's 'inline' property must be a function."); + } + + const baseInlineRules = baseGrammar.inline.map(sym); + const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules); + + if (!Array.isArray(inlineRules)) { + throw new Error("Grammar's inline must be an array of rules."); + } + + inline = inlineRules.map(symbol => symbol.name); + } + + if (Object.keys(rules).length == 0) { + throw new Error("Grammar must have at least one rule."); + } + + return {name, word, rules, extras, conflicts, externals, inline}; + } + +global.alias = alias; +global.blank = blank; +global.choice = choice; +global.optional = optional; +global.prec = prec; +global.repeat = repeat; +global.repeat1 = repeat1; +global.seq = seq; +global.sym = sym; +global.token = token; +global.grammar = grammar; diff --git a/cli/src/logger.rs b/cli/src/logger.rs new file mode 100644 index 00000000..18df763d --- /dev/null +++ b/cli/src/logger.rs @@ -0,0 +1,29 @@ +use log::{LevelFilter, Log, Metadata, Record}; + +struct Logger { + pub filter: Option, +} + +impl Log for Logger { + fn enabled(&self, _: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + eprintln!( + "[{}] {}", + record + .module_path() + .unwrap_or_default() + .trim_start_matches("rust_tree_sitter_cli::"), + record.args() + ); + } + + 
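+    // Sketch of the resulting output: a record logged from the module
+    // `rust_tree_sitter_cli::build_tables` is written to stderr as
+    // `[build_tables] <message>`, since the crate prefix is trimmed above.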
fn flush(&self) {} +} + +pub(crate) fn init() { + log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap(); + log::set_max_level(LevelFilter::Info); +} diff --git a/cli/src/main.rs b/cli/src/main.rs new file mode 100644 index 00000000..11c277c3 --- /dev/null +++ b/cli/src/main.rs @@ -0,0 +1,119 @@ +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate log; +#[macro_use] +extern crate serde_derive; +extern crate hashbrown; +extern crate serde_json; + +use clap::{App, Arg, SubCommand}; +use std::env; +use std::io::Write; +use std::path::PathBuf; +use std::process::{exit, Command, Stdio}; +use std::usize; + +mod build_tables; +mod error; +mod generate; +mod grammars; +mod logger; +mod nfa; +mod parse_grammar; +mod prepare_grammar; +mod render; +mod rules; +mod tables; + +fn main() { + if let Err(e) = run() { + eprintln!("{}", e.0); + exit(1); + } +} + +fn run() -> error::Result<()> { + let matches = App::new("tree-sitter") + .version("0.1") + .author("Max Brunsfeld ") + .about("Generates and tests parsers") + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + .arg(Arg::with_name("log").long("log")) + .arg( + Arg::with_name("state-ids-to-log") + .long("log-state") + .takes_value(true), + ) + .arg(Arg::with_name("no-minimize").long("no-minimize")), + ) + .subcommand( + SubCommand::with_name("parse") + .about("Parse a file") + .arg(Arg::with_name("path").index(1)), + ) + .subcommand( + SubCommand::with_name("test") + .about("Run a parser's tests") + .arg(Arg::with_name("path").index(1).required(true)) + .arg(Arg::with_name("line").index(2).required(true)) + .arg(Arg::with_name("column").index(3).required(true)), + ) + .get_matches(); + + if let Some(matches) = matches.subcommand_matches("generate") { + if matches.is_present("log") { + logger::init(); + } + + let minimize = !matches.is_present("no-minimize"); + let state_ids_to_log = matches + .values_of("state-ids-to-log") + .map_or(Vec::new(), |ids| { + ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) + .collect() + }); + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); + grammar_path.push("grammar.js"); + let grammar_json = load_js_grammar_file(grammar_path); + let code = + generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?; + println!("{}", code); + } + + Ok(()) +} + +fn load_js_grammar_file(grammar_path: PathBuf) -> String { + let mut node_process = Command::new("node") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .expect("Failed to run `node`"); + + let js_prelude = include_str!("./js/dsl.js"); + let mut node_stdin = node_process + .stdin + .take() + .expect("Failed to open stdin for node"); + write!( + node_stdin, + "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", + js_prelude, + grammar_path.to_str().unwrap() + ) + .expect("Failed to write to node's stdin"); + drop(node_stdin); + let output = node_process + .wait_with_output() + .expect("Failed to read output from node"); + match output.status.code() { + None => panic!("Node process was killed"), + Some(0) => {} + Some(code) => panic!(format!("Node process exited with status {}", code)), + } + + String::from_utf8(output.stdout).expect("Got invalid UTF8 from node") +} diff --git a/cli/src/nfa.rs b/cli/src/nfa.rs new file mode 100644 index 00000000..54e34814 --- /dev/null +++ b/cli/src/nfa.rs @@ -0,0 +1,771 @@ +use std::char; +use std::cmp::max; +use std::cmp::Ordering; +use std::fmt; +use std::mem::swap; + +#[derive(Clone, Debug, PartialEq, Eq, 
Hash)] +pub enum CharacterSet { + Include(Vec), + Exclude(Vec), +} + +#[derive(Debug, PartialEq, Eq)] +pub enum NfaState { + Advance { + chars: CharacterSet, + state_id: u32, + is_sep: bool, + precedence: i32, + }, + Split(u32, u32), + Accept { + variable_index: usize, + precedence: i32, + }, +} + +#[derive(PartialEq, Eq)] +pub struct Nfa { + pub states: Vec, +} + +#[derive(Debug)] +pub struct NfaCursor<'a> { + pub(crate) state_ids: Vec, + nfa: &'a Nfa, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct NfaTransition { + pub characters: CharacterSet, + pub is_separator: bool, + pub precedence: i32, + pub states: Vec, +} + +impl Default for Nfa { + fn default() -> Self { + Self { states: Vec::new() } + } +} + +impl CharacterSet { + pub fn empty() -> Self { + CharacterSet::Include(Vec::new()) + } + + pub fn all() -> Self { + CharacterSet::Exclude(Vec::new()) + } + + pub fn negate(self) -> CharacterSet { + match self { + CharacterSet::Include(chars) => CharacterSet::Exclude(chars), + CharacterSet::Exclude(chars) => CharacterSet::Include(chars), + } + } + + pub fn add_char(self, c: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + if let Err(i) = chars.binary_search(&c) { + chars.insert(i, c); + } + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add_range(self, start: char, end: char) -> Self { + if let CharacterSet::Include(mut chars) = self { + let mut c = start as u32; + while c <= end as u32 { + chars.push(char::from_u32(c).unwrap()); + c += 1; + } + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } else { + panic!("Called add with a negated character set"); + } + } + + pub fn add(self, other: &CharacterSet) -> Self { + match self { + CharacterSet::Include(mut chars) => match other { + CharacterSet::Include(other_chars) => { + chars.extend(other_chars); + chars.sort_unstable(); + chars.dedup(); + CharacterSet::Include(chars) + } + CharacterSet::Exclude(other_chars) => { + let excluded_chars = other_chars + .iter() + .cloned() + .filter(|c| !chars.contains(&c)) + .collect(); + CharacterSet::Exclude(excluded_chars) + } + }, + CharacterSet::Exclude(mut chars) => match other { + CharacterSet::Include(other_chars) => { + chars.retain(|c| !other_chars.contains(&c)); + CharacterSet::Exclude(chars) + } + CharacterSet::Exclude(other_chars) => { + chars.retain(|c| other_chars.contains(&c)); + CharacterSet::Exclude(chars) + } + }, + } + } + + pub fn does_intersect(&self, other: &CharacterSet) -> bool { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common, + CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only, + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only, + CharacterSet::Exclude(_) => true, + }, + } + } + + pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet { + match self { + CharacterSet::Include(chars) => match other { + CharacterSet::Include(other_chars) => { + CharacterSet::Include(remove_chars(chars, other_chars, true)) + } + CharacterSet::Exclude(other_chars) => { + let mut removed = remove_chars(chars, other_chars, false); + add_chars(other_chars, chars); + swap(&mut removed, chars); + CharacterSet::Include(removed) + } + }, + CharacterSet::Exclude(chars) => match other { + CharacterSet::Include(other_chars) => { + let mut removed = 
remove_chars(other_chars, chars, false); + add_chars(chars, other_chars); + swap(&mut removed, other_chars); + CharacterSet::Include(removed) + } + CharacterSet::Exclude(other_chars) => { + let mut result_exclusion = chars.clone(); + result_exclusion.extend(other_chars.iter().cloned()); + result_exclusion.sort_unstable(); + result_exclusion.dedup(); + remove_chars(chars, other_chars, true); + let mut included_characters = Vec::new(); + let mut other_included_characters = Vec::new(); + swap(&mut included_characters, other_chars); + swap(&mut other_included_characters, chars); + *self = CharacterSet::Include(included_characters); + *other = CharacterSet::Include(other_included_characters); + CharacterSet::Exclude(result_exclusion) + } + }, + } + } + + pub fn is_empty(&self) -> bool { + if let CharacterSet::Include(c) = self { + c.is_empty() + } else { + false + } + } + + pub fn contains(&self, c: char) -> bool { + match self { + CharacterSet::Include(chars) => chars.contains(&c), + CharacterSet::Exclude(chars) => !chars.contains(&c), + } + } +} + +impl Ord for CharacterSet { + fn cmp(&self, other: &CharacterSet) -> Ordering { + match self { + CharacterSet::Include(chars) => { + if let CharacterSet::Include(other_chars) = other { + order_chars(chars, other_chars) + } else { + Ordering::Less + } + } + CharacterSet::Exclude(chars) => { + if let CharacterSet::Exclude(other_chars) = other { + order_chars(chars, other_chars) + } else { + Ordering::Greater + } + } + } + } +} + +impl PartialOrd for CharacterSet { + fn partial_cmp(&self, other: &CharacterSet) -> Option { + Some(self.cmp(other)) + } +} + +fn add_chars(left: &mut Vec, right: &Vec) { + for c in right { + match left.binary_search(c) { + Err(i) => left.insert(i, *c), + _ => {} + } + } +} + +fn remove_chars(left: &mut Vec, right: &mut Vec, mutate_right: bool) -> Vec { + let mut result = Vec::new(); + right.retain(|right_char| { + if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) { + left.remove(index); + result.push(*right_char); + false || !mutate_right + } else { + true + } + }); + result +} + +struct SetComparision { + left_only: bool, + common: bool, + right_only: bool, +} + +fn compare_chars(left: &Vec, right: &Vec) -> SetComparision { + let mut result = SetComparision { + left_only: false, + common: false, + right_only: false, + }; + let mut left = left.iter().cloned(); + let mut right = right.iter().cloned(); + let mut i = left.next(); + let mut j = right.next(); + while let (Some(left_char), Some(right_char)) = (i, j) { + if left_char < right_char { + i = left.next(); + result.left_only = true; + } else if left_char > right_char { + j = right.next(); + result.right_only = true; + } else { + i = left.next(); + j = right.next(); + result.common = true; + } + } + result +} + +fn order_chars(chars: &Vec, other_chars: &Vec) -> Ordering { + if chars.is_empty() { + if other_chars.is_empty() { + Ordering::Equal + } else { + Ordering::Less + } + } else if other_chars.is_empty() { + Ordering::Greater + } else { + let cmp = chars.len().cmp(&other_chars.len()); + if cmp != Ordering::Equal { + return cmp; + } + for (c, other_c) in chars.iter().zip(other_chars.iter()) { + let cmp = c.cmp(other_c); + if cmp != Ordering::Equal { + return cmp; + } + } + Ordering::Equal + } +} + +impl Nfa { + pub fn new() -> Self { + Nfa { states: Vec::new() } + } + + pub fn last_state_id(&self) -> u32 { + self.states.len() as u32 - 1 + } +} + +impl fmt::Debug for Nfa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + 
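+        // Writes one state per line, roughly like this (hypothetical values):
+        //   Nfa { states: {
+        //     0: Accept { variable_index: 0, precedence: 0 },
+        //   } }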
write!(f, "Nfa {{ states: {{\n")?; + for (i, state) in self.states.iter().enumerate() { + write!(f, " {}: {:?},\n", i, state)?; + } + write!(f, "}} }}")?; + Ok(()) + } +} + +impl<'a> NfaCursor<'a> { + pub fn new(nfa: &'a Nfa, mut states: Vec) -> Self { + let mut result = Self { + nfa, + state_ids: Vec::new(), + }; + result.add_states(&mut states); + result + } + + pub fn reset(&mut self, mut states: Vec) { + self.state_ids.clear(); + self.add_states(&mut states); + } + + pub fn force_reset(&mut self, states: Vec) { + self.state_ids = states + } + + pub fn transition_chars(&self) -> impl Iterator { + self.raw_transitions().map(|t| (t.0, t.1)) + } + + pub fn transitions(&self) -> Vec { + Self::group_transitions(self.raw_transitions()) + } + + fn raw_transitions(&self) -> impl Iterator { + self.state_ids.iter().filter_map(move |id| { + if let NfaState::Advance { + chars, + state_id, + precedence, + is_sep, + } = &self.nfa.states[*id as usize] + { + Some((chars, *is_sep, *precedence, *state_id)) + } else { + None + } + }) + } + + fn group_transitions<'b>( + iter: impl Iterator, + ) -> Vec { + let mut result: Vec = Vec::new(); + for (chars, is_sep, prec, state) in iter { + let mut chars = chars.clone(); + let mut i = 0; + while i < result.len() && !chars.is_empty() { + let intersection = result[i].characters.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut intersection_states = result[i].states.clone(); + match intersection_states.binary_search(&state) { + Err(j) => intersection_states.insert(j, state), + _ => {} + } + let intersection_transition = NfaTransition { + characters: intersection, + is_separator: result[i].is_separator || is_sep, + precedence: max(result[i].precedence, prec), + states: intersection_states, + }; + if result[i].characters.is_empty() { + result[i] = intersection_transition; + } else { + result.insert(i, intersection_transition); + i += 1; + } + } + i += 1; + } + if !chars.is_empty() { + result.push(NfaTransition { + characters: chars, + precedence: prec, + states: vec![state], + is_separator: is_sep, + }); + } + } + result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters)); + result + } + + pub fn completions(&self) -> impl Iterator + '_ { + self.state_ids.iter().filter_map(move |state_id| { + if let NfaState::Accept { + variable_index, + precedence, + } = self.nfa.states[*state_id as usize] + { + Some((variable_index, precedence)) + } else { + None + } + }) + } + + pub fn add_states(&mut self, new_state_ids: &mut Vec) { + let mut i = 0; + while i < new_state_ids.len() { + let state_id = new_state_ids[i]; + let state = &self.nfa.states[state_id as usize]; + if let NfaState::Split(left, right) = state { + let mut has_left = false; + let mut has_right = false; + for new_state_id in new_state_ids.iter() { + if *new_state_id == *left { + has_left = true; + } + if *new_state_id == *right { + has_right = true; + } + } + if !has_left { + new_state_ids.push(*left); + } + if !has_right { + new_state_ids.push(*right); + } + } else if let Err(i) = self.state_ids.binary_search(&state_id) { + self.state_ids.insert(i, state_id); + } + i += 1; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_group_transitions() { + let table = [ + // overlapping character classes + ( + vec![ + (CharacterSet::empty().add_range('a', 'f'), false, 0, 1), + (CharacterSet::empty().add_range('d', 'i'), false, 1, 2), + ], + vec![ + NfaTransition { + characters: CharacterSet::empty().add_range('a', 'c'), + is_separator: false, + precedence: 0, + 
states: vec![1], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('d', 'f'), + is_separator: false, + precedence: 1, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_range('g', 'i'), + is_separator: false, + precedence: 1, + states: vec![2], + }, + ], + ), + // large character class followed by many individual characters + ( + vec![ + (CharacterSet::empty().add_range('a', 'z'), false, 0, 1), + (CharacterSet::empty().add_char('d'), false, 0, 2), + (CharacterSet::empty().add_char('i'), false, 0, 3), + (CharacterSet::empty().add_char('f'), false, 0, 4), + ], + vec![ + NfaTransition { + characters: CharacterSet::empty().add_char('d'), + is_separator: false, + precedence: 0, + states: vec![1, 2], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('f'), + is_separator: false, + precedence: 0, + states: vec![1, 4], + }, + NfaTransition { + characters: CharacterSet::empty().add_char('i'), + is_separator: false, + precedence: 0, + states: vec![1, 3], + }, + NfaTransition { + characters: CharacterSet::empty() + .add_range('a', 'c') + .add_char('e') + .add_range('g', 'h') + .add_range('j', 'z'), + is_separator: false, + precedence: 0, + states: vec![1], + }, + ], + ), + // negated character class followed by an individual character + ( + vec![ + (CharacterSet::empty().add_char('0'), false, 0, 1), + (CharacterSet::empty().add_char('b'), false, 0, 2), + ( + CharacterSet::empty().add_range('a', 'f').negate(), + false, + 0, + 3, + ), + (CharacterSet::empty().add_char('c'), false, 0, 4), + ], + vec![ + NfaTransition { + characters: CharacterSet::empty().add_char('0'), + precedence: 0, + states: vec![1, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('b'), + precedence: 0, + states: vec![2], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty().add_char('c'), + precedence: 0, + states: vec![4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::empty() + .add_range('a', 'f') + .add_char('0') + .negate(), + precedence: 0, + states: vec![3], + is_separator: false, + }, + ], + ), + // multiple negated character classes + ( + vec![ + (CharacterSet::Include(vec!['a']), false, 0, 1), + (CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2), + (CharacterSet::Include(vec!['g']), false, 0, 6), + (CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3), + (CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4), + (CharacterSet::Include(vec!['g']), false, 0, 5), + ], + vec![ + NfaTransition { + characters: CharacterSet::Include(vec!['a']), + precedence: 0, + states: vec![1, 3, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['g']), + precedence: 0, + states: vec![2, 3, 5, 6], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['b', 'c']), + precedence: 0, + states: vec![3, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['h', 'i']), + precedence: 0, + states: vec![2, 3], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Include(vec!['d', 'e', 'f']), + precedence: 0, + states: vec![2, 4], + is_separator: false, + }, + NfaTransition { + characters: CharacterSet::Exclude(vec![ + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', + ]), + precedence: 0, + states: vec![2, 3, 4], + is_separator: false, + }, + ], + ), + ]; + + for row in table.iter() { + assert_eq!( + 
NfaCursor::group_transitions(row.0.iter().map(|(c, sep, p, s)| (c, *sep, *p, *s))), + row.1 + ); + } + } + + #[test] + fn test_character_set_remove_intersection() { + // A whitelist and an overlapping whitelist. + // Both sets contain 'c', 'd', and 'f' + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + let mut a = CharacterSet::empty().add_range('a', 'f'); + let mut b = CharacterSet::empty().add_range('c', 'h'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_range('c', 'f') + ); + assert_eq!(a, CharacterSet::empty().add_range('a', 'b')); + assert_eq!(b, CharacterSet::empty().add_range('g', 'h')); + + // A whitelist and a larger whitelist. + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + let mut a = CharacterSet::empty().add_char('c'); + let mut b = CharacterSet::empty().add_range('a', 'e'); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::empty().add_char('c') + ); + assert_eq!(a, CharacterSet::empty()); + assert_eq!( + b, + CharacterSet::empty() + .add_range('a', 'b') + .add_range('d', 'e') + ); + + // A whitelist and an intersecting blacklist. + // Both sets contain 'e', 'f', and 'm' + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + let mut a = CharacterSet::empty() + .add_range('c', 'h') + .add_range('k', 'm'); + let mut b = CharacterSet::empty() + .add_range('a', 'd') + .add_range('g', 'l') + .negate(); + assert_eq!( + b.remove_intersection(&mut a), + CharacterSet::Include(vec!['e', 'f', 'm']) + ); + assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l'])); + assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate()); + + // A blacklist and an overlapping blacklist. + // Both sets exclude 'c', 'd', and 'e' + let mut a = CharacterSet::empty().add_range('a', 'e').negate(); + let mut b = CharacterSet::empty().add_range('c', 'h').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('a', 'h').negate(), + ); + assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h'])); + assert_eq!(b, CharacterSet::Include(vec!['a', 'b'])); + + // A blacklist and a larger blacklist. 
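+        // The set algebra, for reference: the intersection of two Exclude sets
+        // is the Exclude of the union of their blacklists, and what remains in
+        // each set afterwards is the Include of the characters the other set
+        // excludes but it does not. The assertions below check exactly that.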
+ let mut a = CharacterSet::empty().add_range('b', 'c').negate(); + let mut b = CharacterSet::empty().add_range('a', 'd').negate(); + assert_eq!( + a.remove_intersection(&mut b), + CharacterSet::empty().add_range('a', 'd').negate(), + ); + assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d')); + assert_eq!(b, CharacterSet::empty()); + } + + #[test] + fn test_character_set_does_intersect() { + let (a, b) = (CharacterSet::empty(), CharacterSet::empty()); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('a'), + CharacterSet::empty().add_char('a'), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::empty().add_char('b'), + CharacterSet::empty().add_char('a').add_char('c'), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'b', 'c']), + ); + assert!(!a.does_intersect(&b)); + assert!(!b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Include(vec!['b']), + CharacterSet::Exclude(vec!['a', 'c']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + + let (a, b) = ( + CharacterSet::Exclude(vec!['a']), + CharacterSet::Exclude(vec!['a']), + ); + assert!(a.does_intersect(&b)); + assert!(b.does_intersect(&a)); + } +} diff --git a/cli/src/parse_grammar.rs b/cli/src/parse_grammar.rs new file mode 100644 index 00000000..6808f402 --- /dev/null +++ b/cli/src/parse_grammar.rs @@ -0,0 +1,167 @@ +use serde_json::{Map, Value}; +use crate::error::Result; +use crate::grammars::{InputGrammar, Variable, VariableType}; +use crate::rules::Rule; + +#[derive(Deserialize)] +#[serde(tag = "type")] +#[allow(non_camel_case_types)] +enum RuleJSON { + ALIAS { + content: Box, + named: bool, + value: String, + }, + BLANK, + STRING { + value: String, + }, + PATTERN { + value: String, + }, + SYMBOL { + name: String, + }, + CHOICE { + members: Vec, + }, + SEQ { + members: Vec, + }, + REPEAT { + content: Box, + }, + REPEAT1 { + content: Box, + }, + PREC_DYNAMIC { + value: i32, + content: Box, + }, + PREC_LEFT { + value: i32, + content: Box, + }, + PREC_RIGHT { + value: i32, + content: Box, + }, + PREC { + value: i32, + content: Box, + }, + TOKEN { + content: Box, + }, + IMMEDIATE_TOKEN { + content: Box, + }, +} + +#[derive(Deserialize)] +struct GrammarJSON { + name: String, + rules: Map, + conflicts: Option>>, + externals: Option>, + extras: Option>, + inline: Option>, + word: Option, +} + +pub(crate) fn parse_grammar(input: &str) -> Result { + let grammar_json: GrammarJSON = serde_json::from_str(&input)?; + + let mut variables = Vec::with_capacity(grammar_json.rules.len()); + for (name, value) in grammar_json.rules { + variables.push(Variable { + name: name.to_owned(), + kind: VariableType::Named, + rule: parse_rule(serde_json::from_value(value)?), + }) + } + + let extra_tokens = grammar_json.extras + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let external_tokens = grammar_json.externals + .unwrap_or(Vec::new()) + .into_iter() + .map(parse_rule) + .collect(); + let expected_conflicts = grammar_json.conflicts + .unwrap_or(Vec::new()); + let variables_to_inline = grammar_json.inline + .unwrap_or(Vec::new()); + + Ok(InputGrammar { + name: grammar_json.name, + word_token: grammar_json.word, + variables, + extra_tokens, + expected_conflicts, + external_tokens, + variables_to_inline, + }) +} + +fn parse_rule(json: RuleJSON) -> Rule { + 
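+    // A sketch of the mapping: a JSON node such as
+    //   {"type": "REPEAT", "content": {"type": "SYMBOL", "name": "x"}}
+    // becomes `choice(repeat(sym), blank)` below, because a zero-or-more
+    // repeat is lowered to an optional one-or-more repeat.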
match json { + RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named), + RuleJSON::BLANK => Rule::Blank, + RuleJSON::STRING { value } => Rule::String(value), + RuleJSON::PATTERN { value } => Rule::Pattern(value), + RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name), + RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()), + RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()), + RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)), + RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]), + RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)), + RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)), + RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)), + RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)), + RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)), + RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_grammar() { + let grammar = parse_grammar(r#"{ + "name": "my_lang", + "rules": { + "file": { + "type": "REPEAT1", + "content": { + "type": "SYMBOL", + "name": "statement" + } + }, + "statement": { + "type": "STRING", + "value": "foo" + } + } + }"#).unwrap(); + + assert_eq!(grammar.name, "my_lang"); + assert_eq!(grammar.variables, vec![ + Variable { + name: "file".to_string(), + kind: VariableType::Named, + rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) + }, + Variable { + name: "statement".to_string(), + kind: VariableType::Named, + rule: Rule::String("foo".to_string()) + }, + ]); + } +} diff --git a/cli/src/prepare_grammar/expand_repeats.rs b/cli/src/prepare_grammar/expand_repeats.rs new file mode 100644 index 00000000..4589bd11 --- /dev/null +++ b/cli/src/prepare_grammar/expand_repeats.rs @@ -0,0 +1,241 @@ +use super::ExtractedSyntaxGrammar; +use crate::grammars::{Variable, VariableType}; +use crate::rules::{Rule, Symbol}; +use hashbrown::HashMap; +use std::mem; + +struct Expander { + variable_name: String, + repeat_count_in_variable: usize, + preceding_symbol_count: usize, + auxiliary_variables: Vec, + existing_repeats: HashMap, +} + +impl Expander { + fn expand_variable(&mut self, variable: &mut Variable) { + self.variable_name.clear(); + self.variable_name.push_str(&variable.name); + self.repeat_count_in_variable = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.expand_rule(&rule); + } + + fn expand_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Choice(elements) => Rule::Choice( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), + + Rule::Seq(elements) => Rule::Seq( + elements + .iter() + .map(|element| self.expand_rule(element)) + .collect(), + ), + + Rule::Repeat(content) => { + let inner_rule = self.expand_rule(content); + + if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) { + return Rule::Symbol(*existing_symbol); + } + + self.repeat_count_in_variable += 1; + let rule_name = format!( + "{}_repeat{}", + self.variable_name, self.repeat_count_in_variable + ); + let repeat_symbol = Symbol::non_terminal( + self.preceding_symbol_count + self.auxiliary_variables.len(), + ); + self.existing_repeats + 
.insert(inner_rule.clone(), repeat_symbol); + self.auxiliary_variables.push(Variable { + name: rule_name, + kind: VariableType::Auxiliary, + rule: Rule::Choice(vec![ + Rule::Seq(vec![ + Rule::Symbol(repeat_symbol), + Rule::Symbol(repeat_symbol), + ]), + inner_rule, + ]), + }); + + Rule::Symbol(repeat_symbol) + } + + Rule::Metadata { rule, params } => Rule::Metadata { + rule: Box::new(self.expand_rule(rule)), + params: params.clone(), + }, + + _ => rule.clone(), + } + } +} + +pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSyntaxGrammar { + let mut expander = Expander { + variable_name: String::new(), + repeat_count_in_variable: 0, + preceding_symbol_count: grammar.variables.len(), + auxiliary_variables: Vec::new(), + existing_repeats: HashMap::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + expander.expand_variable(&mut variable); + } + + grammar + .variables + .extend(expander.auxiliary_variables.into_iter()); + grammar +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + // Repeats nested inside of sequences and choices are expanded. + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![ + Rule::repeat(Rule::terminal(11)), + Rule::repeat(Rule::terminal(12)), + ]), + Rule::terminal(13), + ]), + )])); + + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![ + Rule::terminal(10), + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]), + Rule::terminal(13), + ]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(11), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(12), + ]) + ), + ] + ); + } + + #[test] + fn test_repeat_deduplication() { + // Terminal 4 appears inside of a repeat in three different places. + let grammar = expand_repeats(build_grammar(vec![ + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]), + Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]), + ]), + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]), + ), + ])); + + // Only one auxiliary rule is created for repeating terminal 4. 
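+        // Deduplication falls out of `existing_repeats` being keyed on the
+        // expanded inner rule: all three `repeat(terminal(4))` sites look up
+        // the same entry and are rewritten to the shared `non_terminal(2)`.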
+ assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::choice(vec![ + Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]), + Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]), + ]) + ), + Variable::named( + "rule1", + Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::terminal(4), + ]) + ) + ] + ); + } + + #[test] + fn test_expansion_of_nested_repeats() { + let grammar = expand_repeats(build_grammar(vec![Variable::named( + "rule0", + Rule::seq(vec![ + Rule::terminal(10), + Rule::repeat(Rule::seq(vec![ + Rule::terminal(11), + Rule::repeat(Rule::terminal(12)), + ])), + ]), + )])); + + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "rule0", + Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),]) + ), + Variable::auxiliary( + "rule0_repeat1", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]), + Rule::terminal(12), + ]) + ), + Variable::auxiliary( + "rule0_repeat2", + Rule::choice(vec![ + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]), + Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]), + ]) + ), + ] + ); + } + + fn build_grammar(variables: Vec) -> ExtractedSyntaxGrammar { + ExtractedSyntaxGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/cli/src/prepare_grammar/expand_tokens.rs b/cli/src/prepare_grammar/expand_tokens.rs new file mode 100644 index 00000000..2678df19 --- /dev/null +++ b/cli/src/prepare_grammar/expand_tokens.rs @@ -0,0 +1,611 @@ +use super::ExtractedLexicalGrammar; +use crate::error::{Error, Result}; +use crate::grammars::{LexicalGrammar, LexicalVariable}; +use crate::nfa::{CharacterSet, Nfa, NfaState}; +use crate::rules::Rule; +use regex_syntax::ast::{ + parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange, +}; +use std::i32; + +struct NfaBuilder { + nfa: Nfa, + is_sep: bool, + precedence_stack: Vec, +} + +fn get_implicit_precedence(rule: &Rule) -> i32 { + match rule { + Rule::String(_) => 1, + Rule::Metadata { rule, params } => { + if params.is_main_token { + get_implicit_precedence(rule) + 2 + } else { + get_implicit_precedence(rule) + } + } + _ => 0, + } +} + +fn get_completion_precedence(rule: &Rule) -> i32 { + match rule { + Rule::Metadata { params, .. } => params.precedence.unwrap_or(0), + _ => 0, + } +} + +pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result { + let mut builder = NfaBuilder { + nfa: Nfa::new(), + is_sep: true, + precedence_stack: vec![0], + }; + + let separator_rule = if grammar.separators.len() > 0 { + grammar.separators.push(Rule::Blank); + Rule::repeat(Rule::choice(grammar.separators)) + } else { + Rule::Blank + }; + + let mut variables = Vec::new(); + for (i, variable) in grammar.variables.into_iter().enumerate() { + let is_immediate_token = match &variable.rule { + Rule::Metadata { params, .. 
} => params.is_main_token, + _ => false, + }; + + builder.is_sep = false; + builder.nfa.states.push(NfaState::Accept { + variable_index: i, + precedence: get_completion_precedence(&variable.rule), + }); + let last_state_id = builder.nfa.last_state_id(); + builder + .expand_rule(&variable.rule, last_state_id) + .map_err(|Error(msg)| Error(format!("Rule {} {}", variable.name, msg)))?; + + if !is_immediate_token { + builder.is_sep = true; + let last_state_id = builder.nfa.last_state_id(); + builder.expand_rule(&separator_rule, last_state_id)?; + } + + variables.push(LexicalVariable { + name: variable.name, + kind: variable.kind, + implicit_precedence: get_implicit_precedence(&variable.rule), + start_state: builder.nfa.last_state_id(), + }); + } + + Ok(LexicalGrammar { + nfa: builder.nfa, + variables, + }) +} + +impl NfaBuilder { + fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result { + match rule { + Rule::Pattern(s) => { + let ast = parse::Parser::new() + .parse(&s) + .map_err(|e| Error(e.to_string()))?; + self.expand_regex(&ast, next_state_id) + } + Rule::String(s) => { + for c in s.chars().rev() { + self.push_advance(CharacterSet::empty().add_char(c), next_state_id); + next_state_id = self.nfa.last_state_id(); + } + Ok(s.len() > 0) + } + Rule::Choice(elements) => { + let mut alternative_state_ids = Vec::new(); + for element in elements { + if self.expand_rule(element, next_state_id)? { + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + for alternative_state_id in alternative_state_ids { + self.push_split(alternative_state_id); + } + Ok(true) + } + Rule::Seq(elements) => { + let mut result = false; + for element in elements.into_iter().rev() { + if self.expand_rule(element, next_state_id)? { + result = true; + } + next_state_id = self.nfa.last_state_id(); + } + Ok(result) + } + Rule::Repeat(rule) => { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_rule(rule, split_state_id)? 
{ + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); + Ok(true) + } else { + Ok(false) + } + } + Rule::Metadata { rule, params } => { + if let Some(precedence) = params.precedence { + self.precedence_stack.push(precedence); + } + let result = self.expand_rule(rule, next_state_id); + if params.precedence.is_some() { + self.precedence_stack.pop(); + } + result + } + Rule::Blank => Ok(false), + _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))), + } + } + + fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result { + match ast { + Ast::Empty(_) => Ok(false), + Ast::Flags(_) => Err(Error::regex("Flags are not supported")), + Ast::Literal(literal) => { + self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id); + Ok(true) + } + Ast::Dot(_) => { + self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id); + Ok(true) + } + Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")), + Ast::Class(class) => match class { + Class::Unicode(_) => { + Err(Error::regex("Unicode character classes are not supported")) + } + Class::Perl(class) => { + let mut chars = self.expand_perl_character_class(&class.kind); + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); + Ok(true) + } + Class::Bracketed(class) => match &class.kind { + ClassSet::Item(item) => { + let mut chars = self.expand_character_class(&item)?; + if class.negated { + chars = chars.negate(); + } + self.push_advance(chars, next_state_id); + Ok(true) + } + ClassSet::BinaryOp(_) => Err(Error::regex( + "Binary operators in character classes aren't supported", + )), + }, + }, + Ast::Repetition(repetition) => match repetition.op.kind { + RepetitionKind::ZeroOrOne => { + self.expand_zero_or_one(&repetition.ast, next_state_id) + } + RepetitionKind::OneOrMore => { + self.expand_one_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::ZeroOrMore => { + self.expand_zero_or_more(&repetition.ast, next_state_id) + } + RepetitionKind::Range(RepetitionRange::Exactly(count)) => { + self.expand_count(&repetition.ast, count, next_state_id) + } + RepetitionKind::Range(RepetitionRange::AtLeast(min)) => { + if self.expand_zero_or_more(&repetition.ast, next_state_id)? { + self.expand_count(&repetition.ast, min, next_state_id) + } else { + Ok(false) + } + } + RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => { + let mut result = self.expand_count(&repetition.ast, min, next_state_id)?; + for _ in min..max { + if result { + next_state_id = self.nfa.last_state_id(); + } + if self.expand_zero_or_one(&repetition.ast, next_state_id)? { + result = true; + } + } + Ok(result) + } + }, + Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()), + Ast::Alternation(alternation) => { + let mut alternative_state_ids = Vec::new(); + for ast in alternation.asts.iter() { + if self.expand_regex(&ast, next_state_id)? { + alternative_state_ids.push(self.nfa.last_state_id()); + } else { + alternative_state_ids.push(next_state_id); + } + } + alternative_state_ids.sort_unstable(); + alternative_state_ids.dedup(); + alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); + + for alternative_state_id in alternative_state_ids { + self.push_split(alternative_state_id); + } + Ok(true) + } + Ast::Concat(concat) => { + let mut result = false; + for ast in concat.asts.iter().rev() { + if self.expand_regex(&ast, next_state_id)? 
{ + result = true; + next_state_id = self.nfa.last_state_id(); + } + } + Ok(result) + } + } + } + + fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + self.nfa.states.push(NfaState::Accept { + variable_index: 0, + precedence: 0, + }); // Placeholder for split + let split_state_id = self.nfa.last_state_id(); + if self.expand_regex(&ast, split_state_id)? { + self.nfa.states[split_state_id as usize] = + NfaState::Split(self.nfa.last_state_id(), next_state_id); + Ok(true) + } else { + self.nfa.states.pop(); + Ok(false) + } + } + + fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_regex(ast, next_state_id)? { + self.push_split(next_state_id); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result { + if self.expand_one_or_more(&ast, next_state_id)? { + self.push_split(next_state_id); + Ok(true) + } else { + Ok(false) + } + } + + fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result { + let mut result = false; + for _ in 0..count { + if self.expand_regex(ast, next_state_id)? { + result = true; + next_state_id = self.nfa.last_state_id(); + } + } + Ok(result) + } + + fn expand_character_class(&self, item: &ClassSetItem) -> Result { + match item { + ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())), + ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])), + ClassSetItem::Range(range) => { + Ok(CharacterSet::empty().add_range(range.start.c, range.end.c)) + } + ClassSetItem::Union(union) => { + let mut result = CharacterSet::empty(); + for item in &union.items { + result = result.add(&self.expand_character_class(&item)?); + } + Ok(result) + } + ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)), + _ => Err(Error::regex(&format!( + "Unsupported character class syntax {:?}", + item + ))), + } + } + + fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet { + match item { + ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'), + ClassPerlKind::Space => CharacterSet::empty() + .add_char(' ') + .add_char('\t') + .add_char('\r') + .add_char('\n'), + ClassPerlKind::Word => CharacterSet::empty() + .add_char('_') + .add_range('A', 'Z') + .add_range('a', 'z') + .add_range('0', '9'), + } + } + + fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { + let precedence = *self.precedence_stack.last().unwrap(); + self.nfa.states.push(NfaState::Advance { + chars, + state_id, + precedence, + is_sep: self.is_sep, + }); + } + + fn push_split(&mut self, state_id: u32) { + let last_state_id = self.nfa.last_state_id(); + self.nfa + .states + .push(NfaState::Split(state_id, last_state_id)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::Variable; + use crate::nfa::{NfaCursor, NfaTransition}; + + fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { + let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); + let mut cursor = NfaCursor::new(&grammar.nfa, start_states); + + let mut result = None; + let mut result_precedence = i32::MIN; + let mut start_char = 0; + let mut end_char = 0; + for c in s.chars() { + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = precedence; + } + } + if let Some(NfaTransition { + states, + is_separator, + .. 
+ }) = cursor + .transitions() + .into_iter() + .find(|t| t.characters.contains(c) && t.precedence >= result_precedence) + { + cursor.reset(states); + end_char += 1; + if is_separator { + start_char = end_char; + } + } else { + break; + } + } + + for (id, precedence) in cursor.completions() { + if result.is_none() || result_precedence <= precedence { + result = Some((id, &s[start_char..end_char])); + result_precedence = precedence; + } + } + + result + } + + #[test] + fn test_rule_expansion() { + struct Row { + rules: Vec, + separators: Vec, + examples: Vec<(&'static str, Option<(usize, &'static str)>)>, + } + + let table = [ + // regex with sequences and alternatives + Row { + rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")], + separators: vec![], + examples: vec![ + ("ade1", Some((0, "ade"))), + ("bdf1", Some((0, "bdf"))), + ("bdfh1", Some((0, "bdfh"))), + ("ad1", None), + ], + }, + // regex with repeats + Row { + rules: vec![Rule::pattern("a*")], + separators: vec![], + examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))], + }, + // regex with repeats in sequences + Row { + rules: vec![Rule::pattern("a((bc)+|(de)*)f")], + separators: vec![], + examples: vec![ + ("af1", Some((0, "af"))), + ("adedef1", Some((0, "adedef"))), + ("abcbcbcf1", Some((0, "abcbcbcf"))), + ("a", None), + ], + }, + // regex with character ranges + Row { + rules: vec![Rule::pattern("[a-fA-F0-9]+")], + separators: vec![], + examples: vec![("A1ff0.", Some((0, "A1ff0")))], + }, + // regex with perl character classes + Row { + rules: vec![Rule::pattern("\\w\\d\\s")], + separators: vec![], + examples: vec![("_0 ", Some((0, "_0 ")))], + }, + // string + Row { + rules: vec![Rule::string("abc")], + separators: vec![], + examples: vec![("abcd", Some((0, "abc"))), ("ab", None)], + }, + // complex rule containing strings and regexes + Row { + rules: vec![Rule::repeat(Rule::seq(vec![ + Rule::string("{"), + Rule::pattern("[a-f]+"), + Rule::string("}"), + ]))], + separators: vec![], + examples: vec![ + ("{a}{", Some((0, "{a}"))), + ("{a}{d", Some((0, "{a}"))), + ("ab", None), + ], + }, + // longest match rule + Row { + rules: vec![ + Rule::pattern("a|bc"), + Rule::pattern("aa"), + Rule::pattern("bcd"), + ], + separators: vec![], + examples: vec![ + ("a.", Some((0, "a"))), + ("bc.", Some((0, "bc"))), + ("aa.", Some((1, "aa"))), + ("bcd?", Some((2, "bcd"))), + ("b.", None), + ("c.", None), + ], + }, + // regex with an alternative including the empty string + Row { + rules: vec![Rule::pattern("a(b|)+c")], + separators: vec![], + examples: vec![ + ("ac.", Some((0, "ac"))), + ("abc.", Some((0, "abc"))), + ("abbc.", Some((0, "abbc"))), + ], + }, + // separators + Row { + rules: vec![Rule::pattern("[a-f]+")], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], + examples: vec![ + (" a", Some((0, "a"))), + (" \nb", Some((0, "b"))), + (" \\a", None), + (" \\\na", Some((0, "a"))), + ], + }, + // shorter tokens with higher precedence + Row { + rules: vec![ + Rule::prec(2, Rule::pattern("abc")), + Rule::prec(1, Rule::pattern("ab[cd]e")), + Rule::pattern("[a-e]+"), + ], + separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")], + examples: vec![ + ("abceef", Some((0, "abc"))), + ("abdeef", Some((1, "abde"))), + ("aeeeef", Some((2, "aeeee"))), + ], + }, + // immediate tokens with higher precedence + Row { + rules: vec![ + Rule::prec(1, Rule::pattern("[^a]+")), + Rule::immediate_token(Rule::prec(2, Rule::pattern("[^ab]+"))), + ], + separators: vec![Rule::pattern("\\s")], + examples: vec![("cccb", Some((1, "ccc")))], 
+ }, + Row { + rules: vec![Rule::seq(vec![ + Rule::string("a"), + Rule::choice(vec![Rule::string("b"), Rule::string("c")]), + Rule::string("d"), + ])], + separators: vec![], + examples: vec![ + ("abd", Some((0, "abd"))), + ("acd", Some((0, "acd"))), + ("abc", None), + ("ad", None), + ("d", None), + ("a", None), + ], + }, + // nested choices within sequences + Row { + rules: vec![Rule::seq(vec![ + Rule::pattern("[0-9]+"), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![Rule::seq(vec![ + Rule::choice(vec![Rule::string("e"), Rule::string("E")]), + Rule::choice(vec![ + Rule::Blank, + Rule::choice(vec![Rule::string("+"), Rule::string("-")]), + ]), + Rule::pattern("[0-9]+"), + ])]), + ]), + ])], + separators: vec![], + examples: vec![ + ("12", Some((0, "12"))), + ("12e", Some((0, "12"))), + ("12g", Some((0, "12"))), + ("12e3", Some((0, "12e3"))), + ("12e+", Some((0, "12"))), + ("12E+34 +", Some((0, "12E+34"))), + ("12e34", Some((0, "12e34"))), + ], + }, + ]; + + for Row { + rules, + separators, + examples, + } in &table + { + let grammar = expand_tokens(ExtractedLexicalGrammar { + separators: separators.clone(), + variables: rules + .into_iter() + .map(|rule| Variable::named("", rule.clone())) + .collect(), + }) + .unwrap(); + + for (haystack, needle) in examples.iter() { + assert_eq!(simulate_nfa(&grammar, haystack), *needle); + } + } + } +} diff --git a/cli/src/prepare_grammar/extract_simple_aliases.rs b/cli/src/prepare_grammar/extract_simple_aliases.rs new file mode 100644 index 00000000..aa8b3f77 --- /dev/null +++ b/cli/src/prepare_grammar/extract_simple_aliases.rs @@ -0,0 +1,199 @@ +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; +use crate::grammars::{LexicalGrammar, SyntaxGrammar}; + +#[derive(Clone, Default)] +struct SymbolStatus { + alias: Option, + conflicting: bool, +} + +pub(super) fn extract_simple_aliases( + syntax_grammar: &mut SyntaxGrammar, + lexical_grammar: &LexicalGrammar +) -> AliasMap { + // Determine which symbols in the grammars are *always* aliased to a single name. + let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; + let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + for variable in syntax_grammar.variables.iter() { + for production in variable.productions.iter() { + for step in production.steps.iter() { + let mut status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &mut external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index], + Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), + }; + + if step.alias.is_none() { + status.alias = None; + status.conflicting = true; + } + + if !status.conflicting { + if status.alias.is_none() { + status.alias = step.alias.clone(); + } else if status.alias != step.alias { + status.alias = None; + status.conflicting = true; + } + } + } + } + } + + // Remove the aliases for those symbols. 
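+ // (A symbol qualifies only if every single use of it is aliased, and
+ // always to the same name: one bare use, or two uses with different
+ // aliases, flips its status to conflicting. For example:
+ //
+ //   uses: "a1", "a1", "a1"  =>  simple alias "a1"
+ //   uses: "a1", (none)      =>  conflicting
+ //   uses: "a1", "a2"        =>  conflicting
+ //
+ // A surviving alias can then be applied unconditionally through the
+ // returned AliasMap, so the steps themselves no longer need to carry it.)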
+ for variable in syntax_grammar.variables.iter_mut() { + for production in variable.productions.iter_mut() { + for step in production.steps.iter_mut() { + let status = match step.symbol { + Symbol { kind: SymbolType::External, index} => &external_status_list[index], + Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index], + Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index], + Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"), + }; + + if status.alias.is_some() { + step.alias = None; + } + } + } + } + + // Populate a map of the symbols to their aliases. + let mut result = AliasMap::new(); + for (i, status) in terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::terminal(i), alias); + } + } + for (i, status) in non_terminal_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::non_terminal(i), alias); + } + } + for (i, status) in external_status_list.into_iter().enumerate() { + if let Some(alias) = status.alias { + result.insert(Symbol::external(i), alias); + } + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep}; + use crate::nfa::Nfa; + + #[test] + fn test_extract_simple_aliases() { + let mut syntax_grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + + // Token 1 is aliased above, but not here. + ProductionStep::new(Symbol::terminal(1)), + + // Token 2 is aliased differently than above. 
+ ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ], + extra_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), + variables: vec![ + LexicalVariable { + name: "t1".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t2".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t3".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + } + ], + }; + + let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); + assert_eq!(simple_aliases.len(), 1); + assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias { + value: "a1".to_string(), + is_named: true, + }); + + assert_eq!(syntax_grammar.variables, vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + // 'Simple' alias removed + ProductionStep::new(Symbol::terminal(0)), + + // Other aliases unchanged + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ], + }, + ], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)), + ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), + ], + }, + ], + }, + ]); + } +} diff --git a/cli/src/prepare_grammar/extract_tokens.rs b/cli/src/prepare_grammar/extract_tokens.rs new file mode 100644 index 00000000..5a54d34e --- /dev/null +++ b/cli/src/prepare_grammar/extract_tokens.rs @@ -0,0 +1,525 @@ +use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; +use crate::error::{Error, Result}; +use crate::grammars::{ExternalToken, Variable, VariableType}; +use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; +use hashbrown::HashMap; +use std::mem; + +pub(super) fn extract_tokens( + mut grammar: InternedGrammar, +) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { + let mut extractor = TokenExtractor { + current_variable_name: String::new(), + current_variable_token_count: 0, + extracted_variables: Vec::new(), + extracted_usage_counts: Vec::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + for mut variable in grammar.external_tokens.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len()); + for variable in extractor.extracted_variables { + lexical_variables.push(Variable { + name: variable.name, + kind: variable.kind, + rule: variable.rule, + }); + } + + // If a variable's entire rule was extracted as a token and that token didn't + // appear within any other rule, then remove that variable from the syntax + // grammar, giving its name to the token in the lexical grammar. Any symbols + // that pointed to that variable will need to be updated to point to the + // variable in the lexical grammar. Symbols that pointed to later variables + // will need to have their indices decremented. 
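A quick standalone sketch of the index arithmetic that comment describes may help; the names below are illustrative only (not the crate's API), and the real logic lives in `SymbolReplacer` further down this file:

```rust
use std::collections::HashMap;

// Hypothetical standalone form of the remapping rule: a non-terminal that was
// not itself replaced shifts down one slot for every removed variable that
// preceded it.
fn adjusted_index(index: usize, replacements: &HashMap<usize, usize>) -> usize {
    index - replacements.keys().filter(|&&removed| removed < index).count()
}

fn main() {
    // Suppose variables 1 and 3 were moved into the lexical grammar.
    let replacements: HashMap<usize, usize> = [(1, 0), (3, 2)].iter().cloned().collect();
    assert_eq!(adjusted_index(2, &replacements), 1); // one earlier removal
    assert_eq!(adjusted_index(4, &replacements), 2); // two earlier removals
}
```

The `SymbolReplacer` below implements this same rule, after first mapping any symbol that was itself replaced directly to its new terminal index.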
+ let mut variables = Vec::new(); + let mut symbol_replacer = SymbolReplacer { + replacements: HashMap::new(), + }; + for (i, variable) in grammar.variables.into_iter().enumerate() { + if let Rule::Symbol(Symbol { + kind: SymbolType::Terminal, + index, + }) = variable.rule + { + if i > 0 && extractor.extracted_usage_counts[index] == 1 { + let mut lexical_variable = &mut lexical_variables[index]; + lexical_variable.kind = variable.kind; + lexical_variable.name = variable.name; + symbol_replacer.replacements.insert(i, index); + continue; + } + } + variables.push(variable); + } + + for variable in variables.iter_mut() { + variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); + } + + let expected_conflicts = grammar + .expected_conflicts + .into_iter() + .map(|conflict| { + let mut result: Vec<_> = conflict + .iter() + .map(|symbol| symbol_replacer.replace_symbol(*symbol)) + .collect(); + result.sort_unstable(); + result.dedup(); + result + }) + .collect(); + + let variables_to_inline = grammar + .variables_to_inline + .into_iter() + .map(|symbol| symbol_replacer.replace_symbol(symbol)) + .collect(); + + let mut separators = Vec::new(); + let mut extra_tokens = Vec::new(); + for rule in grammar.extra_tokens { + if let Rule::Symbol(symbol) = rule { + let new_symbol = symbol_replacer.replace_symbol(symbol); + if new_symbol.is_non_terminal() { + return Err(Error(format!( + "Non-token symbol '{}' cannot be used as an extra token", + &variables[new_symbol.index].name + ))); + } else { + extra_tokens.push(new_symbol); + } + } else { + if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) { + extra_tokens.push(Symbol::terminal(index)); + } else { + separators.push(rule); + } + } + } + + let mut external_tokens = Vec::new(); + for external_token in grammar.external_tokens { + let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); + if let Rule::Symbol(symbol) = rule { + if symbol.is_non_terminal() { + return Err(Error(format!( + "Rule '{}' cannot be used as both an external token and a non-terminal rule", + &variables[symbol.index].name, + ))); + } + + if symbol.is_external() { + external_tokens.push(ExternalToken { + name: external_token.name, + kind: external_token.kind, + corresponding_internal_token: None, + }) + } else { + external_tokens.push(ExternalToken { + name: lexical_variables[symbol.index].name.clone(), + kind: external_token.kind, + corresponding_internal_token: Some(symbol), + }) + } + } else { + return Err(Error(format!( + "Non-symbol rules cannot be used as external tokens" + ))); + } + } + + let mut word_token = None; + if let Some(token) = grammar.word_token { + let token = symbol_replacer.replace_symbol(token); + if token.is_non_terminal() { + return Err(Error(format!( + "Non-terminal symbol '{}' cannot be used as the word token", + &variables[token.index].name + ))); + } + word_token = Some(token); + } + + Ok(( + ExtractedSyntaxGrammar { + variables, + expected_conflicts, + extra_tokens, + variables_to_inline, + external_tokens, + word_token, + }, + ExtractedLexicalGrammar { + variables: lexical_variables, + separators, + }, + )) +} + +struct TokenExtractor { + current_variable_name: String, + current_variable_token_count: usize, + extracted_variables: Vec, + extracted_usage_counts: Vec, +} + +struct SymbolReplacer { + replacements: HashMap, +} + +impl TokenExtractor { + fn extract_tokens_in_variable(&mut self, variable: &mut Variable) { + self.current_variable_name.clear(); + 
self.current_variable_name.push_str(&variable.name); + self.current_variable_token_count = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.extract_tokens_in_rule(&rule); + } + + fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule { + match input { + Rule::String(name) => self.extract_token(input, Some(name)).into(), + Rule::Pattern(..) => self.extract_token(input, None).into(), + Rule::Metadata { params, rule } => { + if params.is_token { + let mut params = params.clone(); + params.is_token = false; + + let mut string_value = None; + if let Rule::String(value) = rule.as_ref() { + string_value = Some(value); + } + + let rule_to_extract = if params == MetadataParams::default() { + rule.as_ref() + } else { + input + }; + + self.extract_token(rule_to_extract, string_value).into() + } else { + Rule::Metadata { + params: params.clone(), + rule: Box::new(self.extract_tokens_in_rule((&rule).clone())), + } + } + } + Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))), + Rule::Seq(elements) => Rule::Seq( + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), + ), + Rule::Choice(elements) => Rule::Choice( + elements + .iter() + .map(|e| self.extract_tokens_in_rule(e)) + .collect(), + ), + _ => input.clone(), + } + } + + fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol { + for (i, variable) in self.extracted_variables.iter_mut().enumerate() { + if variable.rule == *rule { + self.extracted_usage_counts[i] += 1; + return Symbol::terminal(i); + } + } + + let index = self.extracted_variables.len(); + let variable = if let Some(string_value) = string_value { + Variable { + name: string_value.clone(), + kind: VariableType::Anonymous, + rule: rule.clone() + } + } else { + self.current_variable_token_count += 1; + Variable { + name: format!( + "{}_token{}", + &self.current_variable_name, self.current_variable_token_count + ), + kind: VariableType::Auxiliary, + rule: rule.clone(), + } + }; + + self.extracted_variables.push(variable); + self.extracted_usage_counts.push(1); + Symbol::terminal(index) + } +} + +impl SymbolReplacer { + fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), + Rule::Choice(elements) => Rule::Choice( + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), + ), + Rule::Seq(elements) => Rule::Seq( + elements + .iter() + .map(|e| self.replace_symbols_in_rule(e)) + .collect(), + ), + Rule::Repeat(content) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(content))), + Rule::Metadata { rule, params } => Rule::Metadata { + params: params.clone(), + rule: Box::new(self.replace_symbols_in_rule(rule)), + }, + _ => rule.clone(), + } + } + + fn replace_symbol(&self, symbol: Symbol) -> Symbol { + if !symbol.is_non_terminal() { + return symbol; + } + + if let Some(replacement) = self.replacements.get(&symbol.index) { + return Symbol::terminal(*replacement); + } + + let mut adjusted_index = symbol.index; + for (replaced_index, _) in self.replacements.iter() { + if *replaced_index < symbol.index { + adjusted_index -= 1; + } + } + + return Symbol::non_terminal(adjusted_index); + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::grammars::VariableType; + + #[test] + fn test_extraction() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + 
Rule::string("a"), + Rule::pattern("b"), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + ]), + ])), + ), + Variable::named("rule_1", Rule::pattern("e")), + Variable::named("rule_2", Rule::pattern("b")), + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]), + ), + ])) + .unwrap(); + + assert_eq!( + syntax_grammar.variables, + vec![ + Variable::named( + "rule_0", + Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(3), + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(2), + ]) + ])) + ), + // The pattern "e" was only used in once place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named( + "rule_3", + Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,]) + ), + ] + ); + + assert_eq!( + lexical_grammar.variables, + vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary( + "rule_0_token2", + Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),])) + ), + Variable::named("rule_1", Rule::pattern("e")), + ] + ); + } + + #[test] + fn test_start_rule_is_token() { + let (syntax_grammar, lexical_grammar) = + extract_tokens(build_grammar(vec![Variable::named( + "rule_0", + Rule::string("hello"), + )])) + .unwrap(); + + assert_eq!( + syntax_grammar.variables, + vec![Variable::named("rule_0", Rule::terminal(0)),] + ); + assert_eq!( + lexical_grammar.variables, + vec![Variable::anonymous("hello", Rule::string("hello")),] + ) + } + + #[test] + fn test_extracting_extra_tokens() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::string("x")), + Variable::named("comment", Rule::pattern("//.*")), + ]); + grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)]; + + let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); + assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]); + assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]); + } + + #[test] + fn test_extract_externals() { + let mut grammar = build_grammar(vec![ + Variable::named( + "rule_0", + Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ]), + ), + Variable::named("rule_1", Rule::string("b")), + Variable::named("rule_2", Rule::string("c")), + ]); + grammar.external_tokens = vec![ + Variable::named("external_0", Rule::external(0)), + Variable::anonymous("a", Rule::string("a")), + Variable::named("rule_2", Rule::non_terminal(2)), + ]; + + let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); + + assert_eq!( + syntax_grammar.external_tokens, + vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + 
}, + ExternalToken { + name: "a".to_string(), + kind: VariableType::Anonymous, + corresponding_internal_token: Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ] + ); + } + + #[test] + fn test_error_on_non_terminal_symbol_extras() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::non_terminal(1)), + Variable::named("rule_1", Rule::non_terminal(2)), + Variable::named("rule_2", Rule::string("x")), + ]); + grammar.extra_tokens = vec![Rule::non_terminal(1)]; + + match extract_tokens(grammar) { + Err(Error(s)) => { + assert_eq!( + s, + "Non-token symbol 'rule_1' cannot be used as an extra token" + ); + } + _ => { + panic!("Expected an error but got no error"); + } + } + } + + #[test] + fn test_error_on_external_with_same_name_as_non_terminal() { + let mut grammar = build_grammar(vec![ + Variable::named( + "rule_0", + Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), + ), + Variable::named( + "rule_1", + Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]), + ), + Variable::named("rule_2", Rule::string("a")), + ]); + grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; + + match extract_tokens(grammar) { + Err(Error(s)) => { + assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); + } + _ => { + panic!("Expected an error but got no error"); + } + } + } + + fn build_grammar(variables: Vec) -> InternedGrammar { + InternedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/cli/src/prepare_grammar/flatten_grammar.rs b/cli/src/prepare_grammar/flatten_grammar.rs new file mode 100644 index 00000000..3ffef086 --- /dev/null +++ b/cli/src/prepare_grammar/flatten_grammar.rs @@ -0,0 +1,313 @@ +use super::ExtractedSyntaxGrammar; +use crate::error::Result; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable}; +use crate::rules::{Alias, Associativity, Rule}; + +struct RuleFlattener { + production: Production, + precedence_stack: Vec, + associativity_stack: Vec, + alias_stack: Vec, +} + +impl RuleFlattener { + fn new() -> Self { + Self { + production: Production { + steps: Vec::new(), + dynamic_precedence: 0, + }, + precedence_stack: Vec::new(), + associativity_stack: Vec::new(), + alias_stack: Vec::new(), + } + } + + fn flatten(mut self, rule: Rule) -> Production { + self.apply(rule, true); + self.production + } + + fn apply(&mut self, rule: Rule, at_end: bool) { + match rule { + Rule::Seq(members) => { + let last_index = members.len() - 1; + for (i, member) in members.into_iter().enumerate() { + self.apply(member, i == last_index && at_end); + } + } + Rule::Metadata { rule, params } => { + let mut has_precedence = false; + if let Some(precedence) = params.precedence { + has_precedence = true; + self.precedence_stack.push(precedence); + } + + let mut has_associativity = false; + if let Some(associativity) = params.associativity { + has_associativity = true; + self.associativity_stack.push(associativity); + } + + let mut has_alias = false; + if let Some(alias) = params.alias { + has_alias = true; + self.alias_stack.push(alias); + } + + if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() { + self.production.dynamic_precedence = params.dynamic_precedence; + } + + 
self.apply(*rule, at_end); + + if has_precedence { + self.precedence_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().precedence = + self.precedence_stack.last().cloned().unwrap_or(0); + } + } + + if has_associativity { + self.associativity_stack.pop(); + if !at_end { + self.production.steps.last_mut().unwrap().associativity = + self.associativity_stack.last().cloned(); + } + } + + if has_alias { + self.alias_stack.pop(); + } + } + Rule::Symbol(symbol) => { + self.production.steps.push(ProductionStep { + symbol, + precedence: self.precedence_stack.last().cloned().unwrap_or(0), + associativity: self.associativity_stack.last().cloned(), + alias: self.alias_stack.last().cloned(), + }); + } + _ => (), + } + } +} + +fn extract_choices(rule: Rule) -> Vec { + match rule { + Rule::Seq(elements) => { + let mut result = vec![Rule::Blank]; + for element in elements { + let extraction = extract_choices(element); + let mut next_result = Vec::new(); + for entry in result { + for extraction_entry in extraction.iter() { + next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()])); + } + } + result = next_result; + } + result + } + Rule::Choice(elements) => { + let mut result = Vec::new(); + for element in elements { + for rule in extract_choices(element) { + result.push(rule); + } + } + result + } + Rule::Metadata { rule, params } => extract_choices(*rule) + .into_iter() + .map(|rule| Rule::Metadata { + rule: Box::new(rule), + params: params.clone(), + }) + .collect(), + _ => vec![rule], + } +} + +fn flatten_variable(variable: Variable) -> Result { + let mut productions = Vec::new(); + for rule in extract_choices(variable.rule) { + let production = RuleFlattener::new().flatten(rule); + if !productions.contains(&production) { + productions.push(production); + } + } + Ok(SyntaxVariable { + name: variable.name, + kind: variable.kind, + productions, + }) +} + +pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result { + let mut variables = Vec::new(); + for variable in grammar.variables { + variables.push(flatten_variable(variable)?); + } + Ok(SyntaxGrammar { + extra_tokens: grammar.extra_tokens, + expected_conflicts: grammar.expected_conflicts, + variables_to_inline: grammar.variables_to_inline, + external_tokens: grammar.external_tokens, + word_token: grammar.word_token, + variables, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::VariableType; + use crate::rules::Symbol; + + #[test] + fn test_flatten_grammar() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_left( + 101, + Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_right( + 102, + Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), + ), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ]), + ), + Rule::non_terminal(7), + ]), + }) + .unwrap(); + + assert_eq!( + result.productions, + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(3)) + .with_prec(102, Some(Associativity::Right)), + ProductionStep::new(Symbol::non_terminal(4)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] + }, + Production { + dynamic_precedence: 0, + 
steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(5)) + .with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ] + }, + ] + ); + } + + #[test] + fn test_flatten_grammar_with_maximum_dynamic_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::non_terminal(1), + Rule::prec_dynamic(101, Rule::seq(vec![ + Rule::non_terminal(2), + Rule::choice(vec![ + Rule::prec_dynamic(102, Rule::seq(vec![ + Rule::non_terminal(3), + Rule::non_terminal(4) + ])), + Rule::non_terminal(5), + ]), + Rule::non_terminal(6), + ])), + Rule::non_terminal(7), + ]) + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 102, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(3)), + ProductionStep::new(Symbol::non_terminal(4)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + Production { + dynamic_precedence: 101, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::non_terminal(5)), + ProductionStep::new(Symbol::non_terminal(6)), + ProductionStep::new(Symbol::non_terminal(7)), + ], + }, + ]); + } + + #[test] + fn test_flatten_grammar_with_final_precedence() { + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + + let result = flatten_variable(Variable { + name: "test".to_string(), + kind: VariableType::Named, + rule: Rule::prec_left(101, Rule::seq(vec![ + Rule::non_terminal(1), + ])), + }).unwrap(); + + assert_eq!(result.productions, vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)), + ] + } + ]); + } +} diff --git a/cli/src/prepare_grammar/intern_symbols.rs b/cli/src/prepare_grammar/intern_symbols.rs new file mode 100644 index 00000000..2e6f5b1c --- /dev/null +++ b/cli/src/prepare_grammar/intern_symbols.rs @@ -0,0 +1,238 @@ +use super::InternedGrammar; +use crate::error::{Error, Result}; +use crate::grammars::{InputGrammar, Variable, VariableType}; +use crate::rules::{Rule, Symbol}; + +pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result { + let interner = Interner { grammar }; + + if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { + return Err(Error( + "Grammar's start rule must be visible".to_string(), + )); + } + + let mut variables = Vec::with_capacity(grammar.variables.len()); + for variable in grammar.variables.iter() { + variables.push(Variable { + name: variable.name.clone(), + kind: variable_type_for_name(&variable.name), + rule: interner.intern_rule(&variable.rule)?, + }); + } + + let mut external_tokens = 
Vec::with_capacity(grammar.external_tokens.len()); + for external_token in grammar.external_tokens.iter() { + let rule = interner.intern_rule(&external_token)?; + let (name, kind) = if let Rule::NamedSymbol(name) = external_token { + (name.clone(), variable_type_for_name(&name)) + } else { + (String::new(), VariableType::Anonymous) + }; + external_tokens.push(Variable { name, kind, rule }); + } + + let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); + for extra_token in grammar.extra_tokens.iter() { + extra_tokens.push(interner.intern_rule(extra_token)?); + } + + let mut expected_conflicts = Vec::new(); + for conflict in grammar.expected_conflicts.iter() { + let mut interned_conflict = Vec::with_capacity(conflict.len()); + for name in conflict { + interned_conflict.push( + interner + .intern_name(&name) + .ok_or_else(|| Error::undefined_symbol(name))?, + ); + } + expected_conflicts.push(interned_conflict); + } + + let mut variables_to_inline = Vec::new(); + for name in grammar.variables_to_inline.iter() { + if let Some(symbol) = interner.intern_name(&name) { + variables_to_inline.push(symbol); + } + } + + let mut word_token = None; + if let Some(name) = grammar.word_token.as_ref() { + word_token = Some( + interner + .intern_name(&name) + .ok_or_else(|| Error::undefined_symbol(&name))?, + ); + } + + Ok(InternedGrammar { + variables, + external_tokens, + extra_tokens, + expected_conflicts, + variables_to_inline, + word_token, + }) +} + +struct Interner<'a> { + grammar: &'a InputGrammar, +} + +impl<'a> Interner<'a> { + fn intern_rule(&self, rule: &Rule) -> Result { + match rule { + Rule::Choice(elements) => { + let mut result = Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Choice(result)) + } + Rule::Seq(elements) => { + let mut result = Vec::with_capacity(elements.len()); + for element in elements { + result.push(self.intern_rule(element)?); + } + Ok(Rule::Seq(result)) + } + Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))), + Rule::Metadata { rule, params } => Ok(Rule::Metadata { + rule: Box::new(self.intern_rule(rule)?), + params: params.clone(), + }), + + Rule::NamedSymbol(name) => { + if let Some(symbol) = self.intern_name(&name) { + Ok(Rule::Symbol(symbol)) + } else { + Err(Error::undefined_symbol(name)) + } + } + + _ => Ok(rule.clone()), + } + } + + fn intern_name(&self, symbol: &str) -> Option { + for (i, variable) in self.grammar.variables.iter().enumerate() { + if variable.name == symbol { + return Some(Symbol::non_terminal(i)); + } + } + + for (i, external_token) in self.grammar.external_tokens.iter().enumerate() { + if let Rule::NamedSymbol(name) = external_token { + if name == symbol { + return Some(Symbol::external(i)); + } + } + } + + return None; + } +} + +fn variable_type_for_name(name: &str) -> VariableType { + if name.starts_with("_") { + VariableType::Hidden + } else { + VariableType::Named + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_repeat_expansion() { + let grammar = intern_symbols(&build_grammar(vec![ + Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])), + Variable::named("y", Rule::named("_z")), + Variable::named("_z", Rule::string("a")), + ])) + .unwrap(); + + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "x", + Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]) + ), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", 
Rule::string("a")), + ] + ); + } + + #[test] + fn test_interning_external_token_names() { + // Variable `y` is both an internal and an external token. + // Variable `z` is just an external token. + let mut input_grammar = build_grammar(vec![ + Variable::named( + "w", + Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]), + ), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), + ]); + input_grammar + .external_tokens + .extend(vec![Rule::named("y"), Rule::named("z")]); + + let grammar = intern_symbols(&input_grammar).unwrap(); + + // Variable `y` is referred to by its internal index. + // Variable `z` is referred to by its external index. + assert_eq!( + grammar.variables, + vec![ + Variable::named( + "w", + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::external(1), + ]) + ), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), + ] + ); + + // The external token for `y` refers back to its internal index. + assert_eq!( + grammar.external_tokens, + vec![ + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), + ] + ); + } + + #[test] + fn test_grammar_with_undefined_symbols() { + let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); + + match result { + Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"), + _ => panic!("Expected an error but got none"), + } + } + + fn build_grammar(variables: Vec) -> InputGrammar { + InputGrammar { + variables, + name: "the_language".to_string(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } +} diff --git a/cli/src/prepare_grammar/mod.rs b/cli/src/prepare_grammar/mod.rs new file mode 100644 index 00000000..b0c1d2a3 --- /dev/null +++ b/cli/src/prepare_grammar/mod.rs @@ -0,0 +1,57 @@ +mod expand_repeats; +mod expand_tokens; +mod extract_simple_aliases; +mod extract_tokens; +mod flatten_grammar; +mod intern_symbols; +mod process_inlines; + +use self::expand_repeats::expand_repeats; +pub(crate) use self::expand_tokens::expand_tokens; +use self::extract_simple_aliases::extract_simple_aliases; +use self::extract_tokens::extract_tokens; +use self::flatten_grammar::flatten_grammar; +use self::intern_symbols::intern_symbols; +use self::process_inlines::process_inlines; +use crate::error::Result; +use crate::grammars::{ + ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable, +}; +use crate::rules::{AliasMap, Rule, Symbol}; + +pub(crate) struct IntermediateGrammar { + variables: Vec, + extra_tokens: Vec, + expected_conflicts: Vec>, + external_tokens: Vec, + variables_to_inline: Vec, + word_token: Option, +} + +pub(crate) type InternedGrammar = IntermediateGrammar; + +pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar; + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct ExtractedLexicalGrammar { + pub variables: Vec, + pub separators: Vec, +} + +pub(crate) fn prepare_grammar( + input_grammar: &InputGrammar, +) -> Result<( + SyntaxGrammar, + LexicalGrammar, + InlinedProductionMap, + AliasMap, +)> { + let interned_grammar = intern_symbols(input_grammar)?; + let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; + let syntax_grammar = expand_repeats(syntax_grammar); + let mut syntax_grammar = flatten_grammar(syntax_grammar)?; + let lexical_grammar = expand_tokens(lexical_grammar)?; + let 
simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
+ let inlines = process_inlines(&syntax_grammar);
+ Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases))
+}
diff --git a/cli/src/prepare_grammar/process_inlines.rs b/cli/src/prepare_grammar/process_inlines.rs
new file mode 100644
index 00000000..557b0fa4
--- /dev/null
+++ b/cli/src/prepare_grammar/process_inlines.rs
@@ -0,0 +1,479 @@
+use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar};
+use hashbrown::HashMap;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+struct ProductionStepId {
+ // A `None` value here means that the production itself was produced via inlining,
+ // and is stored in the builder's `productions` vector, as opposed to being
+ // stored in one of the grammar's variables.
+ variable_index: Option<usize>,
+ production_index: usize,
+ step_index: usize,
+}
+
+struct InlinedProductionMapBuilder {
+ production_indices_by_step_id: HashMap<ProductionStepId, Vec<usize>>,
+ productions: Vec<Production>,
+}
+
+impl InlinedProductionMapBuilder {
+ fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap {
+ let mut step_ids_to_process = Vec::new();
+ for (variable_index, variable) in grammar.variables.iter().enumerate() {
+ for production_index in 0..variable.productions.len() {
+ step_ids_to_process.push(ProductionStepId {
+ variable_index: Some(variable_index),
+ production_index,
+ step_index: 0,
+ });
+ while !step_ids_to_process.is_empty() {
+ let mut i = 0;
+ while i < step_ids_to_process.len() {
+ let step_id = step_ids_to_process[i];
+ if let Some(step) = self.production_step_for_id(step_id, grammar) {
+ if grammar.variables_to_inline.contains(&step.symbol) {
+ let inlined_step_ids = self
+ .inline_production_at_step(step_id, grammar)
+ .into_iter()
+ .cloned()
+ .map(|production_index| ProductionStepId {
+ variable_index: None,
+ production_index,
+ step_index: step_id.step_index,
+ });
+ step_ids_to_process.splice(i..i + 1, inlined_step_ids);
+ } else {
+ step_ids_to_process[i] = ProductionStepId {
+ variable_index: step_id.variable_index,
+ production_index: step_id.production_index,
+ step_index: step_id.step_index + 1,
+ };
+ i += 1;
+ }
+ } else {
+ step_ids_to_process.remove(i);
+ }
+ }
+ }
+ }
+ }
+
+ let productions = self.productions;
+ let production_indices_by_step_id = self.production_indices_by_step_id;
+ let production_map = production_indices_by_step_id
+ .into_iter()
+ .map(|(step_id, production_indices)| {
+ let production = if let Some(variable_index) = step_id.variable_index {
+ &grammar.variables[variable_index].productions[step_id.production_index]
+ } else {
+ &productions[step_id.production_index]
+ } as *const Production;
+ ((production, step_id.step_index as u32), production_indices)
+ })
+ .collect();
+
+ InlinedProductionMap {
+ productions,
+ production_map,
+ }
+ }
+
+ fn inline_production_at_step<'a>(
+ &'a mut self,
+ step_id: ProductionStepId,
+ grammar: &'a SyntaxGrammar,
+ ) -> &'a Vec<usize> {
+ // Build a list of productions produced by inlining rules.
+ let mut i = 0;
+ let step_index = step_id.step_index;
+ let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()];
+ while i < productions_to_add.len() {
+ if let Some(step) = productions_to_add[i].steps.get(step_index) {
+ let symbol = step.symbol.clone();
+ if grammar.variables_to_inline.contains(&symbol) {
+ // Remove the production from the vector, replacing it with a placeholder.
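+ // (`Vec::splice` removes the given range and splices the replacement
+ // in as a single pass; the default placeholder keeps index `i` valid
+ // while the production is temporarily moved out, and the placeholder
+ // is itself overwritten by the second splice below.)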
+ let production = productions_to_add
+ .splice(i..i + 1, [Production::default()].iter().cloned())
+ .next()
+ .unwrap();
+
+ // Replace the placeholder with the inlined productions.
+ productions_to_add.splice(
+ i..i + 1,
+ grammar.variables[symbol.index].productions.iter().map(|p| {
+ let mut production = production.clone();
+ let removed_step = production
+ .steps
+ .splice(step_index..(step_index + 1), p.steps.iter().cloned())
+ .next()
+ .unwrap();
+ let inserted_steps =
+ &mut production.steps[step_index..(step_index + p.steps.len())];
+ if let Some(alias) = removed_step.alias {
+ for inserted_step in inserted_steps.iter_mut() {
+ inserted_step.alias = Some(alias.clone());
+ }
+ }
+ if let Some(last_inserted_step) = inserted_steps.last_mut() {
+ if last_inserted_step.precedence == 0 {
+ last_inserted_step.precedence = removed_step.precedence;
+ }
+ if last_inserted_step.associativity == None {
+ last_inserted_step.associativity = removed_step.associativity;
+ }
+ }
+ production
+ }),
+ );
+
+ continue;
+ }
+ }
+ i += 1;
+ }
+
+ // Store all the computed productions, reusing an identical existing entry
+ // where possible; `unwrap_or_else` keeps the fallback push lazy.
+ let result = productions_to_add
+ .into_iter()
+ .map(|production| {
+ self.productions
+ .iter()
+ .position(|p| *p == production)
+ .unwrap_or_else(|| {
+ self.productions.push(production);
+ self.productions.len() - 1
+ })
+ })
+ .collect();
+
+ // Cache these productions based on the original production step.
+ self.production_indices_by_step_id
+ .entry(step_id)
+ .or_insert(result)
+ }
+
+ fn production_for_id<'a>(
+ &'a self,
+ id: ProductionStepId,
+ grammar: &'a SyntaxGrammar,
+ ) -> &'a Production {
+ if let Some(variable_index) = id.variable_index {
+ &grammar.variables[variable_index].productions[id.production_index]
+ } else {
+ &self.productions[id.production_index]
+ }
+ }
+
+ fn production_step_for_id<'a>(
+ &'a self,
+ id: ProductionStepId,
+ grammar: &'a SyntaxGrammar,
+ ) -> Option<&'a ProductionStep> {
+ self.production_for_id(id, grammar).steps.get(id.step_index)
+ }
+}
+
+pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap {
+ InlinedProductionMapBuilder {
+ productions: Vec::new(),
+ production_indices_by_step_id: HashMap::new(),
+ }
+ .build(grammar)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::grammars::{ProductionStep, SyntaxVariable, VariableType};
+ use crate::rules::{Associativity, Symbol};
+
+ #[test]
+ fn test_basic_inlining() {
+ let grammar = SyntaxGrammar {
+ expected_conflicts: Vec::new(),
+ extra_tokens: Vec::new(),
+ external_tokens: Vec::new(),
+ word_token: None,
+ variables_to_inline: vec![Symbol::non_terminal(1)],
+ variables: vec![
+ SyntaxVariable {
+ name: "non-terminal-0".to_string(),
+ kind: VariableType::Named,
+ productions: vec![Production {
+ dynamic_precedence: 0,
+ steps: vec![
+ ProductionStep::new(Symbol::terminal(10)),
+ ProductionStep::new(Symbol::non_terminal(1)), // inlined
+ ProductionStep::new(Symbol::terminal(11)),
+ ],
+ }],
+ },
+ SyntaxVariable {
+ name: "non-terminal-1".to_string(),
+ kind: VariableType::Named,
+ productions: vec![
+ Production {
+ dynamic_precedence: 0,
+ steps: vec![
+ ProductionStep::new(Symbol::terminal(12)),
+ ProductionStep::new(Symbol::terminal(13)),
+ ],
+ },
+ Production {
+ dynamic_precedence: 0,
+ steps: vec![ProductionStep::new(Symbol::terminal(14))],
+ },
+ ],
+ },
+ ],
+ };
+ let inline_map = process_inlines(&grammar);
+
+ // Nothing to inline at step 0.
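+ // (`inlined_productions` is looked up by the production's address plus
+ // a step index, matching the keys assembled in `build` above, so these
+ // queries pass references to the exact `Production` values stored in
+ // the grammar.)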
+ assert!(inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .is_none()); + + // Inlining variable 1 yields two productions. + assert_eq!( + inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .cloned() + .collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ], + }, + ] + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "non-terminal-3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + let inline_map = process_inlines(&grammar); + + let productions: Vec<&Production> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 1) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(16)), + ProductionStep::new(Symbol::terminal(14)), + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ] + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::terminal(13)), + ProductionStep::new(Symbol::terminal(11)), + 
ProductionStep::new(Symbol::terminal(15)), + ProductionStep::new(Symbol::terminal(12)), + ], + },] + ); + } + + #[test] + fn test_inlining_with_precedence_and_alias() { + let grammar = SyntaxGrammar { + variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)], + variables: vec![ + SyntaxVariable { + name: "non-terminal-0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // inlined + ProductionStep::new(Symbol::non_terminal(1)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // inlined + ProductionStep::new(Symbol::non_terminal(2)) + .with_alias("outer_alias", true), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-1".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)).with_prec(3, None), + ], + }], + }, + SyntaxVariable { + name: "non-terminal-2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }], + }, + ], + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = process_inlines(&grammar); + + let productions: Vec<_> = inline_map + .inlined_productions(&grammar.variables[0].productions[0], 0) + .unwrap() + .collect(); + + assert_eq!( + productions.iter().cloned().cloned().collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + // The first step in the inlined production retains its precedence + // and alias. + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + // The final step of the inlined production inherits the precedence of + // the inlined step. + ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true), + ] + }], + ); + + assert_eq!( + inline_map + .inlined_productions(productions[0], 3) + .unwrap() + .cloned() + .collect::>(), + vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(11)) + .with_prec(2, None) + .with_alias("inner_alias", true), + ProductionStep::new(Symbol::terminal(12)) + .with_prec(1, Some(Associativity::Left)), + ProductionStep::new(Symbol::terminal(10)), + // All steps of the inlined production inherit their alias from the + // inlined step. + ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true), + ] + }], + ); + } +} diff --git a/cli/src/render/mod.rs b/cli/src/render/mod.rs new file mode 100644 index 00000000..36429848 --- /dev/null +++ b/cli/src/render/mod.rs @@ -0,0 +1,1034 @@ +use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::nfa::CharacterSet; +use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use core::ops::Range; +use hashbrown::{HashMap, HashSet}; +use std::fmt::Write; +use std::mem::swap; + +macro_rules! add { + ($this: tt, $($arg: tt)*) => {{ + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + }} +} + +macro_rules! 
add_whitespace { + ($this: tt) => {{ + for _ in 0..$this.indent_level { + write!(&mut $this.buffer, " ").unwrap(); + } + }}; +} + +macro_rules! add_line { + ($this: tt, $($arg: tt)*) => { + add_whitespace!($this); + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + $this.buffer += "\n"; + } +} + +macro_rules! indent { + ($this: tt) => { + $this.indent_level += 1; + }; +} + +macro_rules! dedent { + ($this: tt) => { + $this.indent_level -= 1; + }; +} + +struct Generator { + buffer: String, + indent_level: usize, + language_name: String, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, + symbol_ids: HashMap, + alias_ids: HashMap, + external_scanner_states: Vec>, + alias_map: HashMap>, +} + +impl Generator { + fn generate(mut self) -> String { + self.add_includes(); + self.add_pragmas(); + self.add_stats(); + self.add_symbol_enum(); + self.add_symbol_names_list(); + self.add_symbol_metadata_list(); + self.add_alias_sequences(); + + let mut main_lex_table = LexTable::default(); + swap(&mut main_lex_table, &mut self.main_lex_table); + self.add_lex_function("ts_lex", main_lex_table); + + if self.keyword_capture_token.is_some() { + let mut keyword_lex_table = LexTable::default(); + swap(&mut keyword_lex_table, &mut self.keyword_lex_table); + self.add_lex_function("ts_lex_keywords", keyword_lex_table); + } + + self.add_lex_modes_list(); + + if !self.syntax_grammar.external_tokens.is_empty() { + self.add_external_token_enum(); + self.add_external_scanner_symbol_map(); + self.add_external_scanner_states_list(); + } + + self.add_parse_table(); + self.add_parser_export(); + + self.buffer + } + + fn add_includes(&mut self) { + add_line!(self, "#include "); + add_line!(self, ""); + } + + fn add_pragmas(&mut self) { + add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); + add_line!(self, "#pragma GCC diagnostic push"); + add_line!( + self, + "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"" + ); + add_line!(self, "#endif"); + add_line!(self, ""); + + // Compiling large lexer functions can be very slow, especially when + // using Visual Studio on Windows. Disabling optimizations is not + // ideal, but only a very small fraction of overall parse time is + // spent lexing, so the performance impact of this is pretty small. 
+ if self.main_lex_table.states.len() > 500 { + add_line!(self, "#ifdef _MSC_VER"); + add_line!(self, "#pragma optimize(\"\", off)"); + add_line!(self, "#endif"); + add_line!(self, ""); + } + } + + fn add_stats(&mut self) { + let token_count = self + .parse_table + .symbols + .iter() + .filter(|symbol| { + if symbol.is_terminal() || symbol.is_eof() { + true + } else if symbol.is_external() { + self.syntax_grammar.external_tokens[symbol.index] + .corresponding_internal_token + .is_none() + } else { + false + } + }) + .count(); + + let mut symbol_identifiers = HashSet::new(); + for i in 0..self.parse_table.symbols.len() { + self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); + } + + for alias_sequence in &self.parse_table.alias_sequences { + for entry in alias_sequence { + if let Some(alias) = entry { + let alias_kind = if alias.is_named { + VariableType::Named + } else { + VariableType::Anonymous + }; + let matching_symbol = self.parse_table.symbols.iter().cloned().find(|symbol| { + let (name, kind) = self.metadata_for_symbol(*symbol); + name == alias.value && kind == alias_kind + }); + let alias_id = if let Some(symbol) = matching_symbol { + self.symbol_ids[&symbol].clone() + } else if alias.is_named { + format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + }; + self.alias_ids.entry(alias.clone()).or_insert(alias_id); + self.alias_map + .entry(alias.clone()) + .or_insert(matching_symbol); + } + } + } + + add_line!(self, "#define LANGUAGE_VERSION {}", 9); + add_line!( + self, + "#define STATE_COUNT {}", + self.parse_table.states.len() + ); + add_line!( + self, + "#define SYMBOL_COUNT {}", + self.parse_table.symbols.len() + ); + add_line!( + self, + "#define ALIAS_COUNT {}", + self.alias_map.iter().filter(|e| e.1.is_none()).count() + ); + add_line!(self, "#define TOKEN_COUNT {}", token_count); + add_line!( + self, + "#define EXTERNAL_TOKEN_COUNT {}", + self.syntax_grammar.external_tokens.len() + ); + if self.parse_table.max_aliased_production_length > 0 { + add_line!( + self, + "#define MAX_ALIAS_SEQUENCE_LENGTH {}", + self.parse_table.max_aliased_production_length + ); + } + add_line!(self, ""); + } + + fn add_symbol_enum(&mut self) { + add_line!(self, "enum {{"); + indent!(self); + let mut i = 1; + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!(self, "{} = {},", self.symbol_ids[&symbol], i); + i += 1; + } + } + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!(self, "{} = {},", self.alias_ids[&alias], i); + } + i += 1; + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_symbol_names_list(&mut self) { + add_line!(self, "static const char *ts_symbol_names[] = {{"); + indent!(self); + for symbol in self.parse_table.symbols.iter() { + let name = self.sanitize_string( + self.simple_aliases + .get(symbol) + .map(|alias| alias.value.as_str()) + .unwrap_or(self.metadata_for_symbol(*symbol).0), + ); + add_line!(self, "[{}] = \"{}\",", self.symbol_ids[&symbol], name); + } + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!( + self, + "[{}] = \"{}\",", + self.alias_ids[&alias], + self.sanitize_string(&alias.value) + ); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_symbol_metadata_list(&mut self) { + add_line!( + self, + "static const TSSymbolMetadata ts_symbol_metadata[] = {{" + ); + indent!(self); + for symbol 
in &self.parse_table.symbols { + add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); + indent!(self); + if let Some(Alias { is_named, .. }) = self.simple_aliases.get(symbol) { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", is_named); + } else { + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } + } + } + dedent!(self); + add_line!(self, "}},"); + } + for (alias, matching_symbol) in &self.alias_map { + if matching_symbol.is_none() { + add_line!(self, "[{}] = {{", self.alias_ids[&alias]); + indent!(self); + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", alias.is_named); + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_alias_sequences(&mut self) { + add_line!( + self, + "static TSSymbol ts_alias_sequences[{}][MAX_ALIAS_SEQUENCE_LENGTH] = {{", + self.parse_table.alias_sequences.len() + ); + indent!(self); + for (i, sequence) in self.parse_table.alias_sequences.iter().enumerate().skip(1) { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (j, alias) in sequence.iter().enumerate() { + if let Some(alias) = alias { + add_line!(self, "[{}] = {},", j, self.alias_ids[&alias]); + } + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { + add_line!( + self, + "static bool {}(TSLexer *lexer, TSStateId state) {{", + name + ); + indent!(self); + add_line!(self, "START_LEXER();"); + add_line!(self, "switch (state) {{"); + indent!(self); + + for (i, state) in lex_table.states.into_iter().enumerate() { + add_line!(self, "case {}:", i); + indent!(self); + self.add_lex_state(i, state); + dedent!(self); + } + + add_line!(self, "default:"); + indent!(self); + add_line!(self, "return false;"); + dedent!(self); + + dedent!(self); + add_line!(self, "}}"); + dedent!(self); + add_line!(self, "}}"); + add_line!(self, ""); + } + + fn add_lex_state(&mut self, index: usize, state: LexState) { + if let Some(accept_action) = state.accept_action { + add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); + } + + let mut ruled_out_characters = HashSet::new(); + for (characters, action) in state.advance_actions { + let previous_length = self.buffer.len(); + + add_whitespace!(self); + add!(self, "if ("); + if self.add_character_set_condition(&characters, &ruled_out_characters) { + add!(self, ")\n"); + indent!(self); + self.add_advance_action(index, &action); + if let CharacterSet::Include(chars) = characters { + ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); + } + dedent!(self); + } else { + self.buffer.truncate(previous_length); + self.add_advance_action(index, &action); + } + } + + add_line!(self, "END_STATE();"); + } + + fn add_character_set_condition( + &mut self, + characters: &CharacterSet, + ruled_out_characters: &HashSet, + ) -> bool { + match characters { + CharacterSet::Include(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + 
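+ // Characters already matched by earlier conditions in this lex state
+ // are omitted, which lets `get_ranges` coalesce the remaining
+ // characters into fewer, wider ranges (see `test_get_char_ranges`
+ // at the bottom of this file).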
self.add_character_range_conditions(ranges, false)
+ }
+ CharacterSet::Exclude(chars) => {
+ let ranges = Some('\0'..'\0')
+ .into_iter()
+ .chain(Self::get_ranges(chars, ruled_out_characters));
+ self.add_character_range_conditions(ranges, true)
+ }
+ }
+ }
+
+ fn add_character_range_conditions(
+ &mut self,
+ ranges: impl Iterator<Item = Range<char>>,
+ is_negated: bool,
+ ) -> bool {
+ let line_break = "\n ";
+ let mut did_add = false;
+ for range in ranges {
+ if is_negated {
+ if did_add {
+ add!(self, " &&{}", line_break);
+ }
+ if range.end == range.start {
+ add!(self, "lookahead != ");
+ self.add_character(range.start);
+ } else if range.end as u32 == range.start as u32 + 1 {
+ add!(self, "lookahead != ");
+ self.add_character(range.start);
+ add!(self, " &&{}lookahead != ", line_break);
+ self.add_character(range.end);
+ } else {
+ add!(self, "(lookahead < ");
+ self.add_character(range.start);
+ add!(self, " || ");
+ self.add_character(range.end);
+ add!(self, " < lookahead)");
+ }
+ } else {
+ if did_add {
+ add!(self, " ||{}", line_break);
+ }
+ if range.end == range.start {
+ add!(self, "lookahead == ");
+ self.add_character(range.start);
+ } else if range.end as u32 == range.start as u32 + 1 {
+ add!(self, "lookahead == ");
+ self.add_character(range.start);
+ add!(self, " ||{}lookahead == ", line_break);
+ self.add_character(range.end);
+ } else {
+ add!(self, "(");
+ self.add_character(range.start);
+ add!(self, " <= lookahead && lookahead <= ");
+ self.add_character(range.end);
+ add!(self, ")");
+ }
+ }
+ did_add = true;
+ }
+ did_add
+ }
+
+ fn get_ranges<'a>(
+ chars: &'a Vec<char>,
+ ruled_out_characters: &'a HashSet<u32>,
+ ) -> impl Iterator<Item = Range<char>> + 'a {
+ let mut prev_range: Option<Range<char>> = None;
+ chars
+ .iter()
+ .map(|c| (*c, false))
+ .chain(Some(('\0', true)))
+ .filter_map(move |(c, done)| {
+ if done {
+ return prev_range.clone();
+ }
+ if ruled_out_characters.contains(&(c as u32)) {
+ return None;
+ }
+ if let Some(range) = prev_range.clone() {
+ let mut prev_range_successor = range.end as u32 + 1;
+ while prev_range_successor < c as u32 {
+ if !ruled_out_characters.contains(&prev_range_successor) {
+ prev_range = Some(c..c);
+ return Some(range);
+ }
+ prev_range_successor += 1;
+ }
+ prev_range = Some(range.start..c);
+ None
+ } else {
+ prev_range = Some(c..c);
+ None
+ }
+ })
+ }
+
+ fn add_advance_action(&mut self, index: usize, action: &AdvanceAction) {
+ if action.in_main_token {
+ add_line!(self, "ADVANCE({});", action.state.unwrap_or(index));
+ } else {
+ add_line!(self, "SKIP({});", action.state.unwrap_or(index));
+ }
+ }
+
+ fn add_lex_modes_list(&mut self) {
+ self.get_external_scanner_state_id(HashSet::new());
+
+ let mut external_tokens_by_corresponding_internal_token = HashMap::new();
+ for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() {
+ if let Some(symbol) = external_token.corresponding_internal_token {
+ external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
+ }
+ }
+
+ add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
+ indent!(self);
+ for i in 0..self.parse_table.states.len() {
+ let mut external_tokens = HashSet::new();
+ for token in self.parse_table.states[i].terminal_entries.keys() {
+ if token.is_external() {
+ external_tokens.insert(token.index);
+ } else if token.is_terminal() {
+ if let Some(external_index) =
+ external_tokens_by_corresponding_internal_token.get(&token.index)
+ {
+ external_tokens.insert(*external_index);
+ }
+ }
+ }
+
+ let external_state_id =
self.get_external_scanner_state_id(external_tokens); + let state = &self.parse_table.states[i]; + if external_state_id > 0 { + add_line!( + self, + "[{}] = {{.lex_state = {}, .external_lex_state = {}}},", + i, + state.lex_state_id, + external_state_id + ); + } else { + add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_token_enum(&mut self) { + add_line!(self, "enum {{"); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "{} = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + i + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_symbol_map(&mut self) { + add_line!( + self, + "static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {{" + ); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + let token = &self.syntax_grammar.external_tokens[i]; + let id_token = token.corresponding_internal_token.unwrap_or(Symbol::external(i)); + add_line!( + self, + "[{}] = {},", + self.external_token_id(&token), + self.symbol_ids[&id_token], + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_states_list(&mut self) { + add_line!( + self, + "static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{", + self.external_scanner_states.len(), + ); + indent!(self); + for i in 0..self.external_scanner_states.len() { + if !self.external_scanner_states[i].is_empty() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for token_index in &self.external_scanner_states[i] { + add_line!( + self, + "[{}] = true,", + self.external_token_id(&self.syntax_grammar.external_tokens[*token_index]) + ); + } + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parse_table(&mut self) { + let mut parse_table_entries = Vec::new(); + let mut next_parse_action_list_index = 0; + + self.get_parse_action_list_id( + &ParseTableEntry { + actions: Vec::new(), + reusable: false, + }, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + + add_line!( + self, + "static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{" + ); + indent!(self); + for (i, state) in self.parse_table.states.iter().enumerate() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (symbol, state_id) in &state.nonterminal_entries { + add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id); + } + for (symbol, entry) in &state.terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!( + self, + "[{}] = ACTIONS({}),", + self.symbol_ids[symbol], + entry_id + ); + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + + self.add_parse_action_list(parse_table_entries); + } + + fn add_parse_action_list(&mut self, parse_table_entries: Vec<(usize, ParseTableEntry)>) { + add_line!(self, "static TSParseActionEntry ts_parse_actions[] = {{"); + indent!(self); + for (i, entry) in parse_table_entries { + add!( + self, + " [{}] = {{.count = {}, .reusable = {}}},", + i, + entry.actions.len(), + entry.reusable + ); + for action in entry.actions { + add!(self, " "); + match action { + ParseAction::Accept => add!(self, " ACCEPT_INPUT()"), + ParseAction::Recover 
=> add!(self, "RECOVER()"), + ParseAction::ShiftExtra => add!(self, "SHIFT_EXTRA()"), + ParseAction::Shift { + state, + is_repetition, + } => { + if is_repetition { + add!(self, "SHIFT_REPEAT({})", state); + } else { + add!(self, "SHIFT({})", state); + } + } + ParseAction::Reduce { + symbol, + child_count, + dynamic_precedence, + alias_sequence_id, + .. + } => { + add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); + if dynamic_precedence != 0 { + add!(self, ", .dynamic_precedence = {}", dynamic_precedence); + } + if alias_sequence_id != 0 { + add!(self, ", .alias_sequence_id = {}", alias_sequence_id); + } + add!(self, ")"); + } + } + add!(self, ",") + } + add!(self, "\n"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parser_export(&mut self) { + let language_function_name = format!("tree_sitter_{}", self.language_name); + let external_scanner_name = format!("{}_external_scanner", language_function_name); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, "void *{}_create();", external_scanner_name); + add_line!(self, "void {}_destroy(void *);", external_scanner_name); + add_line!( + self, + "bool {}_scan(void *, TSLexer *, const bool *);", + external_scanner_name + ); + add_line!( + self, + "unsigned {}_serialize(void *, char *);", + external_scanner_name + ); + add_line!( + self, + "void {}_deserialize(void *, const char *, unsigned);", + external_scanner_name + ); + add_line!(self, ""); + } + + add_line!(self, "#ifdef _WIN32"); + add_line!(self, "#define extern __declspec(dllexport)"); + add_line!(self, "#endif"); + add_line!(self, ""); + + add_line!( + self, + "extern const TSLanguage *{}() {{", + language_function_name + ); + indent!(self); + add_line!(self, "static TSLanguage language = {{"); + indent!(self); + add_line!(self, ".version = LANGUAGE_VERSION,"); + add_line!(self, ".symbol_count = SYMBOL_COUNT,"); + add_line!(self, ".alias_count = ALIAS_COUNT,"); + add_line!(self, ".token_count = TOKEN_COUNT,"); + add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); + add_line!( + self, + ".parse_table = (const unsigned short *)ts_parse_table," + ); + add_line!(self, ".parse_actions = ts_parse_actions,"); + add_line!(self, ".lex_modes = ts_lex_modes,"); + add_line!(self, ".symbol_names = ts_symbol_names,"); + add_line!( + self, + ".alias_sequences = (const TSSymbol *)ts_alias_sequences," + ); + + add_line!( + self, + ".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH," + ); + add_line!(self, ".lex_fn = ts_lex,"); + + if let Some(keyword_capture_token) = self.keyword_capture_token { + add_line!(self, ".keyword_lex_fn = ts_lex_keywords,"); + add_line!( + self, + ".keyword_capture_token = {},", + self.symbol_ids[&keyword_capture_token] + ); + } + + add_line!(self, ".external_token_count = EXTERNAL_TOKEN_COUNT,"); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, ".external_scanner = {{"); + indent!(self); + add_line!(self, "(const bool *)ts_external_scanner_states,"); + add_line!(self, "ts_external_scanner_symbol_map,"); + add_line!(self, "{}_create,", external_scanner_name); + add_line!(self, "{}_destroy,", external_scanner_name); + add_line!(self, "{}_scan,", external_scanner_name); + add_line!(self, "{}_serialize,", external_scanner_name); + add_line!(self, "{}_deserialize,", external_scanner_name); + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + + add_line!(self, "}};"); + add_line!(self, "return &language;"); + dedent!(self); + add_line!(self, "}}"); + } 
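+
+ // Parse-action lists are deduplicated: if an identical entry was
+ // already recorded, its existing index into `ts_parse_actions` is
+ // reused. Each entry occupies one header slot (count and reusability)
+ // followed by one slot per action, which is why the index advances by
+ // `1 + entry.actions.len()`.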
+ + fn get_parse_action_list_id( + &self, + entry: &ParseTableEntry, + parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: &mut usize, + ) -> usize { + if let Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { + return *index; + } + + let result = *next_parse_action_list_index; + parse_table_entries.push((result, entry.clone())); + *next_parse_action_list_index += 1 + entry.actions.len(); + result + } + + fn get_external_scanner_state_id(&mut self, external_tokens: HashSet) -> usize { + self.external_scanner_states + .iter() + .position(|tokens| *tokens == external_tokens) + .unwrap_or_else(|| { + self.external_scanner_states.push(external_tokens); + self.external_scanner_states.len() - 1 + }) + } + + fn external_token_id(&self, token: &ExternalToken) -> String { + format!( + "ts_external_token_{}", + self.sanitize_identifier(&token.name) + ) + } + + fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { + let mut id; + if symbol == Symbol::end() { + id = "ts_builtin_sym_end".to_string(); + } else { + let (name, kind) = self.metadata_for_symbol(symbol); + id = match kind { + VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), + VariableType::Hidden | VariableType::Named => { + format!("sym_{}", self.sanitize_identifier(name)) + } + }; + + let mut suffix_number = 1; + let mut suffix = String::new(); + while used_identifiers.contains(&id) { + id.drain(id.len() - suffix.len()..); + suffix_number += 1; + suffix = suffix_number.to_string(); + id += &suffix; + } + } + + used_identifiers.insert(id.clone()); + self.symbol_ids.insert(symbol, id); + } + + fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { + match symbol.kind { + SymbolType::End => ("end", VariableType::Hidden), + SymbolType::NonTerminal => { + let variable = &self.syntax_grammar.variables[symbol.index]; + (&variable.name, variable.kind) + } + SymbolType::Terminal => { + let variable = &self.lexical_grammar.variables[symbol.index]; + (&variable.name, variable.kind) + } + SymbolType::External => { + let token = &self.syntax_grammar.external_tokens[symbol.index]; + (&token.name, token.kind) + } + } + } + + fn sanitize_identifier(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + { + result.push(c); + } else { + let replacement = match c { + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' 
=> "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + _ => continue, + }; + if !result.is_empty() && !result.ends_with("_") { + result.push('_'); + } + result += replacement; + } + } + result + } + + fn sanitize_string(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ['\\', '\n', '\r', '\"'].contains(&c) { + result.push('\\'); + } + result.push(c); + } + result + } + + fn add_character(&mut self, c: char) { + if c.is_ascii() { + match c { + '\0' => add!(self, "0"), + '\'' => add!(self, "'\\''"), + '\\' => add!(self, "'\\\\'"), + '\t' => add!(self, "'\\t'"), + '\n' => add!(self, "'\\n'"), + '\r' => add!(self, "'\\r'"), + _ => add!(self, "'{}'", c), + } + } else { + add!(self, "{}", c as u32) + } + } +} + +pub(crate) fn render_c_code( + name: &str, + parse_table: ParseTable, + main_lex_table: LexTable, + keyword_lex_table: LexTable, + keyword_capture_token: Option, + syntax_grammar: SyntaxGrammar, + lexical_grammar: LexicalGrammar, + simple_aliases: AliasMap, +) -> String { + Generator { + buffer: String::new(), + indent_level: 0, + language_name: name.to_string(), + parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + syntax_grammar, + lexical_grammar, + simple_aliases, + symbol_ids: HashMap::new(), + alias_ids: HashMap::new(), + external_scanner_states: Vec::new(), + alias_map: HashMap::new(), + } + .generate() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_char_ranges() { + struct Row { + chars: Vec, + ruled_out_chars: Vec, + expected_ranges: Vec>, + } + + let table = [ + Row { + chars: vec!['a'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'a'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'z'], + ruled_out_chars: vec![], + expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'], + }, + Row { + chars: vec!['a', 'b', 'c', 'e', 'h', 'z'], + ruled_out_chars: vec!['d', 'f', 'g'], + expected_ranges: vec!['a'..'h', 'z'..'z'], + }, + ]; + + for Row { + chars, + ruled_out_chars, + expected_ranges, + } in table.iter() + { + let ruled_out_chars = ruled_out_chars + .into_iter() + .map(|c: &char| *c as u32) + .collect(); + let ranges = Generator::get_ranges(chars, &ruled_out_chars).collect::>(); + assert_eq!(ranges, *expected_ranges); + } + } +} diff --git a/cli/src/rules.rs b/cli/src/rules.rs new file mode 100644 index 00000000..e15070ea --- /dev/null +++ b/cli/src/rules.rs @@ -0,0 +1,234 @@ +use hashbrown::HashMap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) enum SymbolType { + External, + End, + Terminal, + NonTerminal, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) enum Associativity { + Left, + Right, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) struct Alias { + pub value: String, + pub is_named: bool, +} + +pub(crate) type AliasMap = HashMap; + +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub(crate) struct MetadataParams { + pub precedence: Option, + pub dynamic_precedence: i32, + pub associativity: Option, + pub is_token: bool, + pub is_string: bool, + pub is_active: bool, + pub is_main_token: bool, + pub alias: Option, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) struct Symbol { + pub kind: SymbolType, + pub index: usize, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub(crate) enum Rule { + Blank, + String(String), + Pattern(String), + NamedSymbol(String), + 
Symbol(Symbol), + Choice(Vec), + Metadata { + params: MetadataParams, + rule: Box, + }, + Repeat(Box), + Seq(Vec), +} + +impl Rule { + pub fn alias(content: Rule, value: String, is_named: bool) -> Self { + add_metadata(content, move |params| { + params.alias = Some(Alias { + is_named, + value + }); + }) + } + + pub fn token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + }) + } + + pub fn immediate_token(content: Rule) -> Self { + add_metadata(content, |params| { + params.is_token = true; + params.is_main_token = true; + }) + } + + pub fn prec(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.precedence = Some(value); + }) + } + + pub fn prec_left(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Left); + params.precedence = Some(value); + }) + } + + pub fn prec_right(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.associativity = Some(Associativity::Right); + params.precedence = Some(value); + }) + } + + pub fn prec_dynamic(value: i32, content: Rule) -> Self { + add_metadata(content, |params| { + params.dynamic_precedence = value; + }) + } + + pub fn repeat(rule: Rule) -> Self { + Rule::Repeat(Box::new(rule)) + } + + pub fn choice(rules: Vec) -> Self { + let mut elements = Vec::with_capacity(rules.len()); + for rule in rules { + choice_helper(&mut elements, rule); + } + Rule::Choice(elements) + } + + pub fn seq(rules: Vec) -> Self { + Rule::Seq(rules) + } +} + +#[cfg(test)] +impl Rule { + pub fn terminal(index: usize) -> Self { + Rule::Symbol(Symbol::terminal(index)) + } + + pub fn non_terminal(index: usize) -> Self { + Rule::Symbol(Symbol::non_terminal(index)) + } + + pub fn external(index: usize) -> Self { + Rule::Symbol(Symbol::external(index)) + } + + pub fn named(name: &'static str) -> Self { + Rule::NamedSymbol(name.to_string()) + } + + pub fn string(value: &'static str) -> Self { + Rule::String(value.to_string()) + } + + pub fn pattern(value: &'static str) -> Self { + Rule::Pattern(value.to_string()) + } +} + +impl Symbol { + pub fn is_terminal(&self) -> bool { + self.kind == SymbolType::Terminal + } + + pub fn is_non_terminal(&self) -> bool { + self.kind == SymbolType::NonTerminal + } + + pub fn is_external(&self) -> bool { + self.kind == SymbolType::External + } + + pub fn is_eof(&self) -> bool { + self.kind == SymbolType::End + } + + pub fn non_terminal(index: usize) -> Self { + Symbol { + kind: SymbolType::NonTerminal, + index, + } + } + + pub fn terminal(index: usize) -> Self { + Symbol { + kind: SymbolType::Terminal, + index, + } + } + + pub fn external(index: usize) -> Self { + Symbol { + kind: SymbolType::External, + index, + } + } + + pub fn end() -> Self { + Symbol { + kind: SymbolType::End, + index: 0, + } + } +} + +impl From for Rule { + fn from(symbol: Symbol) -> Self { + Rule::Symbol(symbol) + } +} + +fn add_metadata(input: Rule, f: T) -> Rule { + match input { + Rule::Metadata { rule, mut params } => { + f(&mut params); + Rule::Metadata { rule, params } + } + _ => { + let mut params = MetadataParams::default(); + f(&mut params); + Rule::Metadata { + rule: Box::new(input), + params, + } + } + } +} + +fn choice_helper(result: &mut Vec, rule: Rule) { + match rule { + Rule::Choice(elements) => { + for element in elements { + choice_helper(result, element); + } + } + _ => { + if !result.contains(&rule) { + result.push(rule); + } + } + } +} diff --git a/cli/src/tables.rs b/cli/src/tables.rs new file mode 
100644 index 00000000..edbbaaab --- /dev/null +++ b/cli/src/tables.rs @@ -0,0 +1,140 @@ +use crate::nfa::CharacterSet;
+use crate::rules::{Alias, Associativity, Symbol};
+use hashbrown::HashMap;
+
+pub(crate) type AliasSequenceId = usize;
+pub(crate) type ParseStateId = usize;
+pub(crate) type LexStateId = usize;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum ParseAction {
+ Accept,
+ Shift {
+ state: ParseStateId,
+ is_repetition: bool,
+ },
+ ShiftExtra,
+ Recover,
+ Reduce {
+ symbol: Symbol,
+ child_count: usize,
+ precedence: i32,
+ dynamic_precedence: i32,
+ associativity: Option<Associativity>,
+ alias_sequence_id: AliasSequenceId,
+ },
+}
+
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub(crate) struct ParseTableEntry {
+ pub actions: Vec<ParseAction>,
+ pub reusable: bool,
+}
+
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub(crate) struct ParseState {
+ pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
+ pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
+ pub lex_state_id: usize,
+ pub unfinished_item_signature: u64,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) struct ParseTable {
+ pub states: Vec<ParseState>,
+ pub symbols: Vec<Symbol>,
+ pub alias_sequences: Vec<Vec<Option<Alias>>>,
+ pub max_aliased_production_length: usize,
+}
+
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub(crate) struct AdvanceAction {
+ pub state: Option<LexStateId>,
+ pub in_main_token: bool,
+}
+
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+pub(crate) struct LexState {
+ pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
+ pub accept_action: Option<Symbol>,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) struct LexTable {
+ pub states: Vec<LexState>,
+}
+
+impl ParseTableEntry {
+ pub fn new() -> Self {
+ Self {
+ reusable: true,
+ actions: Vec::new(),
+ }
+ }
+}
+
+impl Default for LexTable {
+ fn default() -> Self {
+ LexTable { states: Vec::new() }
+ }
+}
+
+impl ParseState {
+ pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
+ self.terminal_entries
+ .iter()
+ .flat_map(|(_, entry)| {
+ entry.actions.iter().filter_map(|action| match action {
+ ParseAction::Shift { state, .. } => Some(*state),
+ _ => None,
+ })
+ })
+ .chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
+ }
+
+ pub fn update_referenced_states<F>(&mut self, mut f: F)
+ where
+ F: FnMut(usize, &ParseState) -> usize,
+ {
+ let mut updates = Vec::new();
+ for (symbol, entry) in &self.terminal_entries {
+ for (i, action) in entry.actions.iter().enumerate() {
+ if let ParseAction::Shift { state, .. } = action {
+ let result = f(*state, self);
+ if result != *state {
+ updates.push((*symbol, i, result));
+ }
+ }
+ }
+ }
+ for (symbol, other_state) in &self.nonterminal_entries {
+ let result = f(*other_state, self);
+ if result != *other_state {
+ updates.push((*symbol, 0, result));
+ }
+ }
+ for (symbol, action_index, new_state) in updates {
+ if symbol.is_non_terminal() {
+ self.nonterminal_entries.insert(symbol, new_state);
+ } else {
+ let entry = self.terminal_entries.get_mut(&symbol).unwrap();
+ if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
+ entry.actions[action_index] = ParseAction::Shift {
+ state: new_state,
+ is_repetition,
+ };
+ }
+ }
+ }
+ }
+}
+
+impl ParseAction {
+ pub fn precedence(&self) -> i32 {
+ if let ParseAction::Reduce { precedence, ..
} = self { + *precedence + } else { + 0 + } + } +} diff --git a/externals/bandit b/externals/bandit deleted file mode 160000 index bfdb8a33..00000000 --- a/externals/bandit +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bfdb8a3322a2e54b11aea64d84f9788d83477e83 diff --git a/externals/crypto-algorithms b/externals/crypto-algorithms deleted file mode 160000 index c7e5c23a..00000000 --- a/externals/crypto-algorithms +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c7e5c23ab04ecfb5465cbefbe17ba23d4cb3bc9d diff --git a/externals/gyp b/externals/gyp deleted file mode 160000 index e0ee72dd..00000000 --- a/externals/gyp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e0ee72ddc7fb97eb33d530cf684efcbe4d27ecb3 diff --git a/externals/json-parser b/externals/json-parser deleted file mode 160000 index 70533215..00000000 --- a/externals/json-parser +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 70533215eea575e40a0b91a34ae01a779641d73a diff --git a/lib/Cargo.toml b/lib/Cargo.toml new file mode 100644 index 00000000..e71d0c21 --- /dev/null +++ b/lib/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "tree-sitter" +description = "Rust bindings to the Tree-sitter parsing library" +version = "0.3.5" +authors = ["Max Brunsfeld "] +license = "MIT" +readme = "README.md" +keywords = ["incremental", "parsing"] +categories = ["api-bindings", "parsing", "text-editors"] + +include = [ + "/build.rs", + "/Cargo.toml", + "/LICENSE", + "/README.md", + "/src/*", + "/core/tree-sitter/externals/utf8proc/utf8proc*", + "/core/tree-sitter/include/*", + "/core/tree-sitter/src/runtime/*", +] + +[dependencies] +regex = "1" +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" + +[build-dependencies] +cc = "1.0" + +[lib] +path = "binding/lib.rs" diff --git a/lib/README.md b/lib/README.md new file mode 100644 index 00000000..449c6c46 --- /dev/null +++ b/lib/README.md @@ -0,0 +1,98 @@ +Rust Tree-sitter +=========================== + +[![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter) +[![Build status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master) +[![Crates.io](https://img.shields.io/crates/v/tree-sitter.svg)](https://crates.io/crates/tree-sitter) + +Rust bindings to the [Tree-sitter][] parsing library. + +### Basic Usage + +First, create a parser: + +```rust +use tree_sitter::{Parser, Language}; + +// ... + +let mut parser = Parser::new(); +``` + +Then assign a language to the parser. Tree-sitter languages consist of generated C code. To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`: + +```rust +extern "C" { fn tree_sitter_c() -> Language; } +extern "C" { fn tree_sitter_rust() -> Language; } +extern "C" { fn tree_sitter_javascript() -> Language; } + +let language = unsafe { tree_sitter_rust() }; +parser.set_language(language).unwrap(); +``` + +Now you can parse source code: + +```rust +let source_code = "fn test() {}"; +let tree = parser.parse_str(source_code, None); +let root_node = tree.root_node(); + +assert_eq!(root_node.kind(), "source_file"); +assert_eq!(root_node.start_position().column, 0); +assert_eq!(root_node.end_position().column, 12); +``` + +### Editing + +Once you have a syntax tree, you can update it when your source code changes. 
Passing in the previously edited tree makes `parse` run much more quickly:
+
+```rust
+let new_source_code = "fn test(a: u32) {}";
+
+tree.edit(&InputEdit {
+ start_byte: 8,
+ old_end_byte: 8,
+ new_end_byte: 14,
+ start_position: Point::new(0, 8),
+ old_end_position: Point::new(0, 8),
+ new_end_position: Point::new(0, 14),
+});
+
+let new_tree = parser.parse_str(new_source_code, Some(&tree));
+```
+
+### Text Input
+
+The source code to parse can be provided either as a string or as a function that returns text encoded as either UTF8 or UTF16:
+
+```rust
+// Store some source code in an array of lines.
+let lines = &[
+ "pub fn foo() {",
+ " 1",
+ "}",
+];
+
+// Parse the source code using a custom callback. The callback is called
+// with both a byte offset and a row/column offset.
+let tree = parser.parse_utf8(&mut |_byte: usize, position: Point| -> &[u8] {
+ let row = position.row as usize;
+ let column = position.column as usize;
+ if row < lines.len() {
+ if column < lines[row].as_bytes().len() {
+ &lines[row].as_bytes()[column..]
+ } else {
+ "\n".as_bytes()
+ }
+ } else {
+ &[]
+ }
+}, None).unwrap();
+
+assert_eq!(
+ tree.root_node().to_sexp(),
+ "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"
+);
+```
+
+[tree-sitter]: https://github.com/tree-sitter/tree-sitter diff --git a/lib/binding/bindings.rs b/lib/binding/bindings.rs new file mode 100644 index 00000000..58d0e510 --- /dev/null +++ b/lib/binding/bindings.rs @@ -0,0 +1,310 @@ +/* automatically generated by rust-bindgen */
+
+pub type FILE = [u64; 19usize];
+pub type TSSymbol = u16;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSLanguage {
+ _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSParser {
+ _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSTree {
+ _unused: [u8; 0],
+}
+pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0;
+pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1;
+pub type TSInputEncoding = u32;
+pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0;
+pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1;
+pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2;
+pub type TSSymbolType = u32;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSPoint {
+ pub row: u32,
+ pub column: u32,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSRange {
+ pub start_point: TSPoint,
+ pub end_point: TSPoint,
+ pub start_byte: u32,
+ pub end_byte: u32,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSInput {
+ pub payload: *mut ::std::os::raw::c_void,
+ pub read: ::std::option::Option<
+ unsafe extern "C" fn(
+ payload: *mut ::std::os::raw::c_void,
+ byte_index: u32,
+ position: TSPoint,
+ bytes_read: *mut u32,
+ ) -> *const ::std::os::raw::c_char,
+ >,
+ pub encoding: TSInputEncoding,
+}
+pub const TSLogType_TSLogTypeParse: TSLogType = 0;
+pub const TSLogType_TSLogTypeLex: TSLogType = 1;
+pub type TSLogType = u32;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSLogger {
+ pub payload: *mut ::std::os::raw::c_void,
+ pub log: ::std::option::Option<
+ unsafe extern "C" fn(
+ payload: *mut ::std::os::raw::c_void,
+ arg1: TSLogType,
+ arg2: *const ::std::os::raw::c_char,
+ ),
+ >,
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TSInputEdit {
+ pub start_byte: u32,
+ pub old_end_byte: u32,
+ pub new_end_byte: u32,
+ pub start_point: TSPoint,
+ pub old_end_point: TSPoint,
+ pub new_end_point: TSPoint,
+}
+#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSNode { + pub context: [u32; 4usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const TSTree, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct TSTreeCursor { + pub context: [u32; 2usize], + pub id: *const ::std::os::raw::c_void, + pub tree: *const ::std::os::raw::c_void, +} +extern "C" { + pub fn ts_parser_new() -> *mut TSParser; +} +extern "C" { + pub fn ts_parser_delete(arg1: *mut TSParser); +} +extern "C" { + pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage; +} +extern "C" { + pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool; +} +extern "C" { + pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger; +} +extern "C" { + pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger); +} +extern "C" { + pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE); +} +extern "C" { + pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree; +} +extern "C" { + pub fn ts_parser_parse_string( + arg1: *mut TSParser, + arg2: *const TSTree, + arg3: *const ::std::os::raw::c_char, + arg4: u32, + ) -> *mut TSTree; +} +extern "C" { + pub fn ts_parser_enabled(arg1: *const TSParser) -> bool; +} +extern "C" { + pub fn ts_parser_set_enabled(arg1: *mut TSParser, arg2: bool); +} +extern "C" { + pub fn ts_parser_operation_limit(arg1: *const TSParser) -> usize; +} +extern "C" { + pub fn ts_parser_set_operation_limit(arg1: *mut TSParser, arg2: usize); +} +extern "C" { + pub fn ts_parser_reset(arg1: *mut TSParser); +} +extern "C" { + pub fn ts_parser_set_included_ranges(arg1: *mut TSParser, arg2: *const TSRange, arg3: u32); +} +extern "C" { + pub fn ts_parser_included_ranges(arg1: *const TSParser, arg2: *mut u32) -> *const TSRange; +} +extern "C" { + pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree; +} +extern "C" { + pub fn ts_tree_delete(arg1: *mut TSTree); +} +extern "C" { + pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode; +} +extern "C" { + pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit); +} +extern "C" { + pub fn ts_tree_get_changed_ranges( + arg1: *const TSTree, + arg2: *const TSTree, + arg3: *mut u32, + ) -> *mut TSRange; +} +extern "C" { + pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE); +} +extern "C" { + pub fn ts_tree_language(arg1: *const TSTree) -> *const TSLanguage; +} +extern "C" { + pub fn ts_node_start_byte(arg1: TSNode) -> u32; +} +extern "C" { + pub fn ts_node_start_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + pub fn ts_node_end_byte(arg1: TSNode) -> u32; +} +extern "C" { + pub fn ts_node_end_point(arg1: TSNode) -> TSPoint; +} +extern "C" { + pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol; +} +extern "C" { + pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char; +} +extern "C" { + pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_is_null(arg1: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_is_named(arg1: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_is_missing(arg1: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_has_changes(arg1: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_has_error(arg1: TSNode) -> bool; +} +extern "C" { + pub fn ts_node_parent(arg1: TSNode) -> TSNode; +} +extern "C" { + pub fn 
ts_node_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_child_count(arg1: TSNode) -> u32; +} +extern "C" { + pub fn ts_node_named_child_count(arg1: TSNode) -> u32; +} +extern "C" { + pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode; +} +extern "C" { + pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode; +} +extern "C" { + pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint) + -> TSNode; +} +extern "C" { + pub fn ts_node_named_descendant_for_point_range( + arg1: TSNode, + arg2: TSPoint, + arg3: TSPoint, + ) -> TSNode; +} +extern "C" { + pub fn ts_node_edit(arg1: *mut TSNode, arg2: *const TSInputEdit); +} +extern "C" { + pub fn ts_tree_cursor_new(arg1: TSNode) -> TSTreeCursor; +} +extern "C" { + pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor); +} +extern "C" { + pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64; +} +extern "C" { + pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool; +} +extern "C" { + pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode; +} +extern "C" { + pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32; +} +extern "C" { + pub fn ts_language_symbol_name( + arg1: *const TSLanguage, + arg2: TSSymbol, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn ts_language_symbol_for_name( + arg1: *const TSLanguage, + arg2: *const ::std::os::raw::c_char, + ) -> TSSymbol; +} +extern "C" { + pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType; +} +extern "C" { + pub fn ts_language_version(arg1: *const TSLanguage) -> u32; +} + +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 9; diff --git a/lib/binding/ffi.rs b/lib/binding/ffi.rs new file mode 100644 index 00000000..323609e0 --- /dev/null +++ b/lib/binding/ffi.rs @@ -0,0 +1,4 @@ +#![allow(dead_code)] +#![allow(non_upper_case_globals)] + +include!("./bindings.rs"); diff --git a/lib/binding/lib.rs b/lib/binding/lib.rs new file mode 100644 index 00000000..65a57d16 --- /dev/null +++ b/lib/binding/lib.rs @@ -0,0 +1,1349 @@ +mod ffi; + +#[macro_use] +extern crate serde_derive; +extern crate regex; +extern crate serde; +extern crate serde_json; + +use regex::Regex; +use serde::de::DeserializeOwned; +use std::collections::HashMap; +use std::ffi::CStr; +use std::fmt; +use std::io::{self, Read, Seek}; +use std::marker::PhantomData; +use std::os::raw::{c_char, c_void}; +use std::ptr; +use std::str; + +#[derive(Clone, Copy)] +#[repr(transparent)] +pub struct Language(*const ffi::TSLanguage); + +#[derive(Debug, PartialEq, Eq)] +pub enum LogType { + Parse, + Lex, +} + +type Logger<'a> = Box; + +#[derive(Clone, Copy, Debug, PartialEq, 
Eq, PartialOrd, Ord)]
+pub struct Point {
+ pub row: usize,
+ pub column: usize,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Range {
+ pub start_byte: usize,
+ pub end_byte: usize,
+ pub start_point: Point,
+ pub end_point: Point,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct InputEdit {
+ pub start_byte: usize,
+ pub old_end_byte: usize,
+ pub new_end_byte: usize,
+ pub start_position: Point,
+ pub old_end_position: Point,
+ pub new_end_position: Point,
+}
+
+struct PropertyTransition {
+ state_id: usize,
+ child_index: Option<usize>,
+ text_regex_index: Option<usize>,
+}
+
+struct PropertyState {
+ transitions: HashMap<u16, Vec<PropertyTransition>>,
+ property_set_id: usize,
+ default_next_state_id: usize,
+}
+
+#[derive(Debug)]
+pub enum PropertySheetError {
+ InvalidJSON(serde_json::Error),
+ InvalidRegex(regex::Error),
+}
+
+pub struct PropertySheet<P> {
+ states: Vec<PropertyState>,
+ property_sets: Vec<P>,
+ text_regexes: Vec<Regex>,
+}
+
+#[derive(Clone, Copy)]
+pub struct Node<'a>(ffi::TSNode, PhantomData<&'a ()>);
+
+pub struct Parser(*mut ffi::TSParser);
+
+pub struct Tree(*mut ffi::TSTree);
+
+pub struct TreeCursor<'a>(ffi::TSTreeCursor, PhantomData<&'a ()>);
+
+pub struct TreePropertyCursor<'a, P> {
+ cursor: TreeCursor<'a>,
+ state_stack: Vec<usize>,
+ child_index_stack: Vec<usize>,
+ property_sheet: &'a PropertySheet<P>
, + source: &'a str, +} + +impl Language { + pub fn node_kind_count(&self) -> usize { + unsafe { ffi::ts_language_symbol_count(self.0) as usize } + } + + pub fn node_kind_for_id(&self, id: u16) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_language_symbol_name(self.0, id)) } + .to_str() + .unwrap() + } + + pub fn node_kind_is_named(&self, id: u16) -> bool { + unsafe { ffi::ts_language_symbol_type(self.0, id) == ffi::TSSymbolType_TSSymbolTypeRegular } + } +} + +unsafe impl Send for Language {} + +unsafe impl Sync for Language {} + +impl Parser { + pub fn new() -> Parser { + unsafe { + let parser = ffi::ts_parser_new(); + Parser(parser) + } + } + + pub fn set_language(&mut self, language: Language) -> Result<(), String> { + unsafe { + let version = ffi::ts_language_version(language.0) as usize; + if version == ffi::TREE_SITTER_LANGUAGE_VERSION { + ffi::ts_parser_set_language(self.0, language.0); + Ok(()) + } else { + Err(format!( + "Incompatible language version {}. Expected {}.", + version, + ffi::TREE_SITTER_LANGUAGE_VERSION + )) + } + } + } + + pub fn logger(&self) -> Option<&Logger> { + let logger = unsafe { ffi::ts_parser_logger(self.0) }; + unsafe { (logger.payload as *mut Logger).as_ref() } + } + + pub fn set_logger(&mut self, logger: Option) { + let prev_logger = unsafe { ffi::ts_parser_logger(self.0) }; + if !prev_logger.payload.is_null() { + unsafe { Box::from_raw(prev_logger.payload as *mut Logger) }; + } + + let c_logger; + if let Some(logger) = logger { + let container = Box::new(logger); + + unsafe extern "C" fn log( + payload: *mut c_void, + c_log_type: ffi::TSLogType, + c_message: *const c_char, + ) { + let callback = (payload as *mut Logger).as_mut().unwrap(); + if let Ok(message) = CStr::from_ptr(c_message).to_str() { + let log_type = if c_log_type == ffi::TSLogType_TSLogTypeParse { + LogType::Parse + } else { + LogType::Lex + }; + callback(log_type, message); + } + }; + + let raw_container = Box::into_raw(container); + + c_logger = ffi::TSLogger { + payload: raw_container as *mut c_void, + log: Some(log), + }; + } else { + c_logger = ffi::TSLogger { + payload: ptr::null_mut(), + log: None, + }; + } + + unsafe { ffi::ts_parser_set_logger(self.0, c_logger) }; + } + + pub fn parse_str(&mut self, input: &str, old_tree: Option<&Tree>) -> Option { + let bytes = input.as_bytes(); + self.parse_utf8( + &mut |offset, _| { + if offset < bytes.len() { + &bytes[offset..] 
+ } else { + &[] + } + }, + old_tree, + ) + } + + pub fn parse_utf8<'a, T: FnMut(usize, Point) -> &'a [u8]>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + self.parse_utf8_ptr( + &mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, + old_tree, + ) + } + + pub fn parse_utf16<'a, T: 'a + FnMut(usize, Point) -> &'a [u16]>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + self.parse_utf16_ptr( + &mut |byte, position| { + let slice = input(byte, position); + (slice.as_ptr(), slice.len()) + }, + old_tree, + ) + } + + pub fn parse_utf8_io( + &mut self, + mut input: impl Read + Seek, + old_tree: Option<&Tree>, + ) -> io::Result> { + let mut error = None; + let mut current_offset = 0; + let mut buffer = [0; 10 * 1024]; + let result = self.parse_utf8_ptr( + &mut |byte, _| { + if byte as u64 != current_offset { + current_offset = byte as u64; + if let Err(e) = input.seek(io::SeekFrom::Start(current_offset)) { + error = Some(e); + return (ptr::null(), 0); + } + } + + match input.read(&mut buffer) { + Err(e) => { + error = Some(e); + (ptr::null(), 0) + } + Ok(length) => (buffer.as_ptr(), length), + } + }, + old_tree, + ); + + match error { + Some(e) => Err(e), + None => Ok(result), + } + } + + pub fn reset(&mut self) { + unsafe { ffi::ts_parser_reset(self.0) } + } + + pub fn set_operation_limit(&mut self, limit: usize) { + unsafe { ffi::ts_parser_set_operation_limit(self.0, limit) } + } + + pub fn set_included_ranges(&mut self, ranges: &[Range]) { + let ts_ranges: Vec = + ranges.iter().cloned().map(|range| range.into()).collect(); + unsafe { + ffi::ts_parser_set_included_ranges(self.0, ts_ranges.as_ptr(), ts_ranges.len() as u32) + }; + } + + fn parse_utf8_ptr (*const u8, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u8, usize)>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)(byte_offset as usize, position.into()); + *bytes_read = length as u32; + return ptr as *const c_char; + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF8, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } + + fn parse_utf16_ptr (*const u16, usize)>( + &mut self, + input: &mut T, + old_tree: Option<&Tree>, + ) -> Option { + unsafe extern "C" fn read (*const u16, usize)>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let input = (payload as *mut T).as_mut().unwrap(); + let (ptr, length) = (*input)( + byte_offset as usize, + Point { + row: position.row as usize, + column: position.column as usize / 2, + }, + ); + *bytes_read = length as u32 * 2; + ptr as *const c_char + }; + + let c_input = ffi::TSInput { + payload: input as *mut T as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncoding_TSInputEncodingUTF16, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0); + let c_new_tree = unsafe { ffi::ts_parser_parse(self.0, c_old_tree, c_input) }; + if c_new_tree.is_null() { + None + } else { + Some(Tree(c_new_tree)) + } + } +} + +impl Drop for 
Parser { + fn drop(&mut self) { + self.set_logger(None); + unsafe { ffi::ts_parser_delete(self.0) } + } +} + +unsafe impl Send for Parser {} + +impl Tree { + pub fn root_node(&self) -> Node { + Node::new(unsafe { ffi::ts_tree_root_node(self.0) }).unwrap() + } + + pub fn edit(&mut self, edit: &InputEdit) { + let edit = ffi::TSInputEdit { + start_byte: edit.start_byte as u32, + old_end_byte: edit.old_end_byte as u32, + new_end_byte: edit.new_end_byte as u32, + start_point: edit.start_position.into(), + old_end_point: edit.old_end_position.into(), + new_end_point: edit.new_end_position.into(), + }; + unsafe { ffi::ts_tree_edit(self.0, &edit) }; + } + + pub fn walk(&self) -> TreeCursor { + self.root_node().walk() + } + + pub fn walk_with_properties<'a, P>( + &'a self, + property_sheet: &'a PropertySheet
<P>
, + source: &'a str, + ) -> TreePropertyCursor<'a, P> { + TreePropertyCursor::new(self, property_sheet, source) + } +} + +unsafe impl Send for Tree {} + +impl fmt::Debug for Tree { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "{{Tree {:?}}}", self.root_node()) + } +} + +impl Drop for Tree { + fn drop(&mut self) { + unsafe { ffi::ts_tree_delete(self.0) } + } +} + +impl Clone for Tree { + fn clone(&self) -> Tree { + unsafe { Tree(ffi::ts_tree_copy(self.0)) } + } +} + +impl<'tree> Node<'tree> { + fn new(node: ffi::TSNode) -> Option { + if node.id.is_null() { + None + } else { + Some(Node(node, PhantomData)) + } + } + + pub fn kind_id(&self) -> u16 { + unsafe { ffi::ts_node_symbol(self.0) } + } + + pub fn kind(&self) -> &'static str { + unsafe { CStr::from_ptr(ffi::ts_node_type(self.0)) } + .to_str() + .unwrap() + } + + pub fn is_named(&self) -> bool { + unsafe { ffi::ts_node_is_named(self.0) } + } + + pub fn has_changes(&self) -> bool { + unsafe { ffi::ts_node_has_changes(self.0) } + } + + pub fn has_error(&self) -> bool { + unsafe { ffi::ts_node_has_error(self.0) } + } + + pub fn start_byte(&self) -> usize { + unsafe { ffi::ts_node_start_byte(self.0) as usize } + } + + pub fn end_byte(&self) -> usize { + unsafe { ffi::ts_node_end_byte(self.0) as usize } + } + + pub fn range(&self) -> Range { + Range { + start_byte: self.start_byte(), + end_byte: self.end_byte(), + start_point: self.start_position(), + end_point: self.end_position(), + } + } + + pub fn start_position(&self) -> Point { + let result = unsafe { ffi::ts_node_start_point(self.0) }; + result.into() + } + + pub fn end_position(&self) -> Point { + let result = unsafe { ffi::ts_node_end_point(self.0) }; + result.into() + } + + pub fn child(&self, i: usize) -> Option { + Self::new(unsafe { ffi::ts_node_child(self.0, i as u32) }) + } + + pub fn child_count(&self) -> usize { + unsafe { ffi::ts_node_child_count(self.0) as usize } + } + + pub fn children<'a>(&'a self) -> impl Iterator> + 'a { + (0..self.child_count()) + .into_iter() + .map(move |i| self.child(i).unwrap()) + } + + pub fn named_child<'a>(&'a self, i: usize) -> Option { + Self::new(unsafe { ffi::ts_node_named_child(self.0, i as u32) }) + } + + pub fn named_child_count(&self) -> usize { + unsafe { ffi::ts_node_named_child_count(self.0) as usize } + } + + pub fn parent(&self) -> Option { + Self::new(unsafe { ffi::ts_node_parent(self.0) }) + } + + pub fn next_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_sibling(self.0) }) + } + + pub fn prev_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_sibling(self.0) }) + } + + pub fn next_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_next_named_sibling(self.0) }) + } + + pub fn prev_named_sibling(&self) -> Option { + Self::new(unsafe { ffi::ts_node_prev_named_sibling(self.0) }) + } + + pub fn to_sexp(&self) -> String { + extern "C" { + fn free(pointer: *mut c_void); + } + + let c_string = unsafe { ffi::ts_node_string(self.0) }; + let result = unsafe { CStr::from_ptr(c_string) } + .to_str() + .unwrap() + .to_string(); + unsafe { free(c_string as *mut c_void) }; + result + } + + pub fn utf8_text<'a>(&self, source: &'a str) -> Result<&'a str, str::Utf8Error> { + str::from_utf8(&source.as_bytes()[self.start_byte()..self.end_byte()]) + } + + pub fn utf16_text<'a>(&self, source: &'a [u16]) -> &'a [u16] { + &source[self.start_byte()..self.end_byte()] + } + + pub fn walk(&self) -> TreeCursor<'tree> { + TreeCursor(unsafe { ffi::ts_tree_cursor_new(self.0) 
+
+impl<'a> PartialEq for Node<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.id == other.0.id
+    }
+}
+
+impl<'a> fmt::Debug for Node<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(
+            f,
+            "{{Node {} {} - {}}}",
+            self.kind(),
+            self.start_position(),
+            self.end_position()
+        )
+    }
+}
+
+impl<'a> TreeCursor<'a> {
+    pub fn node(&self) -> Node<'a> {
+        Node(
+            unsafe { ffi::ts_tree_cursor_current_node(&self.0) },
+            PhantomData,
+        )
+    }
+
+    pub fn goto_first_child(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_first_child(&mut self.0) };
+    }
+
+    pub fn goto_parent(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_parent(&mut self.0) };
+    }
+
+    pub fn goto_next_sibling(&mut self) -> bool {
+        return unsafe { ffi::ts_tree_cursor_goto_next_sibling(&mut self.0) };
+    }
+
+    pub fn goto_first_child_for_index(&mut self, index: usize) -> Option<usize> {
+        let result =
+            unsafe { ffi::ts_tree_cursor_goto_first_child_for_byte(&mut self.0, index as u32) };
+        if result < 0 {
+            None
+        } else {
+            Some(result as usize)
+        }
+    }
+}
+
+impl<'a> Drop for TreeCursor<'a> {
+    fn drop(&mut self) {
+        unsafe { ffi::ts_tree_cursor_delete(&mut self.0) }
+    }
+}
+
+impl<'a, P> TreePropertyCursor<'a, P> {
+    fn new(tree: &'a Tree, property_sheet: &'a PropertySheet<P>, source: &'a str) -> Self {
+        let mut result = Self {
+            cursor: tree.root_node().walk(),
+            child_index_stack: vec![0],
+            state_stack: vec![0],
+            property_sheet,
+            source,
+        };
+        let state = result.next_state(&result.current_state(), result.cursor.node().kind_id(), 0);
+        result.state_stack.push(state);
+        result
+    }
+
+    pub fn node(&self) -> Node<'a> {
+        self.cursor.node()
+    }
+
+    pub fn node_properties(&self) -> &'a P {
+        &self.property_sheet.property_sets[self.current_state().property_set_id]
+    }
+
+    pub fn goto_first_child(&mut self) -> bool {
+        if self.cursor.goto_first_child() {
+            let child_index = 0;
+            let next_state_id = {
+                let state = &self.current_state();
+                let kind_id = self.cursor.node().kind_id();
+                self.next_state(state, kind_id, child_index)
+            };
+            self.state_stack.push(next_state_id);
+            self.child_index_stack.push(child_index);
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn goto_next_sibling(&mut self) -> bool {
+        if self.cursor.goto_next_sibling() {
+            let child_index = self.child_index_stack.pop().unwrap() + 1;
+            self.state_stack.pop();
+            let next_state_id = {
+                let state = &self.current_state();
+                let kind_id = self.cursor.node().kind_id();
+                self.next_state(state, kind_id, child_index)
+            };
+            self.state_stack.push(next_state_id);
+            self.child_index_stack.push(child_index);
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn goto_parent(&mut self) -> bool {
+        if self.cursor.goto_parent() {
+            self.state_stack.pop();
+            self.child_index_stack.pop();
+            true
+        } else {
+            false
+        }
+    }
+
+    fn next_state(
+        &self,
+        state: &PropertyState,
+        node_kind_id: u16,
+        node_child_index: usize,
+    ) -> usize {
+        state
+            .transitions
+            .get(&node_kind_id)
+            .and_then(|transitions| {
+                for transition in transitions.iter() {
+                    if let Some(text_regex_index) = transition.text_regex_index {
+                        let node = self.cursor.node();
+                        let text = &self.source.as_bytes()[node.start_byte()..node.end_byte()];
+                        if let Ok(text) = str::from_utf8(text) {
+                            if !self.property_sheet.text_regexes[text_regex_index].is_match(text) {
+                                continue;
+                            }
+                        }
+                    }
+
+                    if let Some(child_index) = transition.child_index {
+                        if child_index != node_child_index {
+                            continue;
+                        }
+                    }
+
+                    return Some(transition.state_id);
+                }
+                None
+            })
+            .unwrap_or(state.default_next_state_id)
+    }
+
+    fn current_state(&self) -> &PropertyState {
+        &self.property_sheet.states[*self.state_stack.last().unwrap()]
+    }
+}
+
+impl Point {
+    pub fn new(row: usize, column: usize) -> Self {
+        Point { row, column }
+    }
+}
+
+impl fmt::Display for Point {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(f, "({}, {})", self.row, self.column)
+    }
+}
+
+impl Into<ffi::TSPoint> for Point {
+    fn into(self) -> ffi::TSPoint {
+        ffi::TSPoint {
+            row: self.row as u32,
+            column: self.column as u32,
+        }
+    }
+}
+
+impl From<ffi::TSPoint> for Point {
+    fn from(point: ffi::TSPoint) -> Self {
+        Self {
+            row: point.row as usize,
+            column: point.column as usize,
+        }
+    }
+}
+
+impl Into<ffi::TSRange> for Range {
+    fn into(self) -> ffi::TSRange {
+        ffi::TSRange {
+            start_byte: self.start_byte as u32,
+            end_byte: self.end_byte as u32,
+            start_point: self.start_point.into(),
+            end_point: self.end_point.into(),
+        }
+    }
+}
+
+impl<P> PropertySheet<P> {
+    pub fn new(language: Language, json: &str) -> Result<Self, PropertySheetError>
+    where
+        P: DeserializeOwned,
+    {
+        #[derive(Deserialize, Debug)]
+        struct PropertyTransitionJSON {
+            #[serde(rename = "type")]
+            kind: String,
+            named: bool,
+            index: Option<usize>,
+            text: Option<String>,
+            state_id: usize,
+        }
+
+        #[derive(Deserialize, Debug)]
+        struct PropertyStateJSON {
+            transitions: Vec<PropertyTransitionJSON>,
+            property_set_id: usize,
+            default_next_state_id: usize,
+        }
+
+        #[derive(Deserialize, Debug)]
+        struct PropertySheetJSON<P> {
+            states: Vec<PropertyStateJSON>,
+            property_sets: Vec<P>,
+        }
+
+        let input: PropertySheetJSON<P> =
+            serde_json::from_str(json).map_err(PropertySheetError::InvalidJSON)?;
+        let mut states = Vec::new();
+        let mut text_regexes = Vec::new();
+        let mut text_regex_patterns = Vec::new();
+
+        for state in input.states.iter() {
+            let mut transitions = HashMap::new();
+            let node_kind_count = language.node_kind_count();
+            for transition in state.transitions.iter() {
+                let text_regex_index = if let Some(regex_pattern) = transition.text.as_ref() {
+                    if let Some(index) =
+                        text_regex_patterns.iter().position(|r| *r == regex_pattern)
+                    {
+                        Some(index)
+                    } else {
+                        text_regex_patterns.push(regex_pattern);
+                        text_regexes.push(
+                            Regex::new(&regex_pattern).map_err(PropertySheetError::InvalidRegex)?,
+                        );
+                        Some(text_regexes.len() - 1)
+                    }
+                } else {
+                    None
+                };
+
+                for i in 0..(node_kind_count as u16) {
+                    if transition.kind == language.node_kind_for_id(i)
+                        && transition.named == language.node_kind_is_named(i)
+                    {
+                        let entry = transitions.entry(i).or_insert(Vec::new());
+                        entry.push(PropertyTransition {
+                            child_index: transition.index,
+                            state_id: transition.state_id,
+                            text_regex_index,
+                        });
+                    }
+                }
+            }
+            states.push(PropertyState {
+                transitions,
+                default_next_state_id: state.default_next_state_id,
+                property_set_id: state.property_set_id,
+            });
+        }
+        Ok(Self {
+            property_sets: input.property_sets,
+            states,
+            text_regexes,
+        })
+    }
+
+    pub fn map<F, T, E>(self, mut f: F) -> Result<PropertySheet<T>, E>
+    where
+        F: FnMut(P) -> Result<T, E>,
+    {
+        let mut property_sets = Vec::with_capacity(self.property_sets.len());
+        for set in self.property_sets {
+            property_sets.push(f(set)?);
+        }
+        Ok(PropertySheet {
+            states: self.states,
+            text_regexes: self.text_regexes,
+            property_sets,
+        })
+    }
+}
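// Illustrative usage sketch (editorial note, not part of this patch): a
// `PropertySheet` is deserialized from JSON and then paired with a tree via
// `Tree::walk_with_properties`. The `Style` struct and `sheet_json` below are
// hypothetical placeholders:
//
//     #[derive(Debug, Deserialize)]
//     struct Style { color: Option<String> }
//
//     let sheet = PropertySheet::<Style>::new(language, sheet_json).unwrap();
//     let mut cursor = tree.walk_with_properties(&sheet, source_code);
//     while cursor.goto_first_child() {
//         println!("{}: {:?}", cursor.node().kind(), cursor.node_properties());
//     }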
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::thread;
+
+    fn rust() -> Language {
+        unsafe { tree_sitter_rust() }
+    }
+    extern "C" {
+        fn tree_sitter_rust() -> Language;
+    }
+
+    #[test]
+    fn test_basic_parsing() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        let tree = parser
+            .parse_str(
+                "
+                struct Stuff {}
+                fn main() {}
+                ",
+                None,
+            )
+            .unwrap();
+
+        let root_node = tree.root_node();
+        assert_eq!(root_node.kind(), "source_file");
+
+        assert_eq!(
+            root_node.to_sexp(),
+            "(source_file (struct_item (type_identifier) (field_declaration_list)) (function_item (identifier) (parameters) (block)))"
+        );
+
+        let struct_node = root_node.child(0).unwrap();
+        assert_eq!(struct_node.kind(), "struct_item");
+    }
+
+    #[test]
+    fn test_logging() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        let mut messages = Vec::new();
+        parser.set_logger(Some(Box::new(|log_type, message| {
+            messages.push((log_type, message.to_string()));
+        })));
+
+        parser
+            .parse_str(
+                "
+                struct Stuff {}
+                fn main() {}
+                ",
+                None,
+            )
+            .unwrap();
+
+        assert!(messages.contains(&(
+            LogType::Parse,
+            "reduce sym:struct_item, child_count:3".to_string()
+        )));
+        assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string())));
+    }
+
+    #[test]
+    fn test_tree_cursor() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        let tree = parser
+            .parse_str(
+                "
+                struct Stuff {
+                    a: A;
+                    b: Option<B>,
+                }
+                ",
+                None,
+            )
+            .unwrap();
+
+        let mut cursor = tree.walk();
+        assert_eq!(cursor.node().kind(), "source_file");
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "struct_item");
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "struct");
+        assert_eq!(cursor.node().is_named(), false);
+
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "type_identifier");
+        assert_eq!(cursor.node().is_named(), true);
+
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "field_declaration_list");
+        assert_eq!(cursor.node().is_named(), true);
+    }
+
+    #[test]
+    fn test_tree_property_matching() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+        let source_code = "fn f1() { f2(); }";
+        let tree = parser.parse_str(source_code, None).unwrap();
+
+        #[derive(Debug, Deserialize, PartialEq, Eq)]
+        struct Properties {
+            reference: Option<String>,
+            define: Option<String>,
+        }
+
+        let empty_properties = Properties {
+            reference: None,
+            define: None,
+        };
+
+        let property_sheet = PropertySheet::<Properties>::new(
+            rust(),
+            r##"
+            {
+              "states": [
+                {
+                  "transitions": [
+                    {"type": "call_expression", "named": true, "state_id": 1},
+                    {"type": "function_item", "named": true, "state_id": 2}
+                  ],
+                  "default_next_state_id": 0,
+                  "property_set_id": 0
+                },
+                {
+                  "transitions": [
+                    {"type": "identifier", "named": true, "state_id": 3}
+                  ],
+                  "default_next_state_id": 0,
+                  "property_set_id": 0
+                },
+                {
+                  "transitions": [
+                    {"type": "identifier", "named": true, "state_id": 4}
+                  ],
+                  "default_next_state_id": 0,
+                  "property_set_id": 0
+                },
+                {
+                  "transitions": [],
+                  "default_next_state_id": 0,
+                  "property_set_id": 1
+                },
+                {
+                  "transitions": [],
+                  "default_next_state_id": 0,
+                  "property_set_id": 2
+                }
+              ],
+              "property_sets": [
+                {},
+                {"reference": "function"},
+                {"define": "function"}
+              ]
+            }
+            "##,
+        )
+        .unwrap();
+
+        let mut cursor = tree.walk_with_properties(&property_sheet, source_code);
+        assert_eq!(cursor.node().kind(), "source_file");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "function_item");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "fn");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+        assert!(!cursor.goto_first_child());
+
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "identifier");
+        assert_eq!(cursor.node_properties().define, Some("function".to_owned()));
+        assert!(!cursor.goto_first_child());
+
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "parameters");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "(");
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), ")");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_parent());
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "block");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "call_expression");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "identifier");
+        assert_eq!(
+            cursor.node_properties().reference,
+            Some("function".to_owned())
+        );
+    }
+
+    #[test]
+    fn test_tree_property_matching_with_regexes() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+        let source_code = "fn f1() { None(a()) }";
+        let tree = parser.parse_str(source_code, None).unwrap();
+
+        #[derive(Debug, Deserialize, PartialEq, Eq)]
+        struct Properties {
+            scope: Option<String>,
+        }
+
+        let empty_properties = Properties { scope: None };
+
+        let property_sheet = PropertySheet::<Properties>::new(
+            rust(),
+            r##"
+            {
+              "states": [
+                {
+                  "id": 0,
+                  "transitions": [
+                    {"type": "call_expression", "named": true, "state_id": 1}
+                  ],
+                  "default_next_state_id": 0,
+                  "property_set_id": 0
+                },
+                {
+                  "id": 1,
+                  "transitions": [
+                    {"type": "identifier", "named": true, "text": "^[A-Z]", "state_id": 2},
+                    {"type": "identifier", "named": true, "state_id": 3}
+                  ],
+                  "default_next_state_id": 0,
+                  "property_set_id": 0
+                },
+                {
+                  "transitions": [],
+                  "default_next_state_id": 0,
+                  "property_set_id": 1
+                },
+                {
+                  "transitions": [],
+                  "default_next_state_id": 0,
+                  "property_set_id": 2
+                }
+              ],
+              "property_sets": [
+                {},
+                {"scope": "constructor"},
+                {"scope": "function"}
+              ]
+            }
+            "##,
+        )
+        .unwrap();
+
+        let mut cursor = tree.walk_with_properties(&property_sheet, source_code);
+        assert_eq!(cursor.node().kind(), "source_file");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        cursor.goto_first_child();
+        assert!(cursor.goto_first_child());
+        assert!(cursor.goto_next_sibling());
+        assert!(cursor.goto_next_sibling());
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "block");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert!(cursor.goto_next_sibling());
+        assert_eq!(cursor.node().kind(), "call_expression");
+        assert_eq!(*cursor.node_properties(), empty_properties);
+
+        assert!(cursor.goto_first_child());
+        assert_eq!(cursor.node().kind(), "identifier");
+        assert_eq!(
+            cursor.node_properties().scope,
+            Some("constructor".to_owned())
+        );
+    }
+
+    #[test]
+    fn test_custom_utf8_input() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        let lines = &["pub fn foo() {", " 1", "}"];
+
+        let tree = parser
+            .parse_utf8(
+                &mut |_, position| {
+                    let row = position.row;
+                    let column = position.column;
+                    if row < lines.len() {
+                        if column < lines[row].as_bytes().len() {
+                            &lines[row].as_bytes()[column..]
+                        } else {
+                            "\n".as_bytes()
+                        }
+                    } else {
+                        &[]
+                    }
+                },
+                None,
+            )
+            .unwrap();
+
+        let root = tree.root_node();
+        assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))");
+        assert_eq!(root.kind(), "source_file");
+        assert_eq!(root.has_error(), false);
+        assert_eq!(root.child(0).unwrap().kind(), "function_item");
+    }
+
+    #[test]
+    fn test_custom_utf16_input() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        parser.set_logger(Some(Box::new(|t, message| {
+            println!("log: {:?} {}", t, message);
+        })));
+
+        let lines: Vec<Vec<u16>> = ["pub fn foo() {", " 1", "}"]
+            .iter()
+            .map(|s| s.encode_utf16().collect())
+            .collect();
+
+        let tree = parser
+            .parse_utf16(
+                &mut |_, position| {
+                    let row = position.row;
+                    let column = position.column;
+                    if row < lines.len() {
+                        if column < lines[row].len() {
+                            &lines[row][column..]
+                        } else {
+                            &[10]
+                        }
+                    } else {
+                        &[]
+                    }
+                },
+                None,
+            )
+            .unwrap();
+
+        let root = tree.root_node();
+        assert_eq!(root.to_sexp(), "(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (integer_literal))))");
+        assert_eq!(root.kind(), "source_file");
+        assert_eq!(root.has_error(), false);
+        assert_eq!(root.child(0).unwrap().kind(), "function_item");
+    }
+
+    #[test]
+    fn test_node_equality() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+        let tree = parser.parse_str("struct A {}", None).unwrap();
+        let node1 = tree.root_node();
+        let node2 = tree.root_node();
+        assert_eq!(node1, node2);
+        assert_eq!(node1.child(0).unwrap(), node2.child(0).unwrap());
+        assert_ne!(node1.child(0).unwrap(), node2);
+    }
+
+    #[test]
+    fn test_editing() {
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+
+        let mut input_bytes = "fn test(a: A, c: C) {}".as_bytes();
+        let mut input_bytes_read = Vec::new();
+
+        let mut tree = parser
+            .parse_utf8(
+                &mut |offset, _| {
+                    let offset = offset;
+                    if offset < input_bytes.len() {
+                        let result = &input_bytes[offset..offset + 1];
+                        input_bytes_read.extend(result.iter());
+                        result
+                    } else {
+                        &[]
+                    }
+                },
+                None,
+            )
+            .unwrap();
+
+        let parameters_sexp = tree
+            .root_node()
+            .named_child(0)
+            .unwrap()
+            .named_child(1)
+            .unwrap()
+            .to_sexp();
+        assert_eq!(
+            parameters_sexp,
+            "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))"
+        );
+
+        input_bytes_read.clear();
+        input_bytes = "fn test(a: A, b: B, c: C) {}".as_bytes();
+        tree.edit(&InputEdit {
+            start_byte: 14,
+            old_end_byte: 14,
+            new_end_byte: 20,
+            start_position: Point::new(0, 14),
+            old_end_position: Point::new(0, 14),
+            new_end_position: Point::new(0, 20),
+        });
+
+        let tree = parser
+            .parse_utf8(
+                &mut |offset, _| {
+                    let offset = offset;
+                    if offset < input_bytes.len() {
+                        let result = &input_bytes[offset..offset + 1];
+                        input_bytes_read.extend(result.iter());
+                        result
+                    } else {
+                        &[]
+                    }
+                },
+                Some(&tree),
+            )
+            .unwrap();
+
+        let parameters_sexp = tree
+            .root_node()
+            .named_child(0)
+            .unwrap()
+            .named_child(1)
+            .unwrap()
+            .to_sexp();
+        assert_eq!(
+            parameters_sexp,
+            "(parameters (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)) (parameter (identifier) (type_identifier)))"
+        );
+
+        let retokenized_content = String::from_utf8(input_bytes_read).unwrap();
+        assert!(retokenized_content.contains("b: B"));
+        assert!(!retokenized_content.contains("a: A"));
+        assert!(!retokenized_content.contains("c: C"));
+        assert!(!retokenized_content.contains("{}"));
+    }
+
+    #[test]
+    fn test_parallel_parsing() {
+        // Parse this source file so that each thread has a non-trivial amount of
+        // work to do.
+        let this_file_source = include_str!("lib.rs");
+
+        let mut parser = Parser::new();
+        parser.set_language(rust()).unwrap();
+        let tree = parser.parse_str(this_file_source, None).unwrap();
+
+        let mut parse_threads = Vec::new();
+        for thread_id in 1..5 {
+            let mut tree_clone = tree.clone();
+            parse_threads.push(thread::spawn(move || {
+                // For each thread, prepend a different number of declarations to the
+                // source code.
+                let mut prepend_line_count = 0;
+                let mut prepended_source = String::new();
+                for _ in 0..thread_id {
+                    prepend_line_count += 2;
+                    prepended_source += "struct X {}\n\n";
+                }
+
+                tree_clone.edit(&InputEdit {
+                    start_byte: 0,
+                    old_end_byte: 0,
+                    new_end_byte: prepended_source.len(),
+                    start_position: Point::new(0, 0),
+                    old_end_position: Point::new(0, 0),
+                    new_end_position: Point::new(prepend_line_count, 0),
+                });
+                prepended_source += this_file_source;
+
+                // Reparse using the old tree as a starting point.
+                let mut parser = Parser::new();
+                parser.set_language(rust()).unwrap();
+                parser
+                    .parse_str(&prepended_source, Some(&tree_clone))
+                    .unwrap()
+            }));
+        }
+
+        // Check that the trees have the expected relationship to one another.
+        let trees = parse_threads
+            .into_iter()
+            .map(|thread| thread.join().unwrap());
+        let child_count_differences = trees
+            .map(|t| t.root_node().child_count() - tree.root_node().child_count())
+            .collect::<Vec<usize>>();
+
+        assert_eq!(child_count_differences, &[1, 2, 3, 4]);
+    }
+}
diff --git a/lib/build.rs b/lib/build.rs
new file mode 100644
index 00000000..cee131bd
--- /dev/null
+++ b/lib/build.rs
@@ -0,0 +1,26 @@
+extern crate cc;
+
+use std::env;
+use std::path::PathBuf;
+
+fn main() {
+    let mut config = cc::Build::new();
+    let src_path: PathBuf = ["src"].iter().collect();
+
+    config
+        .define("UTF8PROC_STATIC", "")
+        .flag_if_supported("-std=c99")
+        .flag_if_supported("-Wno-unused-parameter")
+        .include("include")
+        .include("utf8proc")
+        .file(src_path.join("runtime.c"));
+
+    if env::var("RUST_TREE_SITTER_TEST").is_ok() {
+        let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect();
+        config
+            .file(parser_dir.join("parser.c"))
+            .file(parser_dir.join("scanner.c"));
+    }
+
+    config.compile("tree-sitter-runtime");
+}
diff --git a/include/tree_sitter/compiler.h b/lib/include/tree_sitter/compiler.h
similarity index 100%
rename from include/tree_sitter/compiler.h
rename to lib/include/tree_sitter/compiler.h
diff --git a/include/tree_sitter/parser.h b/lib/include/tree_sitter/parser.h
similarity index 100%
rename from include/tree_sitter/parser.h
rename to lib/include/tree_sitter/parser.h
diff --git a/include/tree_sitter/runtime.h b/lib/include/tree_sitter/runtime.h
similarity index 100%
rename from include/tree_sitter/runtime.h
rename to lib/include/tree_sitter/runtime.h
diff --git a/src/runtime/alloc.h b/lib/src/alloc.h
similarity index 100%
rename from src/runtime/alloc.h
rename to lib/src/alloc.h
diff --git a/src/runtime/array.h b/lib/src/array.h
similarity index 100%
rename from src/runtime/array.h
rename to lib/src/array.h
diff --git a/src/runtime/atomic.h b/lib/src/atomic.h
similarity index 100%
rename from src/runtime/atomic.h
rename to lib/src/atomic.h
diff --git a/src/runtime/error_costs.h b/lib/src/error_costs.h
similarity index 100%
rename from src/runtime/error_costs.h
rename to lib/src/error_costs.h
diff --git a/src/runtime/get_changed_ranges.c b/lib/src/get_changed_ranges.c
similarity index 100%
rename from src/runtime/get_changed_ranges.c
rename to lib/src/get_changed_ranges.c
diff --git a/src/runtime/get_changed_ranges.h b/lib/src/get_changed_ranges.h
similarity index 100%
rename from src/runtime/get_changed_ranges.h
rename to lib/src/get_changed_ranges.h
diff --git a/src/runtime/language.c b/lib/src/language.c
similarity index 100%
rename from src/runtime/language.c
rename to lib/src/language.c
diff --git a/src/runtime/language.h b/lib/src/language.h
similarity index 100%
rename from src/runtime/language.h
rename to
lib/src/language.h diff --git a/src/runtime/length.h b/lib/src/length.h similarity index 100% rename from src/runtime/length.h rename to lib/src/length.h diff --git a/src/runtime/lexer.c b/lib/src/lexer.c similarity index 100% rename from src/runtime/lexer.c rename to lib/src/lexer.c diff --git a/src/runtime/lexer.h b/lib/src/lexer.h similarity index 100% rename from src/runtime/lexer.h rename to lib/src/lexer.h diff --git a/src/runtime/node.c b/lib/src/node.c similarity index 100% rename from src/runtime/node.c rename to lib/src/node.c diff --git a/src/runtime/parser.c b/lib/src/parser.c similarity index 100% rename from src/runtime/parser.c rename to lib/src/parser.c diff --git a/src/runtime/point.h b/lib/src/point.h similarity index 100% rename from src/runtime/point.h rename to lib/src/point.h diff --git a/src/runtime/reduce_action.h b/lib/src/reduce_action.h similarity index 100% rename from src/runtime/reduce_action.h rename to lib/src/reduce_action.h diff --git a/src/runtime/reusable_node.h b/lib/src/reusable_node.h similarity index 100% rename from src/runtime/reusable_node.h rename to lib/src/reusable_node.h diff --git a/src/runtime/runtime.c b/lib/src/runtime.c similarity index 100% rename from src/runtime/runtime.c rename to lib/src/runtime.c diff --git a/src/runtime/stack.c b/lib/src/stack.c similarity index 100% rename from src/runtime/stack.c rename to lib/src/stack.c diff --git a/src/runtime/stack.h b/lib/src/stack.h similarity index 100% rename from src/runtime/stack.h rename to lib/src/stack.h diff --git a/src/runtime/subtree.c b/lib/src/subtree.c similarity index 100% rename from src/runtime/subtree.c rename to lib/src/subtree.c diff --git a/src/runtime/subtree.h b/lib/src/subtree.h similarity index 100% rename from src/runtime/subtree.h rename to lib/src/subtree.h diff --git a/src/runtime/tree.c b/lib/src/tree.c similarity index 100% rename from src/runtime/tree.c rename to lib/src/tree.c diff --git a/src/runtime/tree.h b/lib/src/tree.h similarity index 100% rename from src/runtime/tree.h rename to lib/src/tree.h diff --git a/src/runtime/tree_cursor.c b/lib/src/tree_cursor.c similarity index 100% rename from src/runtime/tree_cursor.c rename to lib/src/tree_cursor.c diff --git a/src/runtime/tree_cursor.h b/lib/src/tree_cursor.h similarity index 100% rename from src/runtime/tree_cursor.h rename to lib/src/tree_cursor.h diff --git a/src/runtime/utf16.c b/lib/src/utf16.c similarity index 100% rename from src/runtime/utf16.c rename to lib/src/utf16.c diff --git a/src/runtime/utf16.h b/lib/src/utf16.h similarity index 100% rename from src/runtime/utf16.h rename to lib/src/utf16.h diff --git a/externals/utf8proc b/lib/utf8proc similarity index 100% rename from externals/utf8proc rename to lib/utf8proc diff --git a/project.gyp b/project.gyp deleted file mode 100644 index c7890373..00000000 --- a/project.gyp +++ /dev/null @@ -1,187 +0,0 @@ -{ - 'targets': [ - { - 'target_name': 'compiler', - - 'type': 'static_library', - 'include_dirs': [ - 'include', - 'src', - 'externals/utf8proc', - 'externals/json-parser', - ], - 'sources': [ - 'src/compiler/build_tables/lex_item.cc', - 'src/compiler/build_tables/lex_item_transitions.cc', - 'src/compiler/build_tables/lex_table_builder.cc', - 'src/compiler/build_tables/lookahead_set.cc', - 'src/compiler/build_tables/parse_item.cc', - 'src/compiler/build_tables/parse_item_set_builder.cc', - 'src/compiler/build_tables/parse_table_builder.cc', - 'src/compiler/build_tables/property_table_builder.cc', - 
'src/compiler/build_tables/rule_can_be_blank.cc', - 'src/compiler/compile.cc', - 'src/compiler/generate_code/c_code.cc', - 'src/compiler/generate_code/property_table_json.cc', - 'src/compiler/lex_table.cc', - 'src/compiler/log.cc', - 'src/compiler/parse_json.cc', - 'src/compiler/parse_table.cc', - 'src/compiler/precedence_range.cc', - 'src/compiler/prepare_grammar/expand_repeats.cc', - 'src/compiler/prepare_grammar/expand_tokens.cc', - 'src/compiler/prepare_grammar/extract_choices.cc', - 'src/compiler/prepare_grammar/extract_simple_aliases.cc', - 'src/compiler/prepare_grammar/extract_tokens.cc', - 'src/compiler/prepare_grammar/flatten_grammar.cc', - 'src/compiler/prepare_grammar/intern_symbols.cc', - 'src/compiler/prepare_grammar/normalize_rules.cc', - 'src/compiler/prepare_grammar/parse_regex.cc', - 'src/compiler/prepare_grammar/prepare_grammar.cc', - 'src/compiler/prepare_grammar/token_description.cc', - 'src/compiler/rule.cc', - 'src/compiler/syntax_grammar.cc', - 'src/compiler/rules/character_set.cc', - 'src/compiler/rules/choice.cc', - 'src/compiler/rules/metadata.cc', - 'src/compiler/rules/repeat.cc', - 'src/compiler/rules/seq.cc', - 'src/compiler/util/string_helpers.cc', - 'externals/utf8proc/utf8proc.c', - 'externals/json-parser/json.c', - ], - 'cflags_cc': [ - '-std=c++14', - ], - 'xcode_settings': { - 'CLANG_CXX_LANGUAGE_STANDARD': 'c++14', - 'GCC_ENABLE_CPP_EXCEPTIONS': 'NO', - }, - 'direct_dependent_settings': { - 'include_dirs': [ - 'include' - ], - }, - - 'conditions': [ - # For 64-bit builds on appveyor, we need to explicitly tell gyp - # to generate an x64 target in the MSVS project file. - ['" $output_path + +echo "" >> $output_path +version_constant='TREE_SITTER_LANGUAGE_VERSION' +version_number=$(egrep "#define $version_constant (.*)" $header_path | cut -d' ' -f3) +echo "pub const $version_constant: usize = $version_number;" >> $output_path diff --git a/script/fetch-test-fixtures.cmd b/script/fetch-test-fixtures.cmd new file mode 100755 index 00000000..33543961 --- /dev/null +++ b/script/fetch-test-fixtures.cmd @@ -0,0 +1,16 @@ +@Echo off +SETLOCAL + +Set grammar_dir=fixtures\tree-sitter-rust +Set grammar_url=https://github.com/tree-sitter/tree-sitter-rust + +@IF NOT EXIST %grammar_dir% ( + git clone %grammar_url% %grammar_dir% --depth=1 +) + +pushd %grammar_dir% +git fetch origin master --depth=1 +git reset --hard origin/master +popd + +ENDLOCAL diff --git a/script/fetch-test-fixtures.sh b/script/fetch-test-fixtures.sh new file mode 100755 index 00000000..24cc316a --- /dev/null +++ b/script/fetch-test-fixtures.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +grammar_dir='fixtures/tree-sitter-rust' +grammar_url='https://github.com/tree-sitter/tree-sitter-rust' + +if [ ! 
-d $grammar_dir ]; then + git clone $grammar_url $grammar_dir --depth=1 +fi + +( + cd $grammar_dir; + git fetch origin master --depth=1 + git reset --hard origin/master; +) diff --git a/script/test.sh b/script/test.sh new file mode 100755 index 00000000..eb6183c0 --- /dev/null +++ b/script/test.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +RUST_TREE_SITTER_TEST=1 cargo test $@ diff --git a/src/compiler/build_tables/lex_item.cc b/src/compiler/build_tables/lex_item.cc deleted file mode 100644 index 69a16ba1..00000000 --- a/src/compiler/build_tables/lex_item.cc +++ /dev/null @@ -1,137 +0,0 @@ -#include "compiler/build_tables/lex_item.h" -#include -#include "compiler/build_tables/lex_item_transitions.h" -#include "compiler/build_tables/rule_can_be_blank.h" -#include "compiler/rule.h" -#include "compiler/util/hash_combine.h" - -namespace tree_sitter { -namespace build_tables { - -using std::map; -using std::string; -using std::unordered_set; -using rules::CharacterSet; -using rules::Symbol; -using rules::Metadata; - -LexItem::LexItem(const rules::Symbol &lhs, const rules::Rule &rule) - : lhs(lhs), rule(rule) {} - -bool LexItem::operator==(const LexItem &other) const { - return lhs == other.lhs && rule == other.rule; -} - -using CompletionStatus = LexItem::CompletionStatus; - -static CompletionStatus get_completion_status(const rules::Rule &rule) { - return rule.match( - [](rules::Choice choice) { - for (const auto &element : choice.elements) { - auto status = get_completion_status(element); - if (status.is_done) return status; - } - return CompletionStatus{false, PrecedenceRange()}; - }, - - [](rules::Metadata metadata) { - CompletionStatus result = get_completion_status(*metadata.rule); - if (result.is_done && result.precedence.empty && metadata.params.has_precedence) { - result.precedence.add(metadata.params.precedence); - } - return result; - }, - - [](rules::Repeat repeat) { - return get_completion_status(*repeat.rule); - }, - - [](rules::Seq sequence) { - CompletionStatus left_status = get_completion_status(*sequence.left); - if (left_status.is_done) { - return get_completion_status(*sequence.right); - } else { - return CompletionStatus{false, PrecedenceRange()}; - } - }, - - [](rules::Blank blank) { - return CompletionStatus{true, PrecedenceRange()}; - }, - - [](rules::CharacterSet) { - return CompletionStatus{false, PrecedenceRange()}; - }, - - [](auto) { - return CompletionStatus{false, PrecedenceRange()}; - } - ); -} - - -LexItem::CompletionStatus LexItem::completion_status() const { - return get_completion_status(rule); -} - -LexItemSet::LexItemSet() {} - -LexItemSet::LexItemSet(const unordered_set &entries) - : entries(entries) {} - -bool LexItemSet::operator==(const LexItemSet &other) const { - return entries == other.entries; -} - -bool LexItem::is_in_separators() const { - if (!rule.is()) return false; - auto &metadata = rule.get_unchecked(); - return !metadata.params.is_main_token; -} - -bool LexItemSet::has_items_in_separators() const { - for (const LexItem &item : entries) { - if (item.is_in_separators()) return true; - } - return false; -} - -LexItemSet::TransitionMap LexItemSet::transitions() const { - TransitionMap result; - for (const LexItem &item : entries) { - lex_item_transitions(&result, item); - } - return result; -} - -bool LexItemSet::Transition::operator==(const LexItemSet::Transition &other) const { - return destination == other.destination && precedence == other.precedence && - in_main_token == other.in_main_token; -} - -} // namespace build_tables -} // namespace 
tree_sitter - -namespace std { - -using tree_sitter::util::hash_combine; -using tree_sitter::util::symmetric_hash_combine; -using tree_sitter::build_tables::LexItem; -using tree_sitter::build_tables::LexItemSet; - -size_t hash::operator()(const LexItem &item) const { - size_t result = 0; - hash_combine(&result, item.lhs.index); - hash_combine(&result, item.rule); - return result; -} - -size_t hash::operator()(const LexItemSet &item_set) const { - size_t result = 0; - hash_combine(&result, item_set.entries.size()); - for (const auto &item : item_set.entries) - symmetric_hash_combine(&result, item); - return result; -} - -} // namespace std diff --git a/src/compiler/build_tables/lex_item.h b/src/compiler/build_tables/lex_item.h deleted file mode 100644 index ac4bc487..00000000 --- a/src/compiler/build_tables/lex_item.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_LEX_ITEM_H_ -#define COMPILER_BUILD_TABLES_LEX_ITEM_H_ - -#include -#include -#include -#include -#include "compiler/rule.h" -#include "compiler/precedence_range.h" - -namespace tree_sitter { -namespace build_tables { - -class LexItem { - public: - LexItem(const rules::Symbol &, const rules::Rule &); - - struct CompletionStatus { - bool is_done; - PrecedenceRange precedence; - }; - - bool operator==(const LexItem &other) const; - CompletionStatus completion_status() const; - bool is_in_separators() const; - - rules::Symbol lhs; - rules::Rule rule; -}; - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -template <> -struct hash { - size_t operator()(const tree_sitter::build_tables::LexItem &) const; -}; - -} // namespace std - -namespace tree_sitter { -namespace build_tables { - -class LexItemSet { - public: - LexItemSet(); - explicit LexItemSet(const std::unordered_set &); - - struct Transition; - typedef std::map TransitionMap; - - bool operator==(const LexItemSet &) const; - TransitionMap transitions() const; - bool has_items_in_separators() const; - - std::unordered_set entries; -}; - -struct LexItemSet::Transition { - LexItemSet destination; - PrecedenceRange precedence; - bool in_main_token; - - bool operator==(const LexItemSet::Transition &) const; -}; - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -template <> -struct hash { - size_t operator()(const tree_sitter::build_tables::LexItemSet &) const; -}; - -} // namespace std - -#endif // COMPILER_BUILD_TABLES_LEX_ITEM_H_ diff --git a/src/compiler/build_tables/lex_item_transitions.cc b/src/compiler/build_tables/lex_item_transitions.cc deleted file mode 100644 index 7b4eb611..00000000 --- a/src/compiler/build_tables/lex_item_transitions.cc +++ /dev/null @@ -1,195 +0,0 @@ -#include "compiler/build_tables/lex_item_transitions.h" -#include -#include -#include -#include -#include "compiler/build_tables/rule_can_be_blank.h" -#include "compiler/rule.h" -#include "compiler/build_tables/lex_item.h" - -namespace tree_sitter { -namespace build_tables { - -using std::function; -using std::map; -using std::move; -using std::pair; -using std::vector; -using rules::CharacterSet; -using rules::Rule; -using Transition = LexItemSet::Transition; -using TransitionMap = LexItemSet::TransitionMap; - -class TransitionBuilder { - TransitionMap *transitions; - const rules::Symbol &item_lhs; - vector *precedence_stack; - bool in_main_token; - - inline Transition transform_transition(const Transition &transition, - const function &callback) { - LexItemSet destination; - for (const LexItem &item : transition.destination.entries) 
{ - destination.entries.insert(LexItem(item.lhs, callback(item.rule))); - } - return Transition{destination, transition.precedence, transition.in_main_token}; - } - - void add_transition(TransitionMap *transitions, CharacterSet new_characters, - Transition new_transition) { - vector> new_entries; - - auto iter = transitions->begin(); - while (iter != transitions->end()) { - CharacterSet existing_characters = iter->first; - Transition &existing_transition = iter->second; - - CharacterSet intersecting_characters = - existing_characters.remove_set(new_characters); - if (intersecting_characters.is_empty()) { - iter++; - continue; - } - - new_characters.remove_set(intersecting_characters); - - if (!existing_characters.is_empty()) - new_entries.push_back({ - existing_characters, existing_transition, - }); - - existing_transition.destination.entries.insert( - new_transition.destination.entries.begin(), - new_transition.destination.entries.end()); - existing_transition.precedence.add(new_transition.precedence); - existing_transition.in_main_token |= new_transition.in_main_token; - - new_entries.push_back({ - intersecting_characters, existing_transition, - }); - - transitions->erase(iter++); - } - - transitions->insert(new_entries.begin(), new_entries.end()); - - if (!new_characters.is_empty()) - transitions->insert({ new_characters, new_transition }); - } - - public: - void apply(const Rule &rule) { - rule.match( - [](const rules::Blank &) {}, - - [this](const rules::CharacterSet &character_set) { - PrecedenceRange precedence; - if (!precedence_stack->empty()) { - precedence.add(precedence_stack->back()); - } - - add_transition( - transitions, - character_set, - Transition{ - LexItemSet({ LexItem(item_lhs, rules::Blank{}) }), - precedence, - in_main_token, - } - ); - }, - - [this](const rules::Choice &choice) { - for (const auto &element : choice.elements) { - apply(element); - } - }, - - [this](const rules::Seq &sequence) { - TransitionMap left_transitions; - TransitionBuilder(&left_transitions, this).apply(*sequence.left); - - for (const auto &pair : left_transitions) { - add_transition( - transitions, - pair.first, - transform_transition(pair.second, [&sequence](Rule rule) -> Rule { - return Rule::seq({rule, *sequence.right}); - }) - ); - } - - if (rule_can_be_blank(*sequence.left)) { - apply(*sequence.right); - } - }, - - [this](const rules::Repeat &repeat) { - TransitionMap content_transitions; - TransitionBuilder(&content_transitions, this).apply(*repeat.rule); - - for (const auto &pair : content_transitions) { - add_transition(transitions, pair.first, pair.second); - add_transition( - transitions, pair.first, - transform_transition(pair.second, [&repeat](Rule item_rule) { - return Rule::seq({ item_rule, repeat }); - }) - ); - } - }, - - [this](const rules::Metadata &metadata) { - bool has_active_precedence = metadata.params.is_active; - if (has_active_precedence) - precedence_stack->push_back(metadata.params.precedence); - - if (metadata.params.is_main_token) - in_main_token = true; - - auto params = metadata.params; - if (params.has_precedence) - params.is_active = true; - - TransitionMap content_transitions; - TransitionBuilder(&content_transitions, this).apply(*metadata.rule); - - for (const auto &pair : content_transitions) { - add_transition( - transitions, pair.first, - transform_transition(pair.second, [¶ms](Rule rule) { - return rules::Metadata::merge(move(rule), params); - }) - ); - } - - if (has_active_precedence) { - precedence_stack->pop_back(); - } - }, - - [](auto) {} - ); - } 
- - TransitionBuilder(TransitionMap *transitions, const rules::Symbol &item_lhs, - vector *precedence_stack, bool in_main_token) - : transitions(transitions), - item_lhs(item_lhs), - precedence_stack(precedence_stack), - in_main_token(in_main_token) {} - - TransitionBuilder(TransitionMap *transitions, TransitionBuilder *other) - : transitions(transitions), - item_lhs(other->item_lhs), - precedence_stack(other->precedence_stack), - in_main_token(other->in_main_token) {} -}; - -void lex_item_transitions(TransitionMap *transitions, const LexItem &item) { - vector precedence_stack; - TransitionBuilder(transitions, item.lhs, &precedence_stack, false).apply(item.rule); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/lex_item_transitions.h b/src/compiler/build_tables/lex_item_transitions.h deleted file mode 100644 index 2cd10917..00000000 --- a/src/compiler/build_tables/lex_item_transitions.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_ -#define COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_ - -#include "compiler/build_tables/lex_item.h" - -namespace tree_sitter { -namespace build_tables { - -void lex_item_transitions(LexItemSet::TransitionMap *transitions, const LexItem &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_ diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc deleted file mode 100644 index e577d690..00000000 --- a/src/compiler/build_tables/lex_table_builder.cc +++ /dev/null @@ -1,687 +0,0 @@ -#include "compiler/build_tables/lex_table_builder.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "compiler/build_tables/lex_item.h" -#include "compiler/build_tables/lookahead_set.h" -#include "compiler/lexical_grammar.h" -#include "compiler/log.h" -#include "compiler/parse_table.h" -#include "compiler/rule.h" -#include "utf8proc.h" - -namespace tree_sitter { -namespace build_tables { - -using std::map; -using std::move; -using std::pair; -using std::set; -using std::string; -using std::vector; -using std::unordered_map; -using std::unordered_set; -using std::unique_ptr; -using std::iswalpha; -using rules::Rule; -using rules::Blank; -using rules::Choice; -using rules::CharacterSet; -using rules::Repeat; -using rules::Symbol; -using rules::Metadata; -using rules::Seq; - -bool CoincidentTokenIndex::contains(Symbol a, Symbol b) const { - return a == b || !states_with(a, b).empty(); -} - -const unordered_set &CoincidentTokenIndex::states_with(Symbol a, Symbol b) const { - static const unordered_set NO_STATES; - if (a.index > b.index) std::swap(a, b); - auto iter = entries.find({a.index, b.index}); - if (iter == entries.end()) { - return NO_STATES; - } else { - return iter->second; - } -} - -class LexTableBuilderImpl : public LexTableBuilder { - enum ConflictStatus { - DoesNotMatch = 0, - MatchesShorterStringWithinSeparators = 1 << 0, - MatchesSameString = 1 << 1, - MatchesLongerString = 1 << 2, - MatchesLongerStringWithValidNextChar = 1 << 3, - }; - - LexTable main_lex_table; - LexTable keyword_lex_table; - const LexicalGrammar grammar; - vector separator_rules; - unordered_map main_lex_state_ids; - unordered_map keyword_lex_state_ids; - CharacterSet separator_start_characters; - vector starting_characters_by_token; - vector following_characters_by_token; - const CoincidentTokenIndex &coincident_token_index; - ParseTable *parse_table; - 
vector conflict_matrix; - bool conflict_detection_mode; - LookaheadSet keyword_symbols; - Symbol word_token; - char encoding_buffer[8]; - - public: - LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar, - const unordered_map &following_tokens_by_token, - const CoincidentTokenIndex &coincident_token_index, - ParseTable *parse_table) - : grammar(lexical_grammar), - starting_characters_by_token(lexical_grammar.variables.size()), - following_characters_by_token(lexical_grammar.variables.size()), - coincident_token_index(coincident_token_index), - parse_table(parse_table), - conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch), - conflict_detection_mode(false), - word_token(syntax_grammar.word_token) { - - // Compute the possible separator rules and the set of separator characters that can occur - // immediately after any token. - for (const auto &rule : grammar.separators) { - separator_rules.push_back(Repeat{rule}); - add_starting_characters(&separator_start_characters, rule); - } - separator_rules.push_back(Blank{}); - - // Compute the set of characters that each token can start with and the set of non-separator - // characters that can follow each token. Also identify all of the tokens that can be - // considered 'keywords'. - LOG("characterizing tokens"); - for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol token = Symbol::terminal(i); - - add_starting_characters(&starting_characters_by_token[i], grammar.variables[i].rule); - const auto &following_tokens = following_tokens_by_token.find(token); - if (following_tokens != following_tokens_by_token.end()) { - following_tokens->second.for_each([&](Symbol following_token) { - add_starting_characters( - &following_characters_by_token[i], - grammar.variables[following_token.index].rule - ); - return true; - }); - } - } - - // For each pair of tokens, generate a lex table for just those two tokens and record what - // conflicts arise. - LOG_START("detecting conflicts between tokens"); - conflict_detection_mode = true; - for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { - for (Symbol::Index j = 0; j < i; j++) { - if (starting_characters_by_token[i].intersects(starting_characters_by_token[j]) || - starting_characters_by_token[i].intersects(separator_start_characters) || - starting_characters_by_token[j].intersects(separator_start_characters)) { - clear(); - add_lex_state(main_lex_table, item_set_for_terminals(LookaheadSet({ - Symbol::terminal(i), - Symbol::terminal(j) - }), true)); - } - } - } - LOG_END(); - - if (word_token != rules::NONE()) identify_keywords(); - } - - void identify_keywords() { - LookaheadSet homonyms; - for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) { - Symbol other_token = Symbol::terminal(j); - - // For now, only consider tokens as 'keywords' if they start with letters or underscores. 
- bool starts_with_letter = !starting_characters_by_token[j].includes_all; - for (auto character : starting_characters_by_token[j].included_chars) { - if (!iswalpha(character) && character != '_') { - starts_with_letter = false; - break; - } - } - if (!starts_with_letter) continue; - - if (get_conflict_status(word_token, other_token) == MatchesSameString) { - homonyms.insert(other_token); - } - } - - homonyms.for_each([&](Symbol homonym1) { - homonyms.for_each([&](Symbol homonym2) { - if (get_conflict_status(homonym1, homonym2) & MatchesSameString) { - LOG( - "conflict between homonyms %s %s", - token_name(homonym1).c_str(), - token_name(homonym2).c_str() - ); - homonyms.remove(homonym1); - } - return false; - }); - return true; - }); - - for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) { - Symbol other_token = Symbol::terminal(j); - if (other_token == word_token || homonyms.contains(other_token)) continue; - bool word_rule_shadows_other = get_conflict_status(other_token, word_token); - bool other_shadows_word_rule = get_conflict_status(word_token, other_token); - - if (word_rule_shadows_other || other_shadows_word_rule) { - homonyms.for_each([&](Symbol homonym) { - bool word_rule_was_already_present = true; - for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) { - if (!parse_table->states[state_id].has_terminal_entry(word_token)) { - word_rule_was_already_present = false; - break; - } - } - if (word_rule_was_already_present) return true; - - bool homonym_shadows_other = get_conflict_status(other_token, homonym); - bool other_shadows_homonym = get_conflict_status(homonym, other_token); - - if (word_rule_shadows_other != homonym_shadows_other) { - homonyms.remove(homonym); - LOG( - "remove %s because word_token would shadow %s", - token_name(homonym).c_str(), - token_name(other_token).c_str() - ); - } else if (other_shadows_word_rule != other_shadows_homonym) { - homonyms.remove(homonym); - LOG( - "remove %s because %s would shadow word_token", - token_name(homonym).c_str(), - token_name(other_token).c_str() - ); - } - return true; - }); - } - } - - if (!homonyms.empty()) { - LOG_START("found keywords:"); - homonyms.for_each([&](Symbol homonym) { - LOG("%s", token_name(homonym).c_str()); - return true; - }); - LOG_END(); - keyword_symbols = homonyms; - } - } - - BuildResult build() { - clear(); - conflict_detection_mode = false; - vector>> starting_token_sets; - - for (ParseState &parse_state : parse_table->states) { - LookaheadSet token_set; - for (auto &entry : parse_state.terminal_entries) { - if (word_token.is_terminal() && keyword_symbols.contains(entry.first)) { - token_set.insert(word_token); - } else { - token_set.insert(entry.first); - } - } - - bool did_merge = false; - for (auto &pair : starting_token_sets) { - if (merge_token_set(&pair.first, token_set)) { - did_merge = true; - pair.second.push_back(&parse_state); - break; - } - } - - if (!did_merge) starting_token_sets.push_back({token_set, {&parse_state}}); - } - - for (auto &pair : starting_token_sets) { - LexStateId state_id = add_lex_state(main_lex_table, item_set_for_terminals(pair.first, true)); - for (ParseState *parse_state : pair.second) { - parse_state->lex_state_id = state_id; - } - } - - add_lex_state(keyword_lex_table, item_set_for_terminals(keyword_symbols, false)); - - mark_fragile_tokens(); - remove_duplicate_lex_states(main_lex_table); - return {main_lex_table, keyword_lex_table, word_token}; - } - - bool does_token_shadow_other(Symbol token, Symbol 
shadowed_token) const { - if (keyword_symbols.contains(shadowed_token) && - (keyword_symbols.contains(token) || token == word_token)) return false; - return get_conflict_status(shadowed_token, token) & ( - MatchesShorterStringWithinSeparators | - MatchesLongerStringWithValidNextChar - ); - } - - bool does_token_match_same_string_as_other(Symbol token, Symbol shadowed_token) const { - if (shadowed_token == word_token && keyword_symbols.contains(token)) return false; - return get_conflict_status(shadowed_token, token) & MatchesSameString; - } - - private: - ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const { - if (shadowed_token.is_built_in() || - other_token.is_built_in() || - !shadowed_token.is_terminal() || - !other_token.is_terminal()) return DoesNotMatch; - unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index; - return conflict_matrix[index]; - } - - bool record_conflict(Symbol shadowed_token, Symbol other_token, ConflictStatus status) { - if (!conflict_detection_mode) return false; - unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index; - bool was_set = conflict_matrix[index] & status; - conflict_matrix[index] = static_cast(conflict_matrix[index] | status); - return !was_set; - } - - LexStateId add_lex_state(LexTable &lex_table, const LexItemSet &item_set) { - auto &lex_state_ids = &lex_table == &main_lex_table ? - main_lex_state_ids : - keyword_lex_state_ids; - const auto &pair = lex_state_ids.find(item_set); - if (pair == lex_state_ids.end()) { - LexStateId state_id = lex_table.states.size(); - lex_table.states.push_back(LexState()); - lex_state_ids[item_set] = state_id; - add_accept_token_actions(lex_table, item_set, state_id); - add_advance_actions(lex_table, item_set, state_id); - return state_id; - } else { - return pair->second; - } - } - - void add_advance_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) { - for (const auto &pair : item_set.transitions()) { - const CharacterSet &characters = pair.first; - const LexItemSet::Transition &transition = pair.second; - - AdvanceAction action(-1, transition.precedence, transition.in_main_token); - AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action; - if (accept_action.is_present()) { - bool prefer_advancing = action.precedence_range.max >= accept_action.precedence; - - if (conflict_detection_mode) { - bool next_item_set_can_yield_this_token = false; - for (const LexItem &item : transition.destination.entries) { - if (item.lhs == accept_action.symbol) { - next_item_set_can_yield_this_token = true; - } else if (!prefer_advancing && item_set.has_items_in_separators()) { - record_conflict(item.lhs, accept_action.symbol, MatchesShorterStringWithinSeparators); - } - } - - if (prefer_advancing && !next_item_set_can_yield_this_token) { - auto advance_symbol = transition.destination.entries.begin()->lhs; - auto &following_chars = following_characters_by_token[accept_action.symbol.index]; - CharacterSet conflicting_following_chars = characters.intersection(following_chars); - if (conflicting_following_chars.is_empty()) { - conflicting_following_chars = characters.intersection(separator_start_characters); - } - if (conflicting_following_chars.is_empty()) { - record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString); - } else { - if (record_conflict( - accept_action.symbol, - advance_symbol, - MatchesLongerStringWithValidNextChar - )) { - if 
(!conflicting_following_chars.included_chars.empty()) { - LOG( - "%s shadows %s followed by '%s'", - token_name(advance_symbol).c_str(), - token_name(accept_action.symbol).c_str(), - log_char(*conflicting_following_chars.included_chars.begin()) - ); - } - } - } - } - } - - if (!prefer_advancing) continue; - } - - action.state_index = add_lex_state(lex_table, transition.destination); - lex_table.states[state_id].advance_actions[characters] = action; - } - } - - void add_accept_token_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) { - for (const LexItem &item : item_set.entries) { - LexItem::CompletionStatus completion_status = item.completion_status(); - if (completion_status.is_done) { - AcceptTokenAction action(item.lhs, completion_status.precedence.max); - - if (!item.lhs.is_built_in()) { - const LexicalVariable &variable = grammar.variables[item.lhs.index]; - if (variable.is_string) action.implicit_precedence += 2; - if (is_immediate_token(variable.rule)) action.implicit_precedence += 1; - } - - AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action; - if (existing_action.is_present()) { - if (should_replace_accept_action(existing_action, action)) { - if (record_conflict(existing_action.symbol, action.symbol, MatchesSameString)) { - LOG( - "%s shadows %s - same length", - token_name(action.symbol).c_str(), - token_name(existing_action.symbol).c_str() - ); - } - } else { - if (record_conflict(action.symbol, existing_action.symbol, MatchesSameString)) { - LOG( - "%s shadows %s - same length", - token_name(existing_action.symbol).c_str(), - token_name(action.symbol).c_str() - ); - } - continue; - } - } - lex_table.states[state_id].accept_action = action; - } - } - } - - void mark_fragile_tokens() { - for (ParseState &state : parse_table->states) { - for (auto &entry : state.terminal_entries) { - Symbol token = entry.first; - if (token.is_external() || token.is_built_in()) continue; - for (unsigned i = 0; i < grammar.variables.size(); i++) { - Symbol other_token = Symbol::terminal(i); - ConflictStatus status = get_conflict_status(token, other_token); - if (status != ConflictStatus::DoesNotMatch && - state.terminal_entries.count(other_token)) { - entry.second.reusable = false; - break; - } - } - } - } - } - - bool merge_token_set(LookaheadSet *left, const LookaheadSet &right) const { - auto CannotDistinguish = ( - MatchesShorterStringWithinSeparators | - MatchesSameString | - MatchesLongerStringWithValidNextChar - ); - - bool is_compatible = true; - - left->for_each_difference(right, [&](bool in_left, Symbol different_symbol) { - if (!different_symbol.is_external() && !different_symbol.is_built_in()) { - const LookaheadSet &existing_set = in_left ? 
right : *left; - existing_set.for_each([&](Symbol existing_symbol) { - if ((get_conflict_status(existing_symbol, different_symbol) & CannotDistinguish) || - !coincident_token_index.contains(different_symbol, existing_symbol)) { - is_compatible = false; - return false; - } - return true; - }); - if (!is_compatible) return false; - } - - return true; - }); - - if (is_compatible) left->insert_all(right); - return is_compatible; - } - - void remove_duplicate_lex_states(LexTable &lex_table) { - for (LexState &state : lex_table.states) { - state.accept_action.precedence = 0; - state.accept_action.implicit_precedence = 0; - } - - map replacements; - - while (true) { - map duplicates; - for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) { - for (LexStateId j = 0; j < i; j++) { - if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) { - duplicates.insert({ i, j }); - break; - } - } - } - - if (duplicates.empty()) break; - - map new_replacements; - for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) { - LexStateId new_state_index = i; - auto duplicate = duplicates.find(i); - if (duplicate != duplicates.end()) { - new_state_index = duplicate->second; - } - - size_t prior_removed = 0; - for (const auto &duplicate : duplicates) { - if (duplicate.first >= new_state_index) break; - prior_removed++; - } - - new_state_index -= prior_removed; - new_replacements.insert({i, new_state_index}); - replacements.insert({ i, new_state_index }); - for (auto &replacement : replacements) { - if (replacement.second == i) { - replacement.second = new_state_index; - } - } - } - - for (auto &state : lex_table.states) { - for (auto &entry : state.advance_actions) { - auto new_replacement = new_replacements.find(entry.second.state_index); - if (new_replacement != new_replacements.end()) { - entry.second.state_index = new_replacement->second; - } - } - } - - for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) { - lex_table.states.erase(lex_table.states.begin() + i->first); - } - } - - for (ParseState &parse_state : parse_table->states) { - auto replacement = replacements.find(parse_state.lex_state_id); - if (replacement != replacements.end()) { - parse_state.lex_state_id = replacement->second; - } - } - } - - bool is_immediate_token(const Rule &rule) const { - return rule.match( - [](const Metadata &metadata) { - return metadata.params.is_main_token; - }, - - [](auto rule) { - return false; - } - ); - } - - LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) { - LexItemSet result; - terminals.for_each([&](Symbol symbol) { - if (symbol.is_terminal()) { - for (auto &&rule : rules_for_symbol(symbol)) { - if (with_separators && !is_immediate_token(rule)) { - for (const auto &separator_rule : separator_rules) { - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Rule::seq({ - separator_rule, - Metadata::main_token(move(rule)) - }) - ) - )); - } - } else { - result.entries.insert(LexItem(symbol, Metadata::main_token(move(rule)))); - } - } - } - return true; - }); - return result; - } - - static void add_starting_characters(CharacterSet *characters, const Rule &rule) { - rule.match( - [characters](const Seq &sequence) { - add_starting_characters(characters, *sequence.left); - }, - - [characters](const rules::Choice &rule) { - for (const auto &element : rule.elements) { - add_starting_characters(characters, element); - } - }, - - [characters](const rules::Repeat &rule) { - add_starting_characters(characters, 
*rule.rule); - }, - - [characters](const rules::Metadata &rule) { - add_starting_characters(characters, *rule.rule); - }, - - [characters](const rules::CharacterSet &rule) { - characters->add_set(rule); - }, - - [](auto) {} - ); - } - - vector rules_for_symbol(const rules::Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) { - return { CharacterSet().include(0) }; - } - - return grammar.variables[symbol.index].rule.match( - [](const Choice &choice) { - return choice.elements; - }, - - [](auto rule) { - return vector{ rule }; - } - ); - } - - bool should_replace_accept_action(const AcceptTokenAction &old_action, - const AcceptTokenAction &new_action) { - if (new_action.precedence > old_action.precedence) return true; - if (new_action.precedence < old_action.precedence) return false; - if (new_action.implicit_precedence > old_action.implicit_precedence) return true; - if (new_action.implicit_precedence < old_action.implicit_precedence) return false; - return new_action.symbol.index < old_action.symbol.index; - } - - void clear() { - main_lex_table.states.clear(); - main_lex_state_ids.clear(); - } - - string token_name(const rules::Symbol &symbol) { - const LexicalVariable &variable = grammar.variables[symbol.index]; - if (variable.type == VariableTypeNamed) { - return variable.name; - } else { - return "'" + variable.name + "'"; - } - } - - const char *log_char(int32_t character) { - uint32_t count = utf8proc_encode_char( - character, - reinterpret_cast(encoding_buffer) - ); - encoding_buffer[count] = 0; - return encoding_buffer; - } -}; - -unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar, - const unordered_map &following_tokens, - const CoincidentTokenIndex &coincident_tokens, - ParseTable *parse_table) { - return unique_ptr(new LexTableBuilderImpl( - syntax_grammar, - lexical_grammar, - following_tokens, - coincident_tokens, - parse_table - )); -} - -LexTableBuilder::BuildResult LexTableBuilder::build() { - return static_cast(this)->build(); -} - -bool LexTableBuilder::does_token_shadow_other(Symbol a, Symbol b) const { - return static_cast(this)->does_token_shadow_other(a, b); -} - -bool LexTableBuilder::does_token_match_same_string_as_other(Symbol a, Symbol b) const { - return static_cast(this)->does_token_match_same_string_as_other(a, b); -} - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -using tree_sitter::rules::Symbol; - -size_t hash>::operator()( - const pair &p -) const { - hash hasher; - return hasher(p.first) ^ hasher(p.second); -} - -} // namespace std diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h deleted file mode 100644 index 3075b75c..00000000 --- a/src/compiler/build_tables/lex_table_builder.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ -#define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ - -#include -#include -#include -#include -#include -#include "compiler/parse_table.h" -#include "compiler/lex_table.h" - -namespace std { - -using tree_sitter::rules::Symbol; - -template <> -struct hash> { - size_t operator()(const pair &) const; -}; - -} // namespace std - -namespace tree_sitter { - -struct ParseTable; -struct SyntaxGrammar; -struct LexicalGrammar; - -namespace build_tables { - -class LookaheadSet; - -struct CoincidentTokenIndex { - std::unordered_map< - std::pair, - std::unordered_set - > entries; - - bool contains(rules::Symbol, rules::Symbol) const; - const 
std::unordered_set &states_with(rules::Symbol, rules::Symbol) const; -}; - -class LexTableBuilder { - public: - static std::unique_ptr create( - const SyntaxGrammar &, - const LexicalGrammar &, - const std::unordered_map &, - const CoincidentTokenIndex &, - ParseTable * - ); - - struct BuildResult { - LexTable main_table; - LexTable keyword_table; - rules::Symbol keyword_capture_token; - }; - - BuildResult build(); - bool does_token_shadow_other(rules::Symbol, rules::Symbol) const; - bool does_token_match_same_string_as_other(rules::Symbol, rules::Symbol) const; - - protected: - LexTableBuilder() = default; -}; - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc deleted file mode 100644 index 6e5f73b5..00000000 --- a/src/compiler/build_tables/lookahead_set.cc +++ /dev/null @@ -1,147 +0,0 @@ -#include "compiler/build_tables/lookahead_set.h" -#include -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace build_tables { - -using std::vector; -using rules::Symbol; - -LookaheadSet::LookaheadSet() {} - -LookaheadSet::LookaheadSet(const vector &symbols) { - for (auto symbol : symbols) insert(symbol); -} - -bool LookaheadSet::empty() const { - return terminal_bits.empty() && external_bits.empty() && !eof; -} - -bool LookaheadSet::operator==(const LookaheadSet &other) const { - return - eof == other.eof && - external_bits == other.external_bits && - terminal_bits == other.terminal_bits; -} - -bool LookaheadSet::contains(const Symbol &symbol) const { - if (symbol == rules::END_OF_INPUT()) return eof; - auto &bits = symbol.is_external() ? external_bits : terminal_bits; - return bits.size() > static_cast(symbol.index) && bits[symbol.index]; -} - -bool LookaheadSet::intersects(const LookaheadSet &other) const { - bool result = false; - for_each([&](Symbol symbol) { - if (other.contains(symbol)) { - result = true; - return false; - } - return true; - }); - return result; -} - -size_t LookaheadSet::size() const { - size_t result = 0; - for (bool bit : external_bits) if (bit) result++; - for (bool bit : terminal_bits) if (bit) result++; - if (eof) result++; - return result; -} - -bool LookaheadSet::insert_all(const LookaheadSet &other) { - bool result = false; - - if (other.eof) { - if (!eof) { - eof = true; - result = true; - } - } - - if (other.external_bits.size() > external_bits.size()) { - external_bits.resize(other.external_bits.size()); - } - - auto iter = external_bits.begin(); - auto other_iter = other.external_bits.begin(); - auto other_end = other.external_bits.end(); - while (other_iter != other_end) { - if (*other_iter && !*iter) { - result = true; - *iter = true; - } - ++iter; - ++other_iter; - } - - if (other.terminal_bits.size() > terminal_bits.size()) { - terminal_bits.resize(other.terminal_bits.size()); - } - - iter = terminal_bits.begin(); - other_iter = other.terminal_bits.begin(); - other_end = other.terminal_bits.end(); - while (other_iter != other_end) { - if (*other_iter && !*iter) { - result = true; - *iter = true; - } - ++iter; - ++other_iter; - } - - return result; -} - -bool LookaheadSet::insert(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) { - if (!eof) { - eof = true; - return true; - } - return false; - } - - auto &bits = symbol.is_external() ? 
external_bits : terminal_bits; - if (bits.size() <= static_cast(symbol.index)) { - bits.resize(symbol.index + 1); - } - if (!bits[symbol.index]) { - bits[symbol.index] = true; - return true; - } - return false; -} - -bool LookaheadSet::remove(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) { - if (eof) { - eof = false; - return true; - } - return false; - } - - auto &bits = symbol.is_external() ? external_bits : terminal_bits; - if (bits.size() > static_cast(symbol.index)) { - if (bits[symbol.index]) { - bits[symbol.index] = false; - return true; - } - } - - return false; -} - -void LookaheadSet::clear() { - eof = false; - terminal_bits.clear(); - external_bits.clear(); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h deleted file mode 100644 index 6445969d..00000000 --- a/src/compiler/build_tables/lookahead_set.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_ -#define COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_ - -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace build_tables { - -class LookaheadSet { - std::vector terminal_bits; - std::vector external_bits; - bool eof = false; - - public: - LookaheadSet(); - explicit LookaheadSet(const std::vector &); - - bool empty() const; - size_t size() const; - bool operator==(const LookaheadSet &) const; - bool contains(const rules::Symbol &) const; - bool insert_all(const LookaheadSet &); - bool insert(const rules::Symbol &); - bool remove(const rules::Symbol &); - void clear(); - bool intersects(const LookaheadSet &) const; - - template - void for_each(const Callback &callback) const { - for (auto begin = external_bits.begin(), - end = external_bits.end(), - iter = begin; - iter != end; - ++iter) { - if (*iter) { - if (!callback(rules::Symbol::external(iter - begin))) return; - } - } - - if (eof) { - if (!callback(rules::END_OF_INPUT())) return; - } - - for (auto begin = terminal_bits.begin(), - end = terminal_bits.end(), - iter = begin; - iter != end; - ++iter) { - if (*iter) { - if (!callback(rules::Symbol::terminal(iter - begin))) return; - } - } - } - - template - void for_each_difference(const LookaheadSet &other, const Callback &callback) const { - auto end = external_bits.end(); - auto begin = external_bits.begin(); - auto other_end = other.external_bits.end(); - auto other_begin = other.external_bits.begin(); - auto common_end = other.external_bits.size() < external_bits.size() ? 
- begin + other.external_bits.size() : - end; - auto iter = begin; - auto other_iter = other_begin; - for (; iter != common_end; ++iter, ++other_iter) { - if (*iter) { - if (!*other_iter && !callback(true, rules::Symbol::external(iter - begin))) return; - } else if (*other_iter) { - if (!callback(false, rules::Symbol::external(iter - begin))) return; - } - } - for (; iter < end; ++iter) { - if (*iter && !callback(true, rules::Symbol::external(iter - begin))) return; - } - for (; other_iter < other_end; ++other_iter) { - if (*other_iter && !callback(false, rules::Symbol::external(other_iter - other_begin))) return; - } - - if (eof) { - if (!other.eof && !callback(true, rules::END_OF_INPUT())) return; - } else if (other.eof) { - if (!callback(false, rules::END_OF_INPUT())) return; - } - - end = terminal_bits.end(); - begin = terminal_bits.begin(); - other_end = other.terminal_bits.end(); - other_begin = other.terminal_bits.begin(); - common_end = other.terminal_bits.size() < terminal_bits.size() ? - begin + other.terminal_bits.size() : - end; - iter = begin; - other_iter = other_begin; - for (; iter != common_end; ++iter, ++other_iter) { - if (*iter) { - if (!*other_iter && !callback(true, rules::Symbol::terminal(iter - begin))) return; - } else if (*other_iter) { - if (!callback(false, rules::Symbol::terminal(iter - begin))) return; - } - } - for (; iter < end; ++iter) { - if (*iter && !callback(true, rules::Symbol::terminal(iter - begin))) return; - } - for (; other_iter < other_end; ++other_iter) { - if (*other_iter && !callback(false, rules::Symbol::terminal(other_iter - other_begin))) return; - } - } -}; - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_ diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc deleted file mode 100644 index 729cdc28..00000000 --- a/src/compiler/build_tables/parse_item.cc +++ /dev/null @@ -1,196 +0,0 @@ -#include "compiler/build_tables/parse_item.h" -#include -#include "compiler/syntax_grammar.h" -#include "compiler/rule.h" -#include "compiler/util/hash_combine.h" - -namespace tree_sitter { -namespace build_tables { - -using std::map; -using std::pair; -using std::string; -using std::to_string; -using rules::Symbol; -using rules::Associativity; -using util::hash_combine; - -ParseItem::ParseItem() : variable_index(-1), production(nullptr), step_index(0) {} - -ParseItem::ParseItem(const Symbol &lhs, const Production &production, - unsigned int step_index) - : variable_index(lhs.index), - production(&production), - step_index(step_index) {} - -bool ParseItem::operator==(const ParseItem &other) const { - if (step_index != other.step_index) return false; - if (variable_index != other.variable_index) return false; - if (production->size() != other.production->size()) return false; - for (size_t i = 0; i < step_index; i++) { - if (production->at(i).alias != other.production->at(i).alias) return false; - } - if (is_done()) { - if (!production->empty()) { - if (production->back().precedence != other.production->back().precedence) return false; - if (production->back().associativity != other.production->back().associativity) return false; - } - } else { - for (size_t i = step_index, n = production->size(); i < n; i++) { - if (production->at(i) != other.production->at(i)) return false; - } - } - return true; -} - -bool ParseItem::operator<(const ParseItem &other) const { - if (step_index < other.step_index) return true; - if (other.step_index < step_index) return 
false; - if (variable_index < other.variable_index) return true; - if (other.variable_index < variable_index) return false; - if (production->size() < other.production->size()) return true; - if (other.production->size() < production->size()) return false; - for (size_t i = 0; i < step_index; i++) { - if (production->at(i).alias < other.production->at(i).alias) return true; - if (other.production->at(i).alias < production->at(i).alias) return false; - } - if (is_done()) { - if (!production->empty()) { - if (production->back().precedence < other.production->back().precedence) return true; - if (other.production->back().precedence < production->back().precedence) return false; - if (production->back().associativity < other.production->back().associativity) return true; - if (other.production->back().associativity < production->back().associativity) return false; - } - } else { - for (size_t i = step_index, n = production->size(); i < n; i++) { - if (production->at(i) < other.production->at(i)) return true; - if (other.production->at(i) < production->at(i)) return false; - } - } - return false; -} - -Symbol ParseItem::lhs() const { - return Symbol{variable_index, Symbol::NonTerminal}; -} - -bool ParseItem::is_done() const { - return step_index >= production->size(); -} - -int ParseItem::precedence() const { - if (is_done()) { - if (production->empty()) { - return 0; - } else { - return production->back().precedence; - } - } else { - return production->at(step_index).precedence; - } -} - -int ParseItem::dynamic_precedence() const { - return production->dynamic_precedence; -} - -rules::Associativity ParseItem::associativity() const { - if (is_done()) { - if (production->empty()) { - return rules::AssociativityNone; - } else { - return production->back().associativity; - } - } else { - return production->at(step_index).associativity; - } -} - -Symbol ParseItem::next_symbol() const { - if (step_index >= production->size()) - return rules::NONE(); - else - return production->at(step_index).symbol; -} - -bool ParseItemSet::operator==(const ParseItemSet &other) const { - return entries == other.entries; -} - -size_t ParseItemSet::unfinished_item_signature() const { - size_t result = 0; - ParseItem previous_item; - for (auto &pair : entries) { - const ParseItem &item = pair.first; - if (item.step_index < item.production->size() && - (item.variable_index != previous_item.variable_index || - item.step_index != previous_item.step_index)) { - hash_combine(&result, item.variable_index); - hash_combine(&result, item.step_index); - previous_item = item; - } - } - return result; -} - -void ParseItemSet::add(const ParseItemSet &other) { - for (const auto &pair : other.entries) - entries[pair.first].insert_all(pair.second); -} - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -using tree_sitter::build_tables::ParseItem; -using tree_sitter::build_tables::ParseItemSet; -using tree_sitter::util::hash_combine; - -template <> -struct hash { - size_t operator()(const ParseItem &item) const { - size_t result = 0; - hash_combine(&result, item.variable_index); - hash_combine(&result, item.step_index); - hash_combine(&result, item.production->dynamic_precedence); - hash_combine(&result, item.production->size()); - for (size_t i = 0; i < item.step_index; i++) { - hash_combine(&result, item.production->at(i).alias.value); - hash_combine(&result, item.production->at(i).alias.is_named); - } - if (item.is_done()) { - if (!item.production->empty()) { - hash_combine(&result, 
item.production->back().precedence); - hash_combine(&result, item.production->back().associativity); - } - } else { - for (size_t i = item.step_index, n = item.production->size(); i < n; i++) { - auto &step = item.production->at(i); - hash_combine(&result, step.symbol); - hash_combine(&result, step.precedence); - hash_combine(&result, step.associativity); - } - } - return result; - } -}; - -size_t hash::operator()(const ParseItemSet &item_set) const { - size_t result = 0; - hash_combine(&result, item_set.entries.size()); - for (auto &pair : item_set.entries) { - const ParseItem &item = pair.first; - const auto &lookahead_set = pair.second; - - hash_combine(&result, item); - hash_combine(&result, lookahead_set.size()); - lookahead_set.for_each([&result](Symbol symbol) { - hash_combine(&result, symbol); - return true; - }); - } - return result; -} - -} // namespace std diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h deleted file mode 100644 index 47d14078..00000000 --- a/src/compiler/build_tables/parse_item.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_H_ -#define COMPILER_BUILD_TABLES_PARSE_ITEM_H_ - -#include -#include -#include "compiler/build_tables/lookahead_set.h" -#include "compiler/rule.h" -#include "compiler/syntax_grammar.h" -#include "compiler/precedence_range.h" - -namespace tree_sitter { -namespace build_tables { - -struct ParseItem { - ParseItem(); - ParseItem(const rules::Symbol &, const Production &, unsigned int); - - struct CompletionStatus { - bool is_done; - int precedence; - rules::Associativity associativity; - }; - - bool operator==(const ParseItem &other) const; - bool operator<(const ParseItem &other) const; - rules::Symbol lhs() const; - rules::Symbol next_symbol() const; - int precedence() const; - int dynamic_precedence() const; - rules::Associativity associativity() const; - bool is_done() const; - - int variable_index; - const Production *production; - unsigned int step_index; -}; - -struct ParseItemSet { - bool operator==(const ParseItemSet &) const; - void add(const ParseItemSet &); - size_t unfinished_item_signature() const; - - std::map entries; -}; - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -using tree_sitter::build_tables::ParseItemSet; - -template <> -struct hash { - size_t operator()(const ParseItemSet &item_set) const; -}; - -} // namespace std - -#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_H_ diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc deleted file mode 100644 index 6daa999b..00000000 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ /dev/null @@ -1,302 +0,0 @@ -#include "compiler/build_tables/parse_item_set_builder.h" -#include -#include -#include -#include -#include -#include -#include "compiler/syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace build_tables { - -using std::find; -using std::get; -using std::move; -using std::pair; -using std::set; -using std::unordered_map; -using std::vector; -using rules::Symbol; - -struct FollowSetInfo { - LookaheadSet lookaheads; - bool propagates_lookaheads; -}; - -struct NonTerminalQueueEntry { - Symbol::Index non_terminal; - LookaheadSet lookaheads; - bool propagates_lookaheads; -}; - -bool ParseItemSetBuilder::ParseItemSetComponent::operator==( - const ParseItemSetBuilder::ParseItemSetComponent &other) const { - return item == other.item && - 
lookaheads == other.lookaheads && - propagates_lookaheads == other.propagates_lookaheads; -} - -template -inline void find_or_push(vector &vector, const T &item) { - if (find(vector.begin(), vector.end(), item) == vector.end()) { - vector.push_back(item); - } -} - -ParseItemSetBuilder::ParseItemSetBuilder( - const SyntaxGrammar &grammar, - const LexicalGrammar &lexical_grammar -) : grammar{grammar} { - - // Populate the FIRST and LAST set of each terminal, which just contains the terminal itself. - for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - Symbol symbol = Symbol::terminal(i); - first_sets.insert({symbol, LookaheadSet({symbol})}); - last_sets.insert({symbol, LookaheadSet({symbol})}); - } - for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) { - Symbol symbol = Symbol::external(i); - first_sets.insert({symbol, LookaheadSet({symbol})}); - last_sets.insert({symbol, LookaheadSet({symbol})}); - } - - // Populate the FIRST and LAST set of each non-terminal by recursively expanding non-terminals. - vector symbols_to_process; - set processed_non_terminals; - for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol = Symbol::non_terminal(i); - LookaheadSet &first_set = first_sets[symbol]; - LookaheadSet &last_set = last_sets[symbol]; - - processed_non_terminals.clear(); - symbols_to_process.assign({symbol}); - while (!symbols_to_process.empty()) { - Symbol current_symbol = symbols_to_process.back(); - symbols_to_process.pop_back(); - - if (!current_symbol.is_non_terminal()) { - first_set.insert(current_symbol); - } else if (processed_non_terminals.insert(current_symbol.index).second) { - for (const Production &production : grammar.variables[current_symbol.index].productions) { - if (!production.empty()) { - symbols_to_process.push_back(production[0].symbol); - } - } - } - } - - processed_non_terminals.clear(); - symbols_to_process.assign({symbol}); - while (!symbols_to_process.empty()) { - Symbol current_symbol = symbols_to_process.back(); - symbols_to_process.pop_back(); - - if (!current_symbol.is_non_terminal()) { - last_set.insert(current_symbol); - } else if (processed_non_terminals.insert(current_symbol.index).second) { - for (const Production &production : grammar.variables[current_symbol.index].productions) { - if (!production.empty()) { - symbols_to_process.push_back(production.back().symbol); - } - } - } - } - } - - // Populate a cache of which ParseItems will be created when a given non-terminal is expanded. - vector non_terminal_queue; - for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { - - // Compute the follow set of each *other* non-terminal that the current non-terminal can - // start with. 
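
Both the FIRST/LAST loops above and the follow-set propagation below are instances of the same pattern: a depth-first worklist plus a "processed" set that guards against cycles in the grammar. A minimal, self-contained sketch of that pattern, using toy types (plain `int` symbols, with negative values standing in for non-terminals) rather than the `Symbol` and `LookaheadSet` types deleted here:

#include <map>
#include <set>
#include <vector>

// Toy representation: non-negative ints are terminals, negative ints are
// non-terminals. `leading` maps each non-terminal to the leading symbol of
// each of its productions, and must have an entry for every non-terminal.
std::set<int> first_set(int start, const std::map<int, std::vector<int>> &leading) {
  std::set<int> result;
  std::set<int> processed;
  std::vector<int> to_process{start};
  while (!to_process.empty()) {
    int symbol = to_process.back();
    to_process.pop_back();
    if (symbol >= 0) {
      result.insert(symbol);  // a terminal belongs directly to the FIRST set
    } else if (processed.insert(symbol).second) {
      // Expand each non-terminal at most once, guarding against cycles.
      for (int leading_symbol : leading.at(symbol)) {
        to_process.push_back(leading_symbol);
      }
    }
  }
  return result;
}
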
- unordered_map follow_set_info_by_non_terminal; - non_terminal_queue.assign({{i, LookaheadSet(), true}}); - while (!non_terminal_queue.empty()) { - NonTerminalQueueEntry queue_entry = non_terminal_queue.back(); - non_terminal_queue.pop_back(); - - bool queue_entry_is_new; - auto &follow_set_info = follow_set_info_by_non_terminal[queue_entry.non_terminal]; - if (queue_entry.propagates_lookaheads) { - queue_entry_is_new = !follow_set_info.propagates_lookaheads; - follow_set_info.propagates_lookaheads = true; - } else { - queue_entry_is_new = follow_set_info.lookaheads.insert_all(queue_entry.lookaheads); - } - - if (queue_entry_is_new) { - for (const Production &production : grammar.variables[queue_entry.non_terminal].productions) { - if (production.empty()) continue; - Symbol next_symbol = production.at(0).symbol; - if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue; - - LookaheadSet next_lookaheads; - bool propagates_lookaheads; - if (production.size() == 1) { - next_lookaheads = queue_entry.lookaheads; - propagates_lookaheads = queue_entry.propagates_lookaheads; - } else { - Symbol symbol_after_next = production.at(1).symbol; - next_lookaheads = first_sets.find(symbol_after_next)->second; - propagates_lookaheads = false; - } - - non_terminal_queue.push_back({ - next_symbol.index, - next_lookaheads, - propagates_lookaheads - }); - } - } - } - - // Use these follow sets to populate the cache of ParseItems for non-terminal `i`. - for (auto &pair : follow_set_info_by_non_terminal) { - Symbol non_terminal = Symbol::non_terminal(pair.first); - - for (const Production &production : grammar.variables[non_terminal.index].productions) { - ParseItem item(non_terminal, production, 0); - - if (grammar.variables_to_inline.count(item.next_symbol())) { - for (const Production &inlined_production : inline_production(item)) { - find_or_push(transitive_closure_component_cache[i], { - ParseItem(non_terminal, inlined_production, 0), - pair.second.lookaheads, - pair.second.propagates_lookaheads - }); - } - } else if (!grammar.variables_to_inline.count(non_terminal)) { - find_or_push(transitive_closure_component_cache[i], { - item, - pair.second.lookaheads, - pair.second.propagates_lookaheads - }); - } - } - } - } -} - -const vector &ParseItemSetBuilder::inline_production(const ParseItem &item) { - vector &result = inlined_productions_by_original_production[item]; - if (!result.empty()) return result; - - auto &inlined_step = item.production->at(item.step_index); - vector productions_to_insert; - for (auto &production : grammar.variables[inlined_step.symbol.index].productions) { - productions_to_insert.push_back(&production); - } - - for (auto iter = productions_to_insert.begin(); iter != productions_to_insert.end();) { - const Production *production = *iter; - - if (!production->empty() && grammar.variables_to_inline.count(production->steps.front().symbol)) { - iter = productions_to_insert.erase(iter); - for (auto &inlined_production : inline_production(ParseItem(inlined_step.symbol, *production, 0))) { - iter = productions_to_insert.insert(iter, &inlined_production); - } - } else { - ++iter; - } - } - - for (const Production *production_to_insert : productions_to_insert) { - auto begin = item.production->steps.begin(); - auto end = item.production->steps.end(); - auto step = begin + item.step_index; - Production production({begin, step}, item.production->dynamic_precedence); - - for (auto &step : *production_to_insert) { - production.steps.push_back(step); - if 
(!inlined_step.alias.value.empty()) { - production.steps.back().alias = inlined_step.alias; - } - } - - if (!production.back().precedence) { - production.back().precedence = inlined_step.precedence; - } - if (!production.back().associativity) { - production.back().associativity = inlined_step.associativity; - } - production.steps.insert( - production.steps.end(), - step + 1, - end - ); - - if (find(result.begin(), result.end(), production) == result.end()) { - result.push_back(move(production)); - } - } - - return result; -} - -void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { - for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) { - const ParseItem &item = iter->first; - const LookaheadSet &lookaheads = iter->second; - - // Items whose `step_index` is 0 are not part of the item set's "kernel"; they have been - // added in previous iterations of this loop, and they don't need to be further processed. - if (item.lhs() == rules::START() || item.step_index > 0) { - - // Kernel items whose next symbol is a non-terminal are expanded using the pre-computed - // parse item cache. - const Symbol &next_symbol = item.next_symbol(); - if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) { - - LookaheadSet next_lookaheads; - size_t next_step = item.step_index + 1; - if (next_step == item.production->size()) { - next_lookaheads = lookaheads; - } else { - Symbol symbol_after_next = item.production->at(next_step).symbol; - next_lookaheads = first_sets.find(symbol_after_next)->second; - } - - for (const auto &component : transitive_closure_component_cache[next_symbol.index]) { - LookaheadSet ¤t_lookaheads = item_set->entries[component.item]; - current_lookaheads.insert_all(component.lookaheads); - if (component.propagates_lookaheads) { - current_lookaheads.insert_all(next_lookaheads); - } - } - - if (grammar.variables_to_inline.count(next_symbol)) { - for (const Production &inlined_production : inline_production(item)) { - item_set->entries.insert({ - ParseItem(item.lhs(), inlined_production, item.step_index), - lookaheads - }); - } - - iter = item_set->entries.erase(iter); - continue; - } - } - } - - if (grammar.variables_to_inline.count(item.lhs())) { - iter = item_set->entries.erase(iter); - continue; - } - - ++iter; - } -} - -LookaheadSet ParseItemSetBuilder::get_first_set(const rules::Symbol &symbol) const { - return first_sets.find(symbol)->second; -} - -LookaheadSet ParseItemSetBuilder::get_last_set(const rules::Symbol &symbol) const { - return last_sets.find(symbol)->second; -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h deleted file mode 100644 index 3a8347e8..00000000 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_ -#define COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_ - -#include "compiler/build_tables/parse_item.h" -#include "compiler/rule.h" -#include -#include - -namespace tree_sitter { - -struct SyntaxGrammar; -struct LexicalGrammar; - -namespace build_tables { - -class ParseItemSetBuilder { - struct ParseItemSetComponent { - ParseItem item; - LookaheadSet lookaheads; - bool propagates_lookaheads; - bool operator==(const ParseItemSetComponent &) const; - }; - - const SyntaxGrammar &grammar; - std::map first_sets; - std::map last_sets; - std::map> transitive_closure_component_cache; - 
std::map<ParseItem, std::vector<Production>> inlined_productions_by_original_production;
-  const std::vector<Production> &inline_production(const ParseItem &);
-
- public:
-  ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &);
-  void apply_transitive_closure(ParseItemSet *);
-  LookaheadSet get_first_set(const rules::Symbol &) const;
-  LookaheadSet get_last_set(const rules::Symbol &) const;
-};
-
-} // namespace build_tables
-} // namespace tree_sitter
-
-#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_
diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc
deleted file mode 100644
index c5176de8..00000000
--- a/src/compiler/build_tables/parse_table_builder.cc
+++ /dev/null
@@ -1,960 +0,0 @@
-#include "compiler/build_tables/parse_table_builder.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "compiler/log.h"
-#include "compiler/parse_table.h"
-#include "compiler/build_tables/parse_item.h"
-#include "compiler/build_tables/parse_item_set_builder.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/rule.h"
-#include "compiler/build_tables/lex_table_builder.h"
-
-namespace tree_sitter {
-namespace build_tables {
-
-using std::deque;
-using std::find;
-using std::vector;
-using std::set;
-using std::tuple;
-using std::make_tuple;
-using std::map;
-using std::move;
-using std::string;
-using std::to_string;
-using std::unique_ptr;
-using std::unordered_map;
-using rules::Associativity;
-using rules::Symbol;
-using rules::END_OF_INPUT;
-
-using SymbolSequence = vector<Symbol>;
-
-// When there are conflicts involving auxiliary nodes (repeats),
-// this structure is used to find the non-auxiliary node(s) that
-// had the auxiliary node as a child.
-struct AuxiliaryNodeInfo {
-  Symbol auxiliary_node;
-  vector<Symbol> parents;
-};
-
-struct ParseStateQueueEntry {
-  SymbolSequence preceding_symbols;
-  vector<AuxiliaryNodeInfo> auxiliary_node_info_list;
-  ParseItemSet item_set;
-  ParseStateId state_id;
-};
-
-class ParseTableBuilderImpl : public ParseTableBuilder {
-  const SyntaxGrammar grammar;
-  const LexicalGrammar lexical_grammar;
-  const std::unordered_map<Symbol, rules::Alias> &simple_aliases;
-  unordered_map<ParseItemSet, ParseStateId> state_ids_by_item_set;
-  vector<const ParseItemSet *> item_sets_by_state_id;
-  deque<ParseStateQueueEntry> parse_state_queue;
-  ParseTable parse_table;
-  ParseItemSetBuilder item_set_builder;
-  unique_ptr<LexTableBuilder> lex_table_builder;
-  unordered_map<Symbol, LookaheadSet> following_tokens_by_token;
-  CoincidentTokenIndex coincident_token_index;
-  set<std::pair<Symbol, Symbol>> logged_conflict_tokens;
-
- public:
-  ParseTableBuilderImpl(
-    const SyntaxGrammar &syntax_grammar,
-    const LexicalGrammar &lexical_grammar,
-    const std::unordered_map<Symbol, rules::Alias> &simple_aliases
-  ) : grammar(syntax_grammar),
-      lexical_grammar(lexical_grammar),
-      simple_aliases(simple_aliases),
-      item_set_builder(syntax_grammar, lexical_grammar) {}
-
-  BuildResult build() {
-    // Ensure that the empty alias sequence has index 0.
-    parse_table.alias_sequences.push_back({});
-
-    // Ensure that the error state has index 0.
-    ParseStateId error_state_id = add_parse_state({}, {}, ParseItemSet{});
-
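
`add_parse_state` (defined further below) interns item sets: each distinct item set becomes exactly one parse state, and only newly created states are queued for processing. A stripped-down sketch of that idiom, with a toy stand-in for `ParseItemSet`:

#include <deque>
#include <map>
#include <set>

using ToyItemSet = std::set<int>;

std::map<ToyItemSet, unsigned> state_ids_by_set;  // interned item sets
std::deque<ToyItemSet> state_queue;               // states awaiting processing

unsigned intern_state(const ToyItemSet &item_set) {
  // The candidate id is the current number of states; insert() leaves the
  // existing entry untouched if this item set was seen before.
  auto insertion = state_ids_by_set.insert({item_set, (unsigned)state_ids_by_set.size()});
  if (insertion.second) state_queue.push_back(item_set);  // a brand-new state
  return insertion.first->second;                         // its (possibly old) id
}
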
-    // Add the starting state.
-    Symbol start_symbol = Symbol::non_terminal(0);
-    Production start_production({{start_symbol, 0, rules::AssociativityNone, rules::Alias{}}}, 0);
-
-    add_parse_state({}, {}, ParseItemSet{{
-      {
-        ParseItem(rules::START(), start_production, 0),
-        LookaheadSet({END_OF_INPUT()}),
-      },
-    }});
-
-    CompileError error = process_parse_state_queue();
-    if (error) return {
-      parse_table,
-      LexTable(),
-      LexTable(),
-      rules::NONE(),
-      error,
-    };
-
-    lex_table_builder = LexTableBuilder::create(
-      grammar,
-      lexical_grammar,
-      following_tokens_by_token,
-      coincident_token_index,
-      &parse_table
-    );
-
-    build_error_parse_state(error_state_id);
-    remove_precedence_values();
-    remove_duplicate_parse_states();
-    eliminate_unit_reductions();
-    populate_used_terminals();
-
-    auto lex_table_result = lex_table_builder->build();
-    return {
-      parse_table,
-      lex_table_result.main_table,
-      lex_table_result.keyword_table,
-      lex_table_result.keyword_capture_token,
-      CompileError::none()
-    };
-  }
-
- private:
-  CompileError process_parse_state_queue() {
-    while (!parse_state_queue.empty()) {
-      auto entry = parse_state_queue.front();
-      parse_state_queue.pop_front();
-
-      item_set_builder.apply_transitive_closure(&entry.item_set);
-      string conflict = add_actions(
-        move(entry.preceding_symbols),
-        move(entry.auxiliary_node_info_list),
-        move(entry.item_set),
-        entry.state_id
-      );
-
-      if (!conflict.empty()) {
-        return CompileError(TSCompileErrorTypeParseConflict, conflict);
-      }
-    }
-
-    return CompileError::none();
-  }
-
-  void build_error_parse_state(ParseStateId state_id) {
-    parse_table.states[state_id].terminal_entries.clear();
-
-    // First, identify the conflict-free tokens.
-    LookaheadSet conflict_free_tokens;
-    for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
-      Symbol token = Symbol::terminal(i);
-      bool conflicts_with_other_tokens = false;
-      for (unsigned j = 0; j < lexical_grammar.variables.size(); j++) {
-        Symbol other_token = Symbol::terminal(j);
-        if (!coincident_token_index.contains(token, other_token) &&
-            lex_table_builder->does_token_shadow_other(token, other_token)) {
-          conflicts_with_other_tokens = true;
-          break;
-        }
-      }
-      if (!conflicts_with_other_tokens) conflict_free_tokens.insert(token);
-    }
-
-    // Include in the error recovery state all of the tokens that are either
-    // conflict-free themselves, or have no conflicts with any conflict-free
-    // tokens.
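
A condensed sketch of that two-phase filter, with a hypothetical `shadows(a, b)` predicate standing in for the combined `does_token_shadow_other` and coincident-token query used below:

#include <functional>
#include <set>

std::set<int> recovery_tokens(int token_count, const std::function<bool(int, int)> &shadows) {
  // Phase 1: tokens that shadow no other token at all.
  std::set<int> conflict_free;
  for (int i = 0; i < token_count; i++) {
    bool ok = true;
    for (int j = 0; j < token_count; j++) {
      if (i != j && shadows(i, j)) { ok = false; break; }
    }
    if (ok) conflict_free.insert(i);
  }
  // Phase 2: also admit tokens that leave the conflict-free set intact.
  std::set<int> result = conflict_free;
  for (int i = 0; i < token_count; i++) {
    if (conflict_free.count(i)) continue;
    bool ok = true;
    for (int other : conflict_free) {
      if (shadows(i, other)) { ok = false; break; }
    }
    if (ok) result.insert(i);
  }
  return result;
}
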
- LOG_START("finding non-conflicting tokens for error recovery"); - LookaheadSet tokens; - for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { - Symbol token = Symbol::terminal(i); - if (conflict_free_tokens.contains(token)) { - LOG("include %s", symbol_name(token).c_str()); - parse_table.add_terminal_action(state_id, token, ParseAction::Recover()); - } else { - bool conflicts_with_other_tokens = false; - conflict_free_tokens.for_each([&](Symbol other_token) { - if (!coincident_token_index.contains(token, other_token) && - lex_table_builder->does_token_shadow_other(token, other_token)) { - LOG( - "exclude %s: conflicts with %s", - symbol_name(token).c_str(), - symbol_name(other_token).c_str() - ); - conflicts_with_other_tokens = true; - return false; - } - return true; - }); - if (!conflicts_with_other_tokens) { - LOG("include %s", symbol_name(token).c_str()); - parse_table.add_terminal_action(state_id, token, ParseAction::Recover()); - } - } - } - LOG_END(); - - for (size_t i = 0; i < grammar.external_tokens.size(); i++) { - if (grammar.external_tokens[i].corresponding_internal_token == rules::NONE()) { - parse_table.states[state_id].terminal_entries[Symbol::external(i)].actions.push_back(ParseAction::Recover()); - } - } - - parse_table.add_terminal_action(state_id, END_OF_INPUT(), ParseAction::Recover()); - } - - ParseStateId add_parse_state( - SymbolSequence &&preceding_symbols, - const vector &auxiliary_node_info_list, - const ParseItemSet &item_set - ) { - ParseStateId new_state_id = parse_table.states.size(); - auto insertion = state_ids_by_item_set.insert({move(item_set), new_state_id}); - if (insertion.second) { - item_sets_by_state_id.push_back(&insertion.first->first); - parse_table.states.push_back(ParseState()); - parse_state_queue.push_back({ - move(preceding_symbols), - auxiliary_node_info_list, - insertion.first->first, - new_state_id - }); - return new_state_id; - } else { - return insertion.first->second; - } - } - - string add_actions( - SymbolSequence &&sequence, - vector &&auxiliary_node_info_list, - ParseItemSet &&item_set, - ParseStateId state_id - ) { - map terminal_successors; - map nonterminal_successors; - set lookaheads_with_conflicts; - - for (const auto &pair : item_set.entries) { - const ParseItem &item = pair.first; - const LookaheadSet &lookahead_symbols = pair.second; - - // If the item is finished, immediately add a Reduce or Accept action to - // the parse table for each of its lookahead terminals. - if (item.is_done()) { - ParseAction action = item.lhs() == rules::START() ? - ParseAction::Accept() : - ParseAction::Reduce( - item.lhs(), - item.step_index, - item.precedence(), - item.production->dynamic_precedence, - item.associativity(), - get_alias_sequence_id(*item.production) - ); - - lookahead_symbols.for_each([&](Symbol lookahead) { - ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; - - // Only add the highest-precedence Reduce actions to the parse table. - // If other lower-precedence actions are possible, ignore them. 
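
The precedence rule stated in the comment above, reduced to its core as a sketch (with a toy action type; the real entries also track action kinds and associativity):

#include <vector>

struct ToyReduce { int precedence; };

void add_reduce(std::vector<ToyReduce> &actions, ToyReduce action) {
  if (actions.empty()) {
    actions.push_back(action);
  } else if (action.precedence > actions[0].precedence) {
    actions.assign({action});      // strictly higher precedence wins outright
  } else if (action.precedence == actions[0].precedence) {
    actions.push_back(action);     // a tie is recorded as a conflict
  }                                // strictly lower precedence is dropped
}
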
- if (entry.actions.empty()) { - entry.actions.push_back(action); - } else { - ParseAction &existing_action = entry.actions[0]; - if (existing_action.type == ParseActionTypeAccept) { - entry.actions.push_back(action); - } else { - if (action.precedence > existing_action.precedence) { - entry.actions.assign({action}); - lookaheads_with_conflicts.erase(lookahead); - } else if (action.precedence == existing_action.precedence) { - entry.actions.push_back(action); - lookaheads_with_conflicts.insert(lookahead); - } - } - } - - return true; - }); - - // If the item is unfinished, create a new item by advancing one symbol. - // Add that new item to a successor item set. - } else { - Symbol symbol = item.production->at(item.step_index).symbol; - ParseItem new_item(item.lhs(), *item.production, item.step_index + 1); - - if (symbol.is_non_terminal()) { - if (grammar.variables[symbol.index].type == VariableTypeAuxiliary) { - vector parents; - for (auto &item : item_set.entries) { - Symbol parent_symbol = item.first.lhs(); - if ( - item.first.next_symbol() == symbol && - grammar.variables[parent_symbol.index].type != VariableTypeAuxiliary && - !parent_symbol.is_built_in() - ) { - parents.push_back(parent_symbol); - } - } - auxiliary_node_info_list.push_back({symbol, parents}); - } - - nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols; - } else { - terminal_successors[symbol].entries[new_item] = lookahead_symbols; - } - } - } - - // Add a Shift action for each possible successor state. Shift actions for - // terminal lookaheads can conflict with Reduce actions added previously. - for (auto &pair : terminal_successors) { - Symbol lookahead = pair.first; - ParseItemSet &next_item_set = pair.second; - ParseStateId next_state_id = add_parse_state( - append_symbol(sequence, lookahead), - auxiliary_node_info_list, - next_item_set - ); - - if (!parse_table.states[state_id].terminal_entries[lookahead].actions.empty()) { - lookaheads_with_conflicts.insert(lookahead); - } - - parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id)); - } - - // Add a Shift action for each non-terminal transition. 
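
In classic LR terminology, these non-terminal entries form the GOTO table: after a reduction produces non-terminal N in state s, the parser transitions to goto[s][N]. A toy illustration of that lookup:

#include <map>

std::map<unsigned, std::map<int, unsigned>> goto_table;

unsigned state_after_reduce(unsigned state, int non_terminal) {
  return goto_table.at(state).at(non_terminal);  // the successor parse state
}
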
- for (auto &pair : nonterminal_successors) { - Symbol lookahead = Symbol::non_terminal(pair.first); - ParseItemSet &next_item_set = pair.second; - ParseStateId next_state_id = add_parse_state( - append_symbol(sequence, lookahead), - auxiliary_node_info_list, - next_item_set - ); - parse_table.set_nonterminal_action(state_id, lookahead.index, next_state_id); - } - - for (Symbol lookahead : lookaheads_with_conflicts) { - string conflict = handle_conflict(lookahead, item_set, sequence, auxiliary_node_info_list, state_id); - if (!conflict.empty()) return conflict; - } - - ParseAction shift_extra = ParseAction::ShiftExtra(); - ParseState &state = parse_table.states[state_id]; - for (const Symbol &extra_symbol : grammar.extra_tokens) { - if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action()) { - parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); - } - } - - auto &terminals = state.terminal_entries; - for (auto iter = terminals.begin(), end = terminals.end(); iter != end; ++iter) { - if (iter->first.is_built_in() || iter->first.is_external()) continue; - for (auto other_iter = terminals.begin(); other_iter != iter; ++other_iter) { - if (other_iter->first.is_built_in() || other_iter->first.is_external()) continue; - coincident_token_index.entries[{ - other_iter->first.index, - iter->first.index - }].insert(state_id); - } - } - - return ""; - } - - void remove_precedence_values() { - for (ParseState &state : parse_table.states) { - for (auto &entry : state.terminal_entries) { - auto &actions = entry.second.actions; - - for (ParseAction &action : actions) { - action.precedence = 0; - action.associativity = rules::AssociativityNone; - } - - for (auto i = actions.begin(); i != actions.end();) { - bool erased = false; - for (auto j = actions.begin(); j != i; j++) { - if (*j == *i) { - actions.erase(i); - erased = true; - break; - } - } - if (!erased) { - ++i; - } - } - } - } - } - - void remove_duplicate_parse_states() { - LOG_START("removing duplicate parse states"); - unordered_map> state_indices_by_signature; - - for (auto &pair : state_ids_by_item_set) { - const ParseItemSet &item_set = pair.first; - ParseStateId state_id = pair.second; - state_indices_by_signature[item_set.unfinished_item_signature()].insert(state_id); - } - - set deleted_states; - - while (true) { - map state_replacements; - - for (auto &pair : state_indices_by_signature) { - auto &state_indices = pair.second; - - for (auto i = state_indices.begin(), end = state_indices.end(); i != end;) { - for (ParseStateId j : state_indices) { - if (j == *i) { - ++i; - break; - } - if (!state_replacements.count(j) && merge_parse_state(j, *i)) { - state_replacements.insert({*i, j}); - deleted_states.insert(*i); - i = state_indices.erase(i); - break; - } - } - } - } - - if (state_replacements.empty()) break; - - for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - if (!state_replacements.count(i)) { - ParseState &state = parse_table.states[i]; - state.each_referenced_state([&state_replacements](ParseStateId *state_index) { - auto replacement = state_replacements.find(*state_index); - if (replacement != state_replacements.end()) { - *state_index = replacement->second; - } - }); - } - } - } - - delete_parse_states(deleted_states); - } - - void eliminate_unit_reductions() { - set aliased_symbols; - for (auto &variable : grammar.variables) { - for (auto &production : variable.productions) { - for (auto &step : production) { - if (!step.alias.value.empty()) { - 
aliased_symbols.insert(step.symbol); - } - } - } - } - - // Find all the "unit reduction states" - states whose only actions are unit reductions, - // all of which reduce by the same symbol. Store the symbols along with the state indices. - unordered_map unit_reduction_states; - for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - ParseState &state = parse_table.states[i]; - bool only_unit_reductions = true; - Symbol::Index unit_reduction_symbol = -1; - - if (!state.nonterminal_entries.empty()) continue; - - for (auto &entry : state.terminal_entries) { - for (ParseAction &action : entry.second.actions) { - if (action.extra) continue; - if (action.type == ParseActionTypeReduce && - action.consumed_symbol_count == 1 && - action.alias_sequence_id == 0 && - !simple_aliases.count(action.symbol) && - !aliased_symbols.count(action.symbol) && - grammar.variables[action.symbol.index].type != VariableTypeNamed && - (unit_reduction_symbol == -1 || unit_reduction_symbol == action.symbol.index) - ) { - unit_reduction_symbol = action.symbol.index; - } else { - only_unit_reductions = false; - break; - } - } - - if (!only_unit_reductions) break; - } - - if (only_unit_reductions) unit_reduction_states[i] = unit_reduction_symbol; - } - - // Update each parse state so that the parser never enters these "unit reduction states". - // If a shift action points to a unit reduction state, update it to point directly at - // the same state as the shift action that's associated with the unit reduction's - // non-terminal. - for (ParseState &state : parse_table.states) { - bool done = false; - while (!done) { - done = true; - state.each_referenced_state([&](ParseStateId *state_id) { - const auto &unit_reduction_entry = unit_reduction_states.find(*state_id); - if (unit_reduction_entry != unit_reduction_states.end()) { - auto entry_for_reduced_symbol = state.nonterminal_entries.find(unit_reduction_entry->second); - *state_id = entry_for_reduced_symbol->second; - done = false; - } - }); - } - } - - // Remove the unit reduction states from the parse table. - set states_to_delete; - for (auto &entry : unit_reduction_states) { - if (entry.first != 1) states_to_delete.insert(entry.first); - } - delete_parse_states(states_to_delete); - } - - void populate_used_terminals() { - for (const ParseState &state : parse_table.states) { - for (auto &entry : state.terminal_entries) { - parse_table.symbols.insert(entry.first); - } - } - } - - // Does this parse state already have the given set of actions, for some lookahead token? - static bool has_actions(const ParseState &state, const ParseTableEntry &entry) { - for (const auto &pair : state.terminal_entries) - if (pair.second.actions == entry.actions) - return true; - return false; - } - - // Can we add the given entry into the given parse state without affecting - // the behavior of the parser for valid inputs? - bool can_add_entry_to_state(const ParseState &state, Symbol new_token, const ParseTableEntry &entry) { - // Only merge parse states by allowing existing reductions to happen - // with additional lookahead tokens. Do not alter parse states in ways - // that allow entirely new types of actions to happen. - if (entry.actions.back().type != ParseActionTypeReduce) return false; - if (!has_actions(state, entry)) return false; - - // Do not add external tokens; they could conflict lexically with any of the state's - // existing lookahead tokens. - if (new_token.is_external()) return false; - - // Do not add tokens which are both internal and external. 
Their validity could - // influence the behavior of the external scanner. - for (const ExternalToken &external_token : grammar.external_tokens) { - if (external_token.corresponding_internal_token == new_token) return false; - } - - // Do not add a token if it conflicts with an existing token. - if (!new_token.is_built_in()) { - for (const auto &entry : state.terminal_entries) { - if (lex_table_builder->does_token_shadow_other(new_token, entry.first) || - lex_table_builder->does_token_match_same_string_as_other(new_token, entry.first)) { - LOG_IF( - logged_conflict_tokens.insert({entry.first, new_token}).second, - "cannot merge parse states due to token conflict: %s and %s", - symbol_name(entry.first).c_str(), - symbol_name(new_token).c_str() - ); - return false; - } - } - } - - return true; - } - - // If the parse states at the given indices are mergeable, merge the second one - // into the first one. - bool merge_parse_state(size_t left_index, size_t right_index) { - ParseState &left_state = parse_table.states[left_index]; - ParseState &right_state = parse_table.states[right_index]; - if (left_state.nonterminal_entries != right_state.nonterminal_entries) return false; - - for (auto &left_entry : left_state.terminal_entries) { - Symbol lookahead = left_entry.first; - const auto &right_entry = right_state.terminal_entries.find(lookahead); - if (right_entry == right_state.terminal_entries.end()) { - if (!can_add_entry_to_state(right_state, lookahead, left_entry.second)) return false; - } else { - if (right_entry->second.actions != left_entry.second.actions) return false; - } - } - - set symbols_to_merge; - for (auto &right_entry : right_state.terminal_entries) { - Symbol lookahead = right_entry.first; - const auto &left_entry = left_state.terminal_entries.find(lookahead); - if (left_entry == left_state.terminal_entries.end()) { - if (!can_add_entry_to_state(left_state, lookahead, right_entry.second)) return false; - symbols_to_merge.insert(lookahead); - } - } - - for (const Symbol &lookahead : symbols_to_merge) { - left_state.terminal_entries[lookahead] = right_state.terminal_entries[lookahead]; - } - - return true; - } - - string handle_conflict( - Symbol lookahead, - const ParseItemSet &item_set, - const SymbolSequence &preceding_symbols, - const vector &auxiliary_node_info_list, - ParseStateId state_id - ) { - ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; - bool considered_associativity = false; - int reduction_precedence = entry.actions.front().precedence; - - PrecedenceRange shift_precedence; - set conflicting_items; - for (auto &pair : item_set.entries) { - const ParseItem &item = pair.first; - if (item.is_done()) { - if (pair.second.contains(lookahead)) { - conflicting_items.insert(item); - } - } else if (item.step_index > 0) { - LookaheadSet first_set = item_set_builder.get_first_set(item.next_symbol()); - if (first_set.contains(lookahead)) { - shift_precedence.add(item.production->at(item.step_index - 1).precedence); - conflicting_items.insert(item); - } - } - } - - if (entry.actions.back().type == ParseActionTypeShift) { - Symbol symbol = conflicting_items.begin()->lhs(); - if (symbol.is_non_terminal() && grammar.variables[symbol.index].type == VariableTypeAuxiliary) { - bool all_symbols_match = true; - for (const ParseItem &conflicting_item : conflicting_items) { - if (conflicting_item.lhs() != symbol) { - all_symbols_match = false; - break; - } - } - if (all_symbols_match) { - entry.actions.back().repetition = true; - return ""; - } - } - - // 
If the shift action has higher precedence, prefer it over any of the - // reduce actions. - if (shift_precedence.min > reduction_precedence || - (shift_precedence.min == reduction_precedence && - shift_precedence.max > reduction_precedence)) { - entry.actions.assign({entry.actions.back()}); - } - - // If the shift action has lower precedence, prefer the reduce actions. - else if (shift_precedence.max < reduction_precedence || - (shift_precedence.max == reduction_precedence && - shift_precedence.min < reduction_precedence)) { - entry.actions.pop_back(); - for (auto item_iter = conflicting_items.begin(); item_iter != conflicting_items.end();) { - if (item_iter->is_done()) { - ++item_iter; - } else { - item_iter = conflicting_items.erase(item_iter); - } - } - } - - // If the shift action has the same precedence as the reduce actions, - // consider the reduce actions' associativity. If they are all left - // associative, prefer the reduce actions. If they are all right - // associative, prefer the shift. - else if (shift_precedence.min == reduction_precedence && - shift_precedence.max == reduction_precedence) { - considered_associativity = true; - bool has_non_associative_reductions = false; - bool has_left_associative_reductions = false; - bool has_right_associative_reductions = false; - for (const ParseAction &action : entry.actions) { - if (action.type != ParseActionTypeReduce) break; - switch (action.associativity) { - case rules::AssociativityLeft: - has_left_associative_reductions = true; - break; - case rules::AssociativityRight: - has_right_associative_reductions = true; - break; - default: - has_non_associative_reductions = true; - break; - } - } - - if (!has_non_associative_reductions) { - if (has_right_associative_reductions && !has_left_associative_reductions) { - entry.actions.assign({entry.actions.back()}); - } else if (has_left_associative_reductions && !has_right_associative_reductions) { - entry.actions.pop_back(); - } - } - } - } - - if (entry.actions.size() == 1) return ""; - - set actual_conflict; - for (const ParseItem &item : conflicting_items) { - Symbol symbol = item.lhs(); - if (grammar.variables[symbol.index].type == VariableTypeAuxiliary) { - bool found_auxiliary_node_info = false; - for ( - auto iter = auxiliary_node_info_list.rbegin(), - end = auxiliary_node_info_list.rend(); - iter != end; - ++iter - ) { - if (iter->auxiliary_node == symbol) { - found_auxiliary_node_info = true; - actual_conflict.insert(iter->parents.begin(), iter->parents.end()); - break; - } - } - assert(found_auxiliary_node_info); - } else { - actual_conflict.insert(symbol); - } - } - - for (const auto &expected_conflict : grammar.expected_conflicts) { - if (expected_conflict == actual_conflict) return ""; - } - - string description = "Unresolved conflict for symbol sequence:\n\n"; - for (auto &symbol : preceding_symbols) { - description += " " + symbol_name(symbol); - } - - const string dot = "\xE2\x80\xA2"; - const string ellipsis = "\xE2\x80\xA6"; - - description += " " + dot + " " + symbol_name(lookahead) + " " + ellipsis; - description += "\n\n"; - description += "Possible interpretations:\n\n"; - - size_t interpretation_count = 1; - for (const ParseItem &item : conflicting_items) { - description += " " + to_string(interpretation_count++) + ":"; - - for (size_t i = 0; i < preceding_symbols.size() - item.step_index; i++) { - description += " " + symbol_name(preceding_symbols[i]); - } - - description += " (" + symbol_name(item.lhs()); - for (size_t i = 0; i < item.production->size(); i++) { 
- if (i == item.step_index) { - description += " " + dot; - } - description += " " + symbol_name(item.production->at(i).symbol); - } - description += ")"; - - if (item.is_done()) { - description += " " + dot + " " + symbol_name(lookahead) + " " + ellipsis; - } - - description += "\n"; - } - - description += "\nPossible resolutions:\n\n"; - - size_t resolution_count = 1; - if (actual_conflict.size() > 1) { - if (entry.actions.back().type == ParseActionTypeShift) { - description += " " + to_string(resolution_count++) + ": "; - description += "Specify a higher precedence in"; - bool is_first = true; - for (Symbol conflict_symbol : actual_conflict) { - for (const ParseItem &parse_item : conflicting_items) { - if (parse_item.lhs() == conflict_symbol && !parse_item.is_done()) { - if (!is_first) description += " and"; - description += " `" + symbol_name(conflict_symbol) + "`"; - is_first = false; - break; - } - } - } - description += " than in the other rules.\n"; - } - - for (const ParseAction &action : entry.actions) { - if (action.type == ParseActionTypeReduce) { - description += " " + to_string(resolution_count++) + ": "; - description += "Specify a higher precedence in `"; - description += symbol_name(action.symbol); - description += "` than in the other rules.\n"; - } - } - } - - if (considered_associativity) { - description += " " + to_string(resolution_count++) + ": "; - description += "Specify a left or right associativity in"; - bool is_first = true; - for (const ParseAction &action : entry.actions) { - if (action.type == ParseActionTypeReduce) { - if (!is_first) description += " and"; - description += " `" + symbol_name(action.symbol) + "`"; - is_first = false; - } - } - description += "\n"; - } - - description += " " + to_string(resolution_count++) + ": "; - description += "Add a conflict for these rules:"; - for (Symbol conflict_symbol : actual_conflict) { - description += " `" + symbol_name(conflict_symbol) + "`"; - } - description += "\n"; - return description; - } - - void delete_parse_states(const set deleted_states) { - vector new_state_ids(parse_table.states.size()); - size_t deleted_state_count = 0; - auto deleted_state_iter = deleted_states.begin(); - for (ParseStateId i = 0; i < new_state_ids.size(); i++) { - while (deleted_state_iter != deleted_states.end() && *deleted_state_iter < i) { - deleted_state_count++; - deleted_state_iter++; - } - new_state_ids[i] = i - deleted_state_count; - } - - ParseStateId original_state_index = 0; - auto iter = parse_table.states.begin(); - while (iter != parse_table.states.end()) { - if (deleted_states.count(original_state_index)) { - iter = parse_table.states.erase(iter); - } else { - ParseState &state = *iter; - state.each_referenced_state([&new_state_ids](ParseStateId *state_index) { - *state_index = new_state_ids[*state_index]; - }); - ++iter; - } - original_state_index++; - } - } - - string symbol_name(const rules::Symbol &symbol) const { - if (symbol.is_built_in()) { - if (symbol == END_OF_INPUT()) - return "END_OF_INPUT"; - else - return ""; - } - - switch (symbol.type) { - case Symbol::Terminal: { - const LexicalVariable &variable = lexical_grammar.variables[symbol.index]; - if (variable.type == VariableTypeNamed) - return variable.name; - else - return "'" + variable.name + "'"; - } - case Symbol::NonTerminal: { - return grammar.variables[symbol.index].name; - } - case Symbol::External: - default: { - return grammar.external_tokens[symbol.index].name; - } - } - } - - unsigned get_alias_sequence_id(const Production &production) { - 
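
`get_alias_sequence_id` interns the production's alias sequence: it collects each step's alias into a vector sized to the last aliased step, then returns the index of an equal vector already stored in `parse_table.alias_sequences`, appending a new entry only when no match exists. Index 0 always refers to the empty sequence, which `build()` reserves up front.
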
bool has_alias = false; - AliasSequence alias_sequence; - for (unsigned i = 0, n = production.size(); i < n; i++) { - auto &step = production.at(i); - if (!step.alias.value.empty()) { - has_alias = true; - alias_sequence.resize(i + 1); - alias_sequence[i] = step.alias; - } - } - - if (has_alias && production.size() > parse_table.max_alias_sequence_length) { - parse_table.max_alias_sequence_length = production.size(); - } - - auto begin = parse_table.alias_sequences.begin(); - auto end = parse_table.alias_sequences.end(); - auto iter = find(begin, end, alias_sequence); - if (iter != end) { - return iter - begin; - } else { - parse_table.alias_sequences.push_back(move(alias_sequence)); - return parse_table.alias_sequences.size() - 1; - } - } - - SymbolSequence append_symbol(const SymbolSequence &sequence, const Symbol &symbol) { - if (!sequence.empty()) { - const LookaheadSet &left_tokens = item_set_builder.get_last_set(sequence.back()); - const LookaheadSet &right_tokens = item_set_builder.get_first_set(symbol); - - if (!left_tokens.empty() && !right_tokens.empty()) { - left_tokens.for_each([&](Symbol left_symbol) { - if (left_symbol.is_terminal() && !left_symbol.is_built_in()) { - right_tokens.for_each([&](Symbol right_symbol) { - if (right_symbol.is_terminal() && !right_symbol.is_built_in()) { - following_tokens_by_token[left_symbol].insert(right_symbol); - } - return true; - }); - } - return true; - }); - } - } - - SymbolSequence result(sequence.size() + 1); - result.assign(sequence.begin(), sequence.end()); - result.push_back(symbol); - return result; - } -}; - -unique_ptr ParseTableBuilder::create( - const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar, - const std::unordered_map &simple_aliases -) { - return unique_ptr(new ParseTableBuilderImpl( - syntax_grammar, - lexical_grammar, - simple_aliases - )); -} - -ParseTableBuilder::BuildResult ParseTableBuilder::build() { - return static_cast(this)->build(); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_table_builder.h b/src/compiler/build_tables/parse_table_builder.h deleted file mode 100644 index bfc8641f..00000000 --- a/src/compiler/build_tables/parse_table_builder.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_ -#define COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_ - -#include -#include -#include "compiler/parse_table.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { - -struct ParseTable; -struct LexTable; -struct SyntaxGrammar; -struct LexicalGrammar; - -namespace build_tables { - -class ParseTableBuilder { - public: - static std::unique_ptr create( - const SyntaxGrammar &, - const LexicalGrammar &, - const std::unordered_map & - ); - - struct BuildResult { - ParseTable parse_table; - LexTable main_lex_table; - LexTable keyword_lex_table; - rules::Symbol keyword_capture_token; - CompileError error; - }; - - BuildResult build(); - - protected: - ParseTableBuilder() = default; -}; - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_ diff --git a/src/compiler/build_tables/property_table_builder.cc b/src/compiler/build_tables/property_table_builder.cc deleted file mode 100644 index 37aa6b83..00000000 --- a/src/compiler/build_tables/property_table_builder.cc +++ /dev/null @@ -1,447 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "compiler/property_sheet.h" -#include "compiler/property_table.h" -#include 
"compiler/build_tables/property_table_builder.h" -#include "compiler/util/hash_combine.h" - -using std::deque; -using std::vector; -using std::pair; -using std::unordered_map; -using std::set; -using std::move; -using std::map; - -namespace tree_sitter { -namespace build_tables { - -// A position within a selector for a particular rule set. -// For example, in a selector like `a > b`, this might -// describe the state of having descended into an `a`, -// but not a `b`. -struct PropertyItem { - unsigned rule_id; - unsigned selector_id; - unsigned step_id; - - bool operator==(const PropertyItem &other) const { - return - rule_id == other.rule_id && - selector_id == other.selector_id && - step_id == other.step_id; - } - - bool operator<(const PropertyItem &other) const { - if (rule_id < other.rule_id) return true; - if (rule_id > other.rule_id) return false; - if (selector_id < other.selector_id) return true; - if (selector_id > other.selector_id) return false; - return step_id < other.step_id; - } -}; - -// A set of possible positions within different selectors. -// This directly represents a state of the property-matching -// state machine. -struct PropertyItemSet { - set entries; - - bool operator==(const PropertyItemSet &other) const { - return entries == other.entries; - } -}; - -// A set of properties that matched via a certain selector. -// These are ordered according to the usual CSS rules: -// specificity, falling back to the order in the original sheet. -struct PropertySelectorMatch { - unsigned specificity; - unsigned rule_id; - unsigned selector_id; - const PropertySet *property_set; - - bool operator<(const PropertySelectorMatch &other) const { - if (specificity < other.specificity) return true; - if (specificity > other.specificity) return false; - if (rule_id < other.rule_id) return true; - if (rule_id > other.rule_id) return false; - return selector_id < other.selector_id; - } -}; - -struct PropertyTransitionEntry { - PropertyTransition transition; - unsigned latest_matching_rule_id; - - unsigned specificity() const { - return - (transition.index == -1 ? 0 : 1) + - (transition.text_pattern.empty() ? 0 : 1); - } - - // When using the final state machine, the runtime library computes - // a node's property by descending from the root of the syntax - // tree to that node. For each ancestor node on the way, it should - // update its state using the *first* matching entry of the - // `transitions` list. Therefore, the order of the transitions - // must match the normal tie-breaking rules of CSS. - bool operator<(const PropertyTransitionEntry &other) const { - // If two transitions match different node types, they can't - // both match a given node, so their order is arbitrary. - if (transition.type < other.transition.type) return true; - if (transition.type > other.transition.type) return false; - if (transition.named && !other.transition.named) return true; - if (!transition.named && other.transition.named) return false; - - // More specific transitions should be considered before less - // specific ones. - if (specificity() > other.specificity()) return true; - if (specificity() < other.specificity()) return false; - - // If there are two transitions with a specificity tie (e.g. one - // with an `:nth-child` pseudo-class and a one with a `:text` - // pseudo-class), then the one whose matching properties appeared - // later in the cascade should be considered first. 
- return latest_matching_rule_id > other.latest_matching_rule_id; - } -}; - -} // namespace build_tables -} // namespace tree_sitter - -namespace std { - -using tree_sitter::util::hash_combine; - -// PropertyItemSets must be hashed because in the process of building -// the table, we maintain a map of existing property item sets to -// state ids. -template <> -struct hash { - size_t operator()(const tree_sitter::build_tables::PropertyItemSet &item_set) const { - size_t result = 0; - hash_combine(&result, item_set.entries.size()); - for (const auto &item : item_set.entries) { - hash_combine(&result, item.rule_id); - hash_combine(&result, item.selector_id); - hash_combine(&result, item.step_id); - } - return result; - } -}; - -// PropertyTransitions must be hashed because we represent state -// transitions as a map of PropertyTransitions to successor PropertyItemSets. -template <> -struct hash { - size_t operator()(const tree_sitter::PropertyTransition &transition) const { - size_t result = 0; - hash_combine(&result, transition.type); - hash_combine(&result, transition.named); - hash_combine(&result, transition.index); - hash_combine(&result, transition.text_pattern); - hash_combine(&result, transition.state_id); - return result; - } -}; - -// PropertySets must be hashed so that we can use a map to dedup them. -template <> -struct hash { - size_t operator()(const tree_sitter::PropertySet &set) const { - size_t result = 0; - hash_combine(&result, set.size()); - for (const auto &pair : set) { - hash_combine(&result, pair.first); - hash_combine(&result, pair.second); - } - return result; - } -}; - -} // namespace std - -namespace tree_sitter { -namespace build_tables { - -typedef unsigned StateId; -typedef unsigned PropertySetId; - -struct PropertyTableBuilder { - PropertySheet sheet; - PropertyTable result; - unordered_map ids_by_item_set; - unordered_map ids_by_property_set; - deque> item_set_queue; - - PropertyTableBuilder(const PropertySheet &sheet) : sheet(sheet) {} - - PropertyTable build() { - PropertyItemSet start_item_set; - for (unsigned i = 0; i < sheet.size(); i++) { - PropertyRule &rule = sheet[i]; - for (unsigned j = 0; j < rule.selectors.size(); j++) { - start_item_set.entries.insert(PropertyItem {i, j, 0}); - } - } - - add_state(start_item_set); - while (!item_set_queue.empty()) { - auto entry = item_set_queue.front(); - PropertyItemSet item_set = move(entry.first); - StateId state_id = entry.second; - item_set_queue.pop_front(); - populate_state(item_set, state_id); - } - - remove_duplicate_states(); - - return result; - } - - // Different item sets can actually produce the same state, so the - // states need to be explicitly deduped as a post-processing step. 
-  // Different item sets can actually produce the same state, so the
-  // states need to be explicitly deduped as a post-processing step.
-  void remove_duplicate_states() {
-    map<StateId, StateId> replacements;
-
-    while (true) {
-      map<StateId, StateId> duplicates;
-      for (StateId i = 0, size = result.states.size(); i < size; i++) {
-        for (StateId j = 0; j < i; j++) {
-          if (!duplicates.count(j) && result.states[j] == result.states[i]) {
-            duplicates.insert({ i, j });
-            break;
-          }
-        }
-      }
-
-      if (duplicates.empty()) break;
-
-      map<StateId, StateId> new_replacements;
-      for (StateId i = 0, size = result.states.size(); i < size; i++) {
-        StateId new_state_index = i;
-        auto duplicate = duplicates.find(i);
-        if (duplicate != duplicates.end()) {
-          new_state_index = duplicate->second;
-        }
-
-        size_t prior_removed = 0;
-        for (const auto &duplicate : duplicates) {
-          if (duplicate.first >= new_state_index) break;
-          prior_removed++;
-        }
-
-        new_state_index -= prior_removed;
-        new_replacements.insert({i, new_state_index});
-        replacements.insert({ i, new_state_index });
-        for (auto &replacement : replacements) {
-          if (replacement.second == i) {
-            replacement.second = new_state_index;
-          }
-        }
-      }
-
-      for (auto &state : result.states) {
-        for (auto &transition : state.transitions) {
-          auto new_replacement = new_replacements.find(transition.state_id);
-          if (new_replacement != new_replacements.end()) {
-            transition.state_id = new_replacement->second;
-          }
-        }
-
-        auto new_replacement = new_replacements.find(state.default_next_state_id);
-        if (new_replacement != new_replacements.end()) {
-          state.default_next_state_id = new_replacement->second;
-        }
-      }
-
-      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
-        result.states.erase(result.states.begin() + i->first);
-      }
-    }
-  }
-
-  // Get the next part of the selector that needs to be matched for a given item.
-  // This returns null if the item has consumed its entire selector.
-  const PropertySelectorStep *next_step_for_item(const PropertyItem &item) {
-    const PropertySelector &selector = sheet[item.rule_id].selectors[item.selector_id];
-    if (item.step_id < selector.size()) {
-      return &selector[item.step_id];
-    } else {
-      return nullptr;
-    }
-  }
-
-  // Get the previous part of the selector that was matched for a given item.
-  // This returns null if the item has not consumed anything.
-  const PropertySelectorStep *prev_step_for_item(const PropertyItem &item) {
-    if (item.step_id > 0) {
-      return &sheet[item.rule_id].selectors[item.selector_id][item.step_id - 1];
-    } else {
-      return nullptr;
-    }
-  }
-
-  unsigned specificity_for_selector(const PropertySelector &selector) {
-    unsigned result = selector.size();
-    for (const PropertySelectorStep &step : selector) {
-      if (step.index != -1) result++;
-      if (!step.text_pattern.empty()) result++;
-    }
-    return result;
-  }
-
-  // Check if the given state transition matches the given part of a selector.
-  bool step_matches_transition(const PropertySelectorStep &step, const PropertyTransition &transition) {
-    return
-      step.type == transition.type &&
-      step.named == transition.named &&
-      (step.index == transition.index || step.index == -1) &&
-      (step.text_pattern == transition.text_pattern || step.text_pattern.empty());
-  }
-
-  void populate_state(const PropertyItemSet &item_set, StateId state_id) {
-    unordered_map<PropertyTransition, PropertyItemSet> transitions;
-    vector<PropertySelectorMatch> selector_matches;
-
-    for (const PropertyItem &item : item_set.entries) {
-      const PropertySelectorStep *next_step = next_step_for_item(item);
-
-      // If this item has more elements to match for its selector, then
-      // there's a state transition for elements that match the next
-      // part of the selector.
-      if (next_step) {
-        transitions[PropertyTransition{
-          next_step->type,
-          next_step->named,
-          next_step->index,
-          next_step->text_pattern,
-          0
-        }] = PropertyItemSet();
-      }
-
-      // If the item has matched its entire selector, then the property set
-      // for the item's rule applies in this state.
-      else {
-        const PropertyRule &rule = sheet[item.rule_id];
-        selector_matches.push_back(PropertySelectorMatch {
-          specificity_for_selector(rule.selectors[item.selector_id]),
-          item.rule_id,
-          item.selector_id,
-          &rule.properties,
-        });
-      }
-    }
-
-    // For each element that follows an item in this set,
-    // compute the next item set after descending through that element.
-    vector<PropertyTransitionEntry> transition_list;
-    for (auto &pair : transitions) {
-      PropertyTransition transition = pair.first;
-      PropertyItemSet &next_item_set = pair.second;
-      unsigned latest_matching_rule_id = 0;
-
-      for (const PropertyItem &item : item_set.entries) {
-        const PropertySelectorStep *next_step = next_step_for_item(item);
-        const PropertySelectorStep *prev_step = prev_step_for_item(item);
-        if (next_step) {
-
-          // If the element matches the next part of the item, advance the
-          // item to the next part of its selector.
-          if (step_matches_transition(*next_step, transition)) {
-            PropertyItem next_item = item;
-            next_item.step_id++;
-            next_item_set.entries.insert(next_item);
-
-            // If the item is at the end of its selector, record its rule id
-            // so that it can be used when sorting the transitions.
-            if (!next_step_for_item(next_item) && next_item.rule_id > latest_matching_rule_id) {
-              latest_matching_rule_id = item.rule_id;
-            }
-          }
-
-          // If the element does not match, and the item is in the middle
-          // of an immediate child selector, then remove it from the
-          // next item set. Otherwise, keep it unchanged.
-          if (!prev_step || !prev_step->is_immediate) {
-            next_item_set.entries.insert(item);
-          }
-        }
-      }
-
-      transition.state_id = add_state(next_item_set);
-      transition_list.push_back(PropertyTransitionEntry {transition, latest_matching_rule_id});
-    }
-
-    std::sort(transition_list.begin(), transition_list.end());
-    for (auto &entry : transition_list) {
-      result.states[state_id].transitions.push_back(entry.transition);
-    }
-
-    // Compute the default successor item set - the item set that
-    // we should advance to if the next element doesn't match any
-    // of the next elements in the item set's selectors.
-    PropertyItemSet default_next_item_set;
-    for (const PropertyItem &item : item_set.entries) {
-      const PropertySelectorStep *next_step = next_step_for_item(item);
-      const PropertySelectorStep *prev_step = prev_step_for_item(item);
-      if (next_step && (!prev_step || !prev_step->is_immediate)) {
-        default_next_item_set.entries.insert(item);
-      }
-    }
-
-    StateId default_next_state_id = add_state(default_next_item_set);
-    result.states[state_id].default_next_state_id = default_next_state_id;
-
-    // Sort the matching property sets by ascending specificity and by
-    // their order in the sheet. This way, more specific selectors and later
-    // rules will override less specific selectors and earlier rules.
-    PropertySet properties;
-    std::sort(selector_matches.begin(), selector_matches.end());
-    for (auto &match : selector_matches) {
-      for (auto &pair : *match.property_set) {
-        properties[pair.first] = pair.second;
-      }
-    }
-
-    // Add the final property set to the deduped list.
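The `add_property_set` call just below (like `add_state` above it) interns its argument: the value is stored once, gets a dense id, and a repeated value hands back the existing id. The same pattern, written generically with a hypothetical `Interner` helper (the real builder keeps separate `ids_by_item_set` and `ids_by_property_set` maps by hand, relying on the `std::hash` specializations defined earlier in this file):

    #include <unordered_map>
    #include <vector>

    // Generic interning sketch: T must be equality-comparable and hashable.
    template <typename T>
    struct Interner {
      std::vector<T> values;
      std::unordered_map<T, unsigned> ids;

      unsigned intern(const T &value) {
        auto entry = ids.find(value);
        if (entry != ids.end()) return entry->second;  // seen before
        unsigned id = values.size();
        ids.emplace(value, id);
        values.push_back(value);
        return id;
      }
    };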
- result.states[state_id].property_set_id = add_property_set(properties); - } - - StateId add_state(const PropertyItemSet &item_set) { - auto entry = ids_by_item_set.find(item_set); - if (entry == ids_by_item_set.end()) { - StateId id = result.states.size(); - ids_by_item_set[item_set] = id; - result.states.push_back(PropertyState {}); - item_set_queue.push_back({item_set, id}); - return id; - } else { - return entry->second; - } - } - - PropertySetId add_property_set(const PropertySet &property_set) { - auto entry = ids_by_property_set.find(property_set); - if (entry == ids_by_property_set.end()) { - PropertySetId id = result.property_sets.size(); - ids_by_property_set[property_set] = id; - result.property_sets.push_back(property_set); - return id; - } else { - return entry->second; - } - } -}; - -PropertyTable build_property_table(const PropertySheet &sheet) { - return PropertyTableBuilder(sheet).build(); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/property_table_builder.h b/src/compiler/build_tables/property_table_builder.h deleted file mode 100644 index 25b94ce7..00000000 --- a/src/compiler/build_tables/property_table_builder.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_ -#define COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_ - -#include -#include "compiler/property_table.h" - -namespace tree_sitter { -namespace build_tables { - -PropertyTable build_property_table(const PropertySheet &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_ diff --git a/src/compiler/build_tables/rule_can_be_blank.cc b/src/compiler/build_tables/rule_can_be_blank.cc deleted file mode 100644 index 97737fd3..00000000 --- a/src/compiler/build_tables/rule_can_be_blank.cc +++ /dev/null @@ -1,43 +0,0 @@ -#include "compiler/build_tables/rule_can_be_blank.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace build_tables { - -bool rule_can_be_blank(const rules::Rule &rule) { - return rule.match( - [](rules::Blank) { - return true; - }, - - [](rules::CharacterSet) { - return false; - }, - - [](rules::Repeat repeat) { - return rule_can_be_blank(*repeat.rule); - }, - - [](rules::Metadata metadata) { - return rule_can_be_blank(*metadata.rule); - }, - - [](rules::Choice choice) { - for (const auto &element : choice.elements) { - if (rule_can_be_blank(element)) { - return true; - } - } - return false; - }, - - [](rules::Seq seq) { - return rule_can_be_blank(*seq.left) && rule_can_be_blank(*seq.right); - }, - - [](auto) { return false; } - ); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/rule_can_be_blank.h b/src/compiler/build_tables/rule_can_be_blank.h deleted file mode 100644 index 768dc6df..00000000 --- a/src/compiler/build_tables/rule_can_be_blank.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_RULE_CAN_BE_BLANK_H_ -#define COMPILER_BUILD_TABLES_RULE_CAN_BE_BLANK_H_ - -#include "compiler/rule.h" - -namespace tree_sitter { -namespace build_tables { - -bool rule_can_be_blank(const rules::Rule &rule); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_RULE_CAN_BE_BLANK_H_ diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc deleted file mode 100644 index 4a9bd663..00000000 --- a/src/compiler/compile.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include "tree_sitter/compiler.h" -#include 
"compiler/prepare_grammar/prepare_grammar.h" -#include "compiler/build_tables/parse_table_builder.h" -#include "compiler/build_tables/property_table_builder.h" -#include "compiler/generate_code/c_code.h" -#include "compiler/generate_code/property_table_json.h" -#include "compiler/syntax_grammar.h" -#include "compiler/log.h" -#include "compiler/lexical_grammar.h" -#include "compiler/parse_json.h" -#include "json.h" - -namespace tree_sitter { - -using std::move; -using std::pair; -using std::string; -using std::vector; -using std::get; -using std::make_tuple; - -extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file) { - set_log_file(log_file); - - ParseGrammarResult parse_result = parse_grammar_json(string(input)); - if (!parse_result.error_message.empty()) { - return {nullptr, strdup(parse_result.error_message.c_str()), TSCompileErrorTypeInvalidGrammar}; - } - - auto prepare_grammar_result = prepare_grammar::prepare_grammar(parse_result.grammar); - SyntaxGrammar &syntax_grammar = prepare_grammar_result.syntax_grammar; - LexicalGrammar &lexical_grammar = prepare_grammar_result.lexical_grammar; - auto &simple_aliases = prepare_grammar_result.simple_aliases; - CompileError error = prepare_grammar_result.error; - if (error.type) { - return {nullptr, strdup(error.message.c_str()), error.type}; - } - - auto builder = build_tables::ParseTableBuilder::create( - syntax_grammar, - lexical_grammar, - simple_aliases - ); - auto build_tables_result = builder->build(); - error = build_tables_result.error; - if (error.type != 0) { - return {nullptr, strdup(error.message.c_str()), error.type}; - } - - string code = generate_code::c_code( - parse_result.name, - move(build_tables_result.parse_table), - move(build_tables_result.main_lex_table), - move(build_tables_result.keyword_lex_table), - build_tables_result.keyword_capture_token, - move(syntax_grammar), - move(lexical_grammar), - move(simple_aliases) - ); - - set_log_file(nullptr); - return {strdup(code.c_str()), nullptr, TSCompileErrorTypeNone}; -} - -extern "C" TSCompileResult ts_compile_property_sheet(const char *input, FILE *log_file) { - set_log_file(log_file); - auto parse_result = parse_property_sheet_json(string(input)); - if (!parse_result.ok()) { - return {nullptr, strdup(parse_result.error.c_str()), TSCompileErrorTypeInvalidGrammar}; - } - PropertyTable table = build_tables::build_property_table(parse_result.value); - string code = generate_code::property_table_json(table); - return {strdup(code.c_str()), nullptr, TSCompileErrorTypeNone}; -} - -} // namespace tree_sitter diff --git a/src/compiler/compile_error.h b/src/compiler/compile_error.h deleted file mode 100644 index 9797a459..00000000 --- a/src/compiler/compile_error.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef COMPILER_COMPILE_ERROR_H_ -#define COMPILER_COMPILE_ERROR_H_ - -#include -#include "tree_sitter/compiler.h" - -namespace tree_sitter { - -class CompileError { - public: - CompileError() : type(TSCompileErrorTypeNone) {} - - CompileError(TSCompileErrorType type, std::string message) - : type(type), message(message) {} - - static CompileError none() { - return CompileError(TSCompileErrorTypeNone, ""); - } - - operator bool() const { - return type != TSCompileErrorTypeNone; - } - - bool operator==(const CompileError &other) const { - return type == other.type && message == other.message; - } - - TSCompileErrorType type; - std::string message; -}; - -} // namespace tree_sitter - -#endif // COMPILER_COMPILE_ERROR_H_ diff --git 
a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc deleted file mode 100644 index 14250037..00000000 --- a/src/compiler/generate_code/c_code.cc +++ /dev/null @@ -1,926 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "compiler/generate_code/c_code.h" -#include "compiler/lex_table.h" -#include "compiler/parse_table.h" -#include "compiler/syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/rule.h" -#include "compiler/util/string_helpers.h" -#include "tree_sitter/runtime.h" - -namespace tree_sitter { -namespace generate_code { - -using std::function; -using std::map; -using std::move; -using std::pair; -using std::set; -using std::string; -using std::to_string; -using std::unordered_map; -using std::unordered_set; -using std::vector; -using util::escape_char; -using rules::Symbol; -using rules::Alias; - -static const map REPLACEMENTS({ - { '~', "TILDE" }, - { '`', "BQUOTE" }, - { '!', "BANG" }, - { '@', "AT" }, - { '#', "POUND" }, - { '$', "DOLLAR" }, - { '%', "PERCENT" }, - { '^', "CARET" }, - { '&', "AMP" }, - { '*', "STAR" }, - { '(', "LPAREN" }, - { ')', "RPAREN" }, - { '-', "DASH" }, - { '+', "PLUS" }, - { '=', "EQ" }, - { '{', "LBRACE" }, - { '}', "RBRACE" }, - { '[', "LBRACK" }, - { ']', "RBRACK" }, - { '\\', "BSLASH" }, - { '|', "PIPE" }, - { ':', "COLON" }, - { ';', "SEMI" }, - { '"', "DQUOTE" }, - { '\'', "SQUOTE" }, - { '<', "LT" }, - { '>', "GT" }, - { ',', "COMMA" }, - { '.', "DOT" }, - { '?', "QMARK" }, - { '/', "SLASH" }, - { '\n', "LF" }, - { '\r', "CR" }, - { '\t', "TAB" }, -}); - -class CCodeGenerator { - string buffer; - size_t indent_level; - - const string name; - const ParseTable parse_table; - const LexTable main_lex_table; - const LexTable keyword_lex_table; - Symbol keyword_capture_token; - const SyntaxGrammar syntax_grammar; - const LexicalGrammar lexical_grammar; - unordered_map simple_aliases; - map symbol_ids; - vector> parse_table_entries; - vector> external_scanner_states; - size_t next_parse_action_list_index; - set unique_aliases; - - public: - CCodeGenerator( - string name, ParseTable &&parse_table, LexTable &&main_lex_table, - LexTable &&keyword_lex_table, Symbol keyword_capture_token, - SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar, - unordered_map &&simple_aliases - ) : indent_level(0), - name(name), - parse_table(move(parse_table)), - main_lex_table(move(main_lex_table)), - keyword_lex_table(move(keyword_lex_table)), - keyword_capture_token(keyword_capture_token), - syntax_grammar(move(syntax_grammar)), - lexical_grammar(move(lexical_grammar)), - simple_aliases(move(simple_aliases)), - next_parse_action_list_index(0) {} - - string code() { - buffer = ""; - - add_includes(); - add_pragmas(); - add_stats(); - add_symbol_enum(); - add_symbol_names_list(); - add_symbol_metadata_list(); - - if (parse_table.alias_sequences.size() > 1) { - add_alias_sequences(); - } - - add_lex_function("ts_lex", main_lex_table); - - if (keyword_capture_token != rules::NONE()) { - add_lex_function("ts_lex_keywords", keyword_lex_table); - } - - add_lex_modes_list(); - - if (!syntax_grammar.external_tokens.empty()) { - add_external_token_enum(); - add_external_scanner_symbol_map(); - add_external_scanner_states_list(); - } - - add_parse_table(); - add_parser_export(); - - return buffer; - } - - private: - void add_includes() { - add("#include "); - line(); - } - - void add_pragmas() { - line("#if defined(__GNUC__) || defined(__clang__)"); - line("#pragma GCC diagnostic push"); - 
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); - line("#endif"); - line(); - - // Compiling large lexer functions can be very slow, especially when - // using Visual Studio on Windows. Disabling optimizations is not - // ideal, but only a very small fraction of overall parse time is - // spent lexing, so the performance impact of this is pretty small. - if (main_lex_table.states.size() > 500) { - line("#ifdef _MSC_VER"); - line("#pragma optimize(\"\", off)"); - line("#endif"); - line(); - } - } - - void add_stats() { - size_t token_count = 0; - for (const Symbol &symbol : parse_table.symbols) { - if (symbol.is_terminal()) { - token_count++; - } else if (symbol.is_external()) { - const ExternalToken &external_token = syntax_grammar.external_tokens[symbol.index]; - if (external_token.corresponding_internal_token == rules::NONE()) { - token_count++; - } - } - } - - for (const AliasSequence &alias_sequence : parse_table.alias_sequences) { - for (const Alias &alias : alias_sequence) { - if (!alias.value.empty()) { - unique_aliases.insert(alias); - } - } - } - - unordered_set symbol_id_values; - symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end"; - - for (const Symbol &symbol : parse_table.symbols) { - if (!symbol.is_built_in()) { - assign_symbol_id(symbol, &symbol_id_values); - } - } - - for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { - Symbol symbol = Symbol::external(i); - if (!symbol_ids.count(symbol)) { - const ExternalToken &external_token = syntax_grammar.external_tokens[i]; - if (external_token.corresponding_internal_token == rules::NONE()) { - assign_symbol_id(Symbol::external(i), &symbol_id_values); - } else { - symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token]; - } - } - } - - line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION)); - line("#define STATE_COUNT " + to_string(parse_table.states.size())); - line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define ALIAS_COUNT " + to_string(unique_aliases.size())); - line("#define TOKEN_COUNT " + to_string(token_count)); - line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); - line("#define MAX_ALIAS_SEQUENCE_LENGTH " + to_string(parse_table.max_alias_sequence_length)); - line(); - } - - void assign_symbol_id(const Symbol &symbol, unordered_set *symbol_id_values) { - auto entry = entry_for_symbol(symbol); - - string symbol_id; - switch (entry.second) { - case VariableTypeAuxiliary: - symbol_id = "aux_sym_" + sanitize_name(entry.first); - break; - case VariableTypeAnonymous: - symbol_id = "anon_sym_" + sanitize_name(entry.first); - break; - default: - symbol_id = "sym_" + sanitize_name(entry.first); - break; - } - - unsigned suffix_number = 1; - string unique_symbol_id = symbol_id; - while (symbol_id_values->count(unique_symbol_id)) { - suffix_number++; - unique_symbol_id = symbol_id + to_string(suffix_number); - } - - symbol_id_values->insert(unique_symbol_id); - symbol_ids[symbol] = unique_symbol_id; - } - - void add_symbol_enum() { - line("enum {"); - indent([&]() { - size_t i = 1; - for (const Symbol &symbol : parse_table.symbols) { - if (!symbol.is_built_in()) { - line(symbol_id(symbol) + " = " + to_string(i) + ","); - i++; - } - } - - for (const Alias &alias : unique_aliases) { - line(alias_id(alias) + " = " + to_string(i) + ","); - i++; - } - }); - line("};"); - line(); - } - - void add_symbol_names_list() { - line("static const char *ts_symbol_names[] = 
{"); - indent([&]() { - for (const Symbol &symbol : parse_table.symbols) { - line( - "[" + symbol_id(symbol) + "] = \"" + - sanitize_name_for_string(symbol_name(symbol)) + "\"," - ); - } - - for (const Alias &alias : unique_aliases) { - line( - "[" + alias_id(alias) + "] = \"" + - sanitize_name_for_string(alias.value) + "\"," - ); - } - }); - line("};"); - line(); - } - - void add_alias_sequences() { - line( - "static TSSymbol ts_alias_sequences[" + - to_string(parse_table.alias_sequences.size()) + - "][MAX_ALIAS_SEQUENCE_LENGTH] = {" - ); - - indent([&]() { - for (unsigned i = 1, n = parse_table.alias_sequences.size(); i < n; i++) { - const AliasSequence &sequence = parse_table.alias_sequences[i]; - line("[" + to_string(i) + "] = {"); - indent([&]() { - for (unsigned j = 0, n = sequence.size(); j < n; j++) { - if (!sequence[j].value.empty()) { - line("[" + to_string(j) + "] = " + alias_id(sequence[j]) + ","); - } - } - }); - line("},"); - } - }); - line("};"); - line(); - } - - void add_symbol_metadata_list() { - line("static const TSSymbolMetadata ts_symbol_metadata[] = {"); - indent([&]() { - for (const Symbol &symbol : parse_table.symbols) { - line("[" + symbol_id(symbol) + "] = {"); - indent([&]() { - switch (symbol_type(symbol)) { - case VariableTypeNamed: - line(".visible = true,"); - line(".named = true,"); - break; - case VariableTypeAnonymous: - line(".visible = true,"); - line(".named = false,"); - break; - case VariableTypeHidden: - line(".visible = false,"); - line(".named = true,"); - break; - case VariableTypeAuxiliary: - line(".visible = false,"); - line(".named = false,"); - break; - } - }); - - line("},"); - } - - for (const Alias &alias : unique_aliases) { - line("[" + alias_id(alias) + "] = {"); - indent([&]() { - line(".visible = true,"); - line(".named = " + _boolean(alias.is_named) + ","); - }); - line("},"); - } - }); - line("};"); - line(); - } - - void add_lex_function(string name, const LexTable &lex_table) { - line("static bool " + name + "(TSLexer *lexer, TSStateId state) {"); - indent([&]() { - line("START_LEXER();"); - _switch("state", [&]() { - size_t i = 0; - for (const LexState &state : lex_table.states) { - _case(to_string(i++), [&]() { add_lex_state(state); }); - } - _default([&]() { line("return false;"); }); - }); - }); - line("}"); - line(); - } - - void add_lex_modes_list() { - add_external_scanner_state({}); - - map external_tokens_by_corresponding_internal_token; - for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { - const ExternalToken &external_token = syntax_grammar.external_tokens[j]; - if (external_token.corresponding_internal_token.index == Symbol::Index(i)) { - external_tokens_by_corresponding_internal_token.insert({i, j}); - break; - } - } - } - - line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); - indent([&]() { - size_t state_id = 0; - - for (const auto &state : parse_table.states) { - line("[" + to_string(state_id++) + "] = {.lex_state = "); - add(to_string(state.lex_state_id)); - - set external_token_indices; - for (const auto &pair : state.terminal_entries) { - Symbol symbol = pair.first; - if (symbol.is_external()) { - external_token_indices.insert(symbol.index); - } else if (symbol.is_terminal()) { - auto corresponding_external_token = - external_tokens_by_corresponding_internal_token.find(symbol.index); - if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) { - 
external_token_indices.insert(corresponding_external_token->second); - } - } - } - - if (!external_token_indices.empty()) { - add(", .external_lex_state = " + add_external_scanner_state(external_token_indices)); - } - - add("},"); - } - }); - line("};"); - line(); - } - - string add_external_scanner_state(set external_token_ids) { - for (size_t i = 0, n = external_scanner_states.size(); i < n; i++) - if (external_scanner_states[i] == external_token_ids) - return to_string(i); - external_scanner_states.push_back(external_token_ids); - return to_string(external_scanner_states.size() - 1); - } - - void add_external_token_enum() { - line("enum {"); - indent([&]() { - for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) - line(external_token_id(i) + ","); - }); - line("};"); - line(); - } - - void add_external_scanner_symbol_map() { - line("static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {"); - indent([&]() { - for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { - line("[" + external_token_id(i) + "] = " + symbol_id(Symbol::external(i)) + ","); - } - }); - line("};"); - line(); - } - - void add_external_scanner_states_list() { - line("static bool ts_external_scanner_states["); - add(to_string(external_scanner_states.size())); - add("][EXTERNAL_TOKEN_COUNT] = {"); - indent([&]() { - size_t i = 0; - for (const auto &valid_external_lookaheads : external_scanner_states) { - if (!valid_external_lookaheads.empty()) { - line("[" + to_string(i) + "] = {"); - indent([&]() { - for (Symbol::Index id : valid_external_lookaheads) { - line("[" + external_token_id(id) + "] = true,"); - } - }); - line("},"); - } - i++; - } - }); - line("};"); - line(); - } - - void add_parse_table() { - add_parse_action_list_id(ParseTableEntry{ {}, false }); - - size_t state_id = 0; - line("static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {"); - - indent([&]() { - for (const auto &state : parse_table.states) { - line("[" + to_string(state_id++) + "] = {"); - indent([&]() { - for (const auto &entry : state.nonterminal_entries) { - line("[" + symbol_id(Symbol::non_terminal(entry.first)) + "] = STATE("); - add(to_string(entry.second)); - add("),"); - } - for (const auto &entry : state.terminal_entries) { - line("[" + symbol_id(entry.first) + "] = ACTIONS("); - add(to_string(add_parse_action_list_id(entry.second))); - add("),"); - } - }); - line("},"); - } - }); - - line("};"); - line(); - add_parse_action_list(); - line(); - } - - void add_parser_export() { - string language_function_name = "tree_sitter_" + name; - string external_scanner_name = language_function_name + "_external_scanner"; - - if (!syntax_grammar.external_tokens.empty()) { - line("void *" + external_scanner_name + "_create();"); - line("void " + external_scanner_name + "_destroy(void *);"); - line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); - line("unsigned " + external_scanner_name + "_serialize(void *, char *);"); - line("void " + external_scanner_name + "_deserialize(void *, const char *, unsigned);"); - line(); - } - - line("#ifdef _WIN32"); - line("#define extern __declspec(dllexport)"); - line("#endif"); - line(); - - line("extern const TSLanguage *" + language_function_name + "() {"); - indent([&]() { - line("static TSLanguage language = {"); - indent([&]() { - line(".version = LANGUAGE_VERSION,"); - line(".symbol_count = SYMBOL_COUNT,"); - line(".alias_count = ALIAS_COUNT,"); - line(".token_count = TOKEN_COUNT,"); - line(".symbol_metadata = 
ts_symbol_metadata,"); - line(".parse_table = (const unsigned short *)ts_parse_table,"); - line(".parse_actions = ts_parse_actions,"); - line(".lex_modes = ts_lex_modes,"); - line(".symbol_names = ts_symbol_names,"); - - if (parse_table.alias_sequences.size() > 1) { - line(".alias_sequences = (const TSSymbol *)ts_alias_sequences,"); - } - - line(".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH,"); - line(".lex_fn = ts_lex,"); - - if (keyword_capture_token != rules::NONE()) { - line(".keyword_lex_fn = ts_lex_keywords,"); - line(".keyword_capture_token = " + symbol_id(keyword_capture_token) + ","); - } - - line(".external_token_count = EXTERNAL_TOKEN_COUNT,"); - - if (!syntax_grammar.external_tokens.empty()) { - line(".external_scanner = {"); - indent([&]() { - line("(const bool *)ts_external_scanner_states,"); - line("ts_external_scanner_symbol_map,"); - line(external_scanner_name + "_create,"); - line(external_scanner_name + "_destroy,"); - line(external_scanner_name + "_scan,"); - line(external_scanner_name + "_serialize,"); - line(external_scanner_name + "_deserialize,"); - }); - line("},"); - } - }); - - line("};"); - line("return &language;"); - }); - line("}"); - line(); - } - - void add_lex_state(const LexState &lex_state) { - if (lex_state.accept_action.is_present()) { - add_accept_token_action(lex_state.accept_action); - } - - set ruled_out_characters; - for (const auto &pair : lex_state.advance_actions) { - if (pair.first.is_empty()) continue; - - size_t current_length = buffer.size(); - - line("if ("); - if (add_character_set_condition(pair.first, ruled_out_characters)) { - add(")"); - indent([&]() { add_advance_action(pair.second); }); - ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end()); - } else { - buffer.resize(current_length); - add_advance_action(pair.second); - } - } - - line("END_STATE();"); - } - - bool add_character_set_condition(const rules::CharacterSet &rule, const set &ruled_out_characters) { - if (rule.includes_all) { - return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true); - } else { - return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false); - } - } - - bool add_character_range_conditions(const vector &ranges, - const set &ruled_out_characters, - bool is_negated) { - bool first = true; - for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) { - auto range = *iter; - - bool range_is_ruled_out = true; - for (uint32_t c = range.min; c <= range.max; c++) { - if (!ruled_out_characters.count(c)) { - range_is_ruled_out = false; - break; - } - } - - if (range_is_ruled_out) { - ++iter; - continue; - } - - auto next_iter = iter + 1; - while (next_iter != end) { - bool can_join_ranges = true; - for (uint32_t character = range.max + 1; character < next_iter->min; character++) { - if (!ruled_out_characters.count(character)) { - can_join_ranges = false; - break; - } - } - - if (can_join_ranges) { - range.max = next_iter->max; - ++next_iter; - } else { - break; - } - } - - if (!first) { - add(is_negated ? 
" &&" : " ||"); - line(" "); - } - - add_character_range_condition(range, is_negated); - first = false; - iter = next_iter; - } - - return !first; - } - - void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) { - auto min = escape_char(range.min); - auto max = escape_char(range.max); - if (is_negated) { - if (range.max == range.min) { - add("lookahead != " + min); - } else if (range.max == range.min + 1) { - add("lookahead != " + min + " &&"); - line(" lookahead != " + max); - } else { - add("(lookahead < " + min + " || lookahead > " + max + ")"); - } - } else { - if (range.max == range.min) { - add("lookahead == " + min); - } else if (range.max == range.min + 1) { - add("lookahead == " + min + " ||"); - line(" lookahead == " + max); - } else { - add("(" + min + " <= lookahead && lookahead <= " + max + ")"); - } - } - } - - void add_advance_action(const AdvanceAction &action) { - if (action.in_main_token) { - line("ADVANCE(" + to_string(action.state_index) + ");"); - } else { - line("SKIP(" + to_string(action.state_index) + ");"); - } - } - - void add_accept_token_action(const AcceptTokenAction &action) { - line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");"); - } - - void add_parse_action_list() { - line("static TSParseActionEntry ts_parse_actions[] = {"); - - indent([&]() { - for (const auto &pair : parse_table_entries) { - size_t index = pair.first; - line( - "[" + to_string(index) + "] = {" - ".count = " + to_string(pair.second.actions.size()) + ", " - ".reusable = " + _boolean(pair.second.reusable) + - "}," - ); - - for (const ParseAction &action : pair.second.actions) { - add(" "); - switch (action.type) { - case ParseActionTypeError: - break; - case ParseActionTypeAccept: - add("ACCEPT_INPUT()"); - break; - case ParseActionTypeShift: - if (action.extra) { - add("SHIFT_EXTRA()"); - } else if (action.repetition) { - add("SHIFT_REPEAT(" + to_string(action.state_index) + ")"); - } else { - add("SHIFT(" + to_string(action.state_index) + ")"); - } - break; - case ParseActionTypeReduce: - add("REDUCE("); - add(symbol_id(action.symbol)); - add(", "); - add(to_string(action.consumed_symbol_count)); - - if (action.dynamic_precedence != 0) { - add(", .dynamic_precedence = " + to_string(action.dynamic_precedence)); - } - - if (action.alias_sequence_id != 0) { - add(", .alias_sequence_id = " + to_string(action.alias_sequence_id)); - } - - add(")"); - break; - case ParseActionTypeRecover: - add("RECOVER()"); - break; - default: {} - } - add(","); - } - } - }); - - line("};"); - } - - size_t add_parse_action_list_id(const ParseTableEntry &entry) { - for (const auto &pair : parse_table_entries) { - if (pair.second == entry) { - return pair.first; - } - } - - size_t result = next_parse_action_list_index; - parse_table_entries.push_back({ next_parse_action_list_index, entry }); - next_parse_action_list_index += 1 + entry.actions.size(); - return result; - } - - // Helper functions - - string external_token_id(Symbol::Index index) { - return "ts_external_token_" + sanitize_name(syntax_grammar.external_tokens[index].name); - } - - string symbol_id(const Symbol &symbol) { - return symbol_ids[symbol]; - } - - string alias_id(const Alias &alias) { - if (alias.is_named) { - return "alias_sym_" + sanitize_name(alias.value); - } else { - return "anon_alias_sym_" + sanitize_name(alias.value); - } - } - - string symbol_name(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) { - return "END"; - } - - auto simple_alias_entry = simple_aliases.find(symbol); - if 
(simple_alias_entry != simple_aliases.end()) { - return simple_alias_entry->second.value; - } - - return entry_for_symbol(symbol).first; - } - - VariableType symbol_type(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) { - return VariableTypeHidden; - } - - auto simple_alias_entry = simple_aliases.find(symbol); - if (simple_alias_entry != simple_aliases.end()) { - return simple_alias_entry->second.is_named ? VariableTypeNamed : VariableTypeAnonymous; - } - - return entry_for_symbol(symbol).second; - } - - pair entry_for_symbol(const Symbol &symbol) { - switch (symbol.type) { - case Symbol::NonTerminal: { - const SyntaxVariable &variable = syntax_grammar.variables[symbol.index]; - return { variable.name, variable.type }; - } - case Symbol::Terminal: { - const LexicalVariable &variable = lexical_grammar.variables[symbol.index]; - return { variable.name, variable.type }; - } - case Symbol::External: - default: { - const ExternalToken &token = syntax_grammar.external_tokens[symbol.index]; - return { token.name, token.type }; - } - } - } - - // C-code generation functions - - void _switch(string condition, function body) { - line("switch (" + condition + ") {"); - indent(body); - line("}"); - } - - void _case(string value, function body) { - line("case " + value + ":"); - indent(body); - } - - void _default(function body) { - line("default:"); - indent(body); - } - - string sanitize_name_for_string(string name) { - util::str_replace(&name, "\\", "\\\\"); - util::str_replace(&name, "\n", "\\n"); - util::str_replace(&name, "\r", "\\r"); - util::str_replace(&name, "\"", "\\\""); - return name; - } - - string sanitize_name(const string &name) { - string result; - for (char c : name) { - if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || - ('0' <= c && c <= '9') || (c == '_')) { - result += c; - } else { - auto replacement = REPLACEMENTS.find(c); - size_t i = result.size(); - if (replacement != REPLACEMENTS.end()) { - if (i > 0 && result[i - 1] != '_') - result += "_"; - result += replacement->second; - } - } - } - return result; - } - - string _boolean(bool value) { - return value ? 
"true" : "false"; - } - - bool has_sanitized_name(const Symbol &symbol, string name) { - for (const auto &pair : symbol_ids) { - if (pair.second == name) { - return true; - } - } - return false; - } - - // General code generation functions - - void line() { - line(""); - } - - void line(string input) { - add("\n"); - if (!input.empty()) { - add_padding(); - add(input); - } - } - - void add_padding() { - for (size_t i = 0; i < indent_level; i++) - add(" "); - } - - void indent(function body) { - indent_level++; - body(); - indent_level--; - } - - void add(string input) { - buffer += input; - } -}; - -string c_code( - string name, ParseTable &&parse_table, LexTable &&lex_table, - LexTable &&keyword_lex_table, Symbol keyword_capture_token, - SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar, - unordered_map &&simple_aliases -) { - return CCodeGenerator( - name, - move(parse_table), - move(lex_table), - move(keyword_lex_table), - keyword_capture_token, - move(syntax_grammar), - move(lexical_grammar), - move(simple_aliases) - ).code(); -} - -} // namespace generate_code -} // namespace tree_sitter diff --git a/src/compiler/generate_code/c_code.h b/src/compiler/generate_code/c_code.h deleted file mode 100644 index a7ce3c7f..00000000 --- a/src/compiler/generate_code/c_code.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef COMPILER_GENERATE_CODE_C_CODE_H_ -#define COMPILER_GENERATE_CODE_C_CODE_H_ - -#include -#include -#include "compiler/rule.h" - -namespace tree_sitter { - -struct LexicalGrammar; -struct SyntaxGrammar; -struct LexTable; -struct ParseTable; - -namespace generate_code { - -std::string c_code( - std::string, - ParseTable &&, - LexTable &&, - LexTable &&, - rules::Symbol, - SyntaxGrammar &&, - LexicalGrammar &&, - std::unordered_map && -); - -} // namespace generate_code -} // namespace tree_sitter - -#endif // COMPILER_GENERATE_CODE_C_CODE_H_ diff --git a/src/compiler/generate_code/property_table_json.cc b/src/compiler/generate_code/property_table_json.cc deleted file mode 100644 index 45663f99..00000000 --- a/src/compiler/generate_code/property_table_json.cc +++ /dev/null @@ -1,115 +0,0 @@ -#include "compiler/generate_code/property_table_json.h" -#include -#include - -using std::string; -using std::to_string; -using std::vector; - -namespace tree_sitter { -namespace generate_code { - -class CodeGenerator { - string buffer; - - public: - string generate(const PropertyTable &table) { - add("{"); - add("\"states\":"); - add("["); - for (unsigned i = 0; i < table.states.size(); i++) { - const PropertyState &state = table.states[i]; - if (i != 0) add(","); - add_state(i, state); - } - add("],"); - add("\"property_sets\":"); - add("["); - bool first = true; - for (const PropertySet &property_set : table.property_sets) { - if (!first) add(","); - first = false; - add_property_set(property_set); - } - add("]"); - add("}"); - return buffer; - } - - private: - void add_state(unsigned i, const PropertyState &state) { - add("{"); - add("\"id\":"); - add(to_string(i)); - add(",\"property_set_id\":"); - add(to_string(state.property_set_id)); - add(","); - add("\"transitions\":["); - bool first = true; - for (const auto &transition : state.transitions) { - if (!first) add(","); - first = false; - add_transition(transition); - } - add("],"); - add("\"default_next_state_id\":"); - add(to_string(state.default_next_state_id)); - add("}"); - } - - void add_property_set(const PropertySet &property_set) { - add("{"); - bool first = true; - for (const auto &pair : property_set) { - if (!first) 
add(","); - first = false; - add_string(pair.first); - add(":"); - add_string(pair.second); - } - add("}"); - } - - void add_transition(const PropertyTransition &transition) { - add("{"); - add("\"type\":"); - add_string(transition.type); - add(",\"named\":"); - add(transition.named ? "true" : "false"); - if (transition.index != -1) { - add(",\"index\":"); - add(to_string(transition.index)); - } - if (!transition.text_pattern.empty()) { - add(",\"text\":"); - add_string(transition.text_pattern); - } - add(",\"state_id\": "); - add(to_string(transition.state_id)); - add("}"); - } - - void add_string(const string &s) { - add("\""); - for (const char c : s) { - if (c == '"' || c == '\\') add("\\"); - add(c); - } - add("\""); - } - - void add(string input) { - buffer += input; - } - - void add(char c) { - buffer += c; - } -}; - -string property_table_json(PropertyTable table) { - return CodeGenerator().generate(table); -} - -} // namespace generate_code -} // namespace tree_sitter diff --git a/src/compiler/generate_code/property_table_json.h b/src/compiler/generate_code/property_table_json.h deleted file mode 100644 index 7b6af342..00000000 --- a/src/compiler/generate_code/property_table_json.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef COMPILER_GENERATE_CODE_PROPERTY_TABLE_JSON_H_ -#define COMPILER_GENERATE_CODE_PROPERTY_TABLE_JSON_H_ - -#include -#include "compiler/property_table.h" - -namespace tree_sitter { -namespace generate_code { - -std::string property_table_json(PropertyTable); - -} // namespace generate_code -} // namespace tree_sitter - -#endif // COMPILER_GENERATE_CODE_PROPERTY_TABLE_JSON_H_ diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h deleted file mode 100644 index cc073c0b..00000000 --- a/src/compiler/grammar.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef COMPILER_GRAMMAR_H_ -#define COMPILER_GRAMMAR_H_ - -#include -#include -#include -#include -#include "compiler/rule.h" - -namespace tree_sitter { - -enum VariableType { - VariableTypeHidden, - VariableTypeAuxiliary, - VariableTypeAnonymous, - VariableTypeNamed, -}; - -struct Variable { - std::string name; - VariableType type; - rules::Rule rule; - - inline bool operator==(const Variable &other) const { - return name == other.name && rule == other.rule && type == other.type; - } -}; - -struct InputGrammar { - std::vector variables; - std::vector extra_tokens; - std::vector> expected_conflicts; - std::vector external_tokens; - std::unordered_set variables_to_inline; - rules::NamedSymbol word_token; -}; - -} // namespace tree_sitter - -#endif // COMPILER_GRAMMAR_H_ diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc deleted file mode 100644 index e13d6fcb..00000000 --- a/src/compiler/lex_table.cc +++ /dev/null @@ -1,51 +0,0 @@ -#include "compiler/lex_table.h" -#include "compiler/rule.h" - -namespace tree_sitter { - -using std::function; -using std::string; -using std::to_string; -using std::map; -using std::set; -using rules::Symbol; -using rules::CharacterSet; - -AdvanceAction::AdvanceAction() : state_index(-1) {} - -AdvanceAction::AdvanceAction(size_t state_index, - PrecedenceRange precedence_range, - bool in_main_token) - : state_index(state_index), - precedence_range(precedence_range), - in_main_token(in_main_token) {} - -bool AdvanceAction::operator==(const AdvanceAction &other) const { - return (state_index == other.state_index) && - (precedence_range == other.precedence_range); -} - -AcceptTokenAction::AcceptTokenAction() - : symbol(rules::NONE()), precedence(0), implicit_precedence(0) {} - 
-AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence) - : symbol(symbol), precedence(precedence), implicit_precedence(0) {} - -bool AcceptTokenAction::is_present() const { - return symbol != rules::NONE(); -} - -bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const { - return ( - symbol == other.symbol && - precedence == other.precedence && - implicit_precedence == other.implicit_precedence - ); -} - -bool LexState::operator==(const LexState &other) const { - return advance_actions == other.advance_actions && - accept_action == other.accept_action; -} - -} // namespace tree_sitter diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h deleted file mode 100644 index 9419e8e2..00000000 --- a/src/compiler/lex_table.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef COMPILER_LEX_TABLE_H_ -#define COMPILER_LEX_TABLE_H_ - -#include -#include -#include -#include -#include "compiler/precedence_range.h" -#include "compiler/rule.h" - -namespace tree_sitter { - -typedef int64_t LexStateId; - -struct AdvanceAction { - AdvanceAction(); - AdvanceAction(size_t, PrecedenceRange, bool); - bool operator==(const AdvanceAction &other) const; - inline bool operator!=(const AdvanceAction &other) const { return !operator==(other); } - - LexStateId state_index; - PrecedenceRange precedence_range; - bool in_main_token; -}; - -struct AcceptTokenAction { - AcceptTokenAction(); - AcceptTokenAction(rules::Symbol, int); - bool is_present() const; - bool operator==(const AcceptTokenAction &other) const; - inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); } - - rules::Symbol symbol; - int precedence; - int implicit_precedence; -}; - -struct LexState { - bool operator==(const LexState &) const; - - std::map advance_actions; - AcceptTokenAction accept_action; -}; - -struct LexTable { - std::vector states; -}; - -} // namespace tree_sitter - -#endif // COMPILER_LEX_TABLE_H_ diff --git a/src/compiler/lexical_grammar.h b/src/compiler/lexical_grammar.h deleted file mode 100644 index 78d3faa8..00000000 --- a/src/compiler/lexical_grammar.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef COMPILER_LEXICAL_GRAMMAR_H_ -#define COMPILER_LEXICAL_GRAMMAR_H_ - -#include -#include -#include -#include "compiler/rule.h" -#include "compiler/grammar.h" - -namespace tree_sitter { - -struct LexicalVariable { - std::string name; - VariableType type; - rules::Rule rule; - bool is_string; - - inline bool operator==(const LexicalVariable &other) const { - return other.name == name && other.type == type && other.rule == rule && - other.is_string == is_string; - } -}; - -struct LexicalGrammar { - std::vector variables; - std::vector separators; -}; - -} // namespace tree_sitter - -#endif // COMPILER_LEXICAL_GRAMMAR_H_ diff --git a/src/compiler/log.cc b/src/compiler/log.cc deleted file mode 100644 index 4b1e3dbf..00000000 --- a/src/compiler/log.cc +++ /dev/null @@ -1,33 +0,0 @@ -#include "compiler/log.h" -#include - -static const char *SPACES = " "; - -namespace tree_sitter { - -thread_local unsigned _indent_level = 0; -thread_local FILE *_log_file = nullptr; - -void set_log_file(FILE *file) { - _log_file = file; - _indent_level = 0; -} - -FILE *get_log_file() { - return _log_file; -} - -void _indent_logs() { - _indent_level++; -} - -void _outdent_logs() { - assert(_indent_level > 0); - _indent_level--; -} - -void _print_indent() { - fwrite(SPACES, 1, _indent_level * 4, _log_file); -} - -} diff --git a/src/compiler/log.h b/src/compiler/log.h deleted file mode 100644 index 
2f7ad3e2..00000000
--- a/src/compiler/log.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef COMPILER_LOG_H_
-#define COMPILER_LOG_H_
-
-#include <stdio.h>
-
-namespace tree_sitter {
-
-void set_log_file(FILE *);
-FILE *get_log_file();
-void _indent_logs();
-void _outdent_logs();
-void _print_indent();
-
-#define LOG_START(...) \
-  do { \
-    LOG(__VA_ARGS__); \
-    _indent_logs(); \
-  } while (0)
-
-#define LOG_END(...) \
-  do { \
-    _outdent_logs(); \
-  } while (0)
-
-#define LOG(...) \
-  LOG_IF(true, __VA_ARGS__)
-
-#define LOG_IF(condition, ...) \
-  do { \
-    FILE *f = get_log_file(); \
-    if (f && condition) { \
-      _print_indent(); \
-      fprintf(f, __VA_ARGS__); \
-      fputs("\n", f); \
-    } \
-  } while (0)
-
-} // namespace tree_sitter
-
-#endif // COMPILER_LOG_H_
diff --git a/src/compiler/parse_json.cc b/src/compiler/parse_json.cc
deleted file mode 100644
index 0b32ad6f..00000000
--- a/src/compiler/parse_json.cc
+++ /dev/null
@@ -1,469 +0,0 @@
-#include "compiler/parse_json.h"
-#include
-#include
-#include
-#include
-#include "json.h"
-#include "compiler/rule.h"
-#include "compiler/util/result.h"
-
-namespace tree_sitter {
-
-using std::move;
-using std::string;
-using std::vector;
-using std::unordered_set;
-using std::pair;
-using rules::Rule;
-using rules::Blank;
-using rules::Metadata;
-using rules::Pattern;
-using rules::String;
-using rules::NamedSymbol;
-using util::Result;
-
-Result<Rule> parse_rule_json(json_value *rule_json) {
-  string error_message;
-  json_value rule_type_json;
-  string type;
-
-  if (!rule_json) {
-    return "Rule cannot be null";
-  }
-
-  if (rule_json->type != json_object) {
-    return "Rule type must be an object";
-  }
-
-  rule_type_json = rule_json->operator[]("type");
-  if (rule_type_json.type != json_string) {
-    return "Rule type must be a string";
-  }
-
-  type = rule_type_json.u.string.ptr;
-
-  if (type == "BLANK") {
-    return Rule(Blank{});
-  }
-
-  if (type == "CHOICE") {
-    json_value members_json = rule_json->operator[]("members");
-    if (members_json.type != json_array) {
-      return "Choice members must be an array";
-    }
-
-    vector<Rule> members;
-    for (size_t i = 0, length = members_json.u.array.length; i < length; i++) {
-      json_value *member_json = members_json.u.array.values[i];
-      auto result = parse_rule_json(member_json);
-      if (!result.ok()) {
-        return "Invalid choice member: " + result.error;
-      }
-      members.push_back(result.value);
-    }
-    return Rule::choice(members);
-  }
-
-  if (type == "SEQ") {
-    json_value members_json = rule_json->operator[]("members");
-    if (members_json.type != json_array) {
-      return "Seq members must be an array";
-    }
-
-    vector<Rule> members;
-    for (size_t i = 0, length = members_json.u.array.length; i < length; i++) {
-      json_value *member_json = members_json.u.array.values[i];
-      auto result = parse_rule_json(member_json);
-      if (!result.ok()) {
-        return "Invalid seq member: " + result.error;
-      }
-      members.push_back(result.value);
-    }
-    return Rule::seq(members);
-  }
-
-  if (type == "REPEAT") {
-    json_value content_json = rule_json->operator[]("content");
-    auto result = parse_rule_json(&content_json);
-    if (!result.ok()) {
-      return "Invalid repeat content: " + result.error;
-    }
-    return Rule::choice({Rule::repeat(result.value), Blank{}});
-  }
-
-  if (type == "REPEAT1") {
-    json_value content_json = rule_json->operator[]("content");
-    auto result = parse_rule_json(&content_json);
-    if (!result.ok()) {
-      return "Invalid repeat content: " + result.error;
-    }
-    return Rule::repeat(result.value);
-  }
-
-  if (type == "TOKEN") {
-    json_value content_json =
rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid token content: " + result.error; - } - return Rule(Metadata::token(move(result.value))); - } - - if (type == "IMMEDIATE_TOKEN") { - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid token content: " + result.error; - } - return Rule(Metadata::immediate_token(move(result.value))); - } - - if (type == "PATTERN") { - json_value value_json = rule_json->operator[]("value"); - if (value_json.type == json_string) { - return Rule(Pattern{value_json.u.string.ptr}); - } else { - return "Pattern value must be a string"; - } - } - - if (type == "STRING") { - json_value value_json = rule_json->operator[]("value"); - if (value_json.type == json_string) { - return Rule(String{value_json.u.string.ptr}); - } else { - return "String rule value must be a string"; - } - } - - if (type == "SYMBOL") { - json_value value_json = rule_json->operator[]("name"); - if (value_json.type == json_string) { - return Rule(NamedSymbol{value_json.u.string.ptr}); - } else { - return "Symbol value must be a string"; - } - } - - if (type == "PREC") { - json_value precedence_json = rule_json->operator[]("value"); - if (precedence_json.type != json_integer) { - return "Precedence value must be an integer"; - } - - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid precedence content: " + result.error; - } - return Rule(Metadata::prec(precedence_json.u.integer, move(result.value))); - } - - if (type == "PREC_LEFT") { - json_value precedence_json = rule_json->operator[]("value"); - if (precedence_json.type != json_integer) { - return "Precedence value must be an integer"; - } - - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid precedence content: " + result.error; - } - return Rule(Metadata::prec_left(precedence_json.u.integer, move(result.value))); - } - - if (type == "PREC_RIGHT") { - json_value precedence_json = rule_json->operator[]("value"); - if (precedence_json.type != json_integer) { - return "Precedence value must be an integer"; - } - - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid precedence content: " + result.error; - } - return Rule(Metadata::prec_right(precedence_json.u.integer, move(result.value))); - } - - if (type == "PREC_DYNAMIC") { - json_value precedence_json = rule_json->operator[]("value"); - if (precedence_json.type != json_integer) { - return "Precedence value must be an integer"; - } - - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); - if (!result.ok()) { - return "Invalid precedence content: " + result.error; - } - return Rule(Metadata::prec_dynamic(precedence_json.u.integer, move(result.value))); - } - - if (type == "ALIAS") { - json_value value_json = rule_json->operator[]("value"); - if (value_json.type != json_string) { - return "Rename value must be a string"; - } - - json_value is_named_json = rule_json->operator[]("named"); - if (is_named_json.type != json_boolean) { - return "Rename named value must be a boolean"; - } - - json_value content_json = rule_json->operator[]("content"); - auto result = parse_rule_json(&content_json); 
- if (!result.ok()) { - return "Invalid rename content: " + result.error; - } - return Rule(Metadata::alias( - string(value_json.u.string.ptr), - is_named_json.u.boolean, - move(result.value) - )); - } - - return "Unknown rule type: " + type; -} - -ParseGrammarResult parse_grammar_json(const string &input) { - string error_message; - string name; - InputGrammar grammar; - json_value - name_json, rules_json, extras_json, conflicts_json, external_tokens_json, - inline_rules_json, word_rule_json; - - char parse_error[json_error_max]; - json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 }; - json_value *grammar_json = - json_parse_ex(&settings, input.c_str(), input.size(), parse_error); - if (!grammar_json) { - error_message = string("Invalid JSON at ") + parse_error; - goto error; - } - - if (grammar_json->type != json_object) { - error_message = "Body must be an object"; - goto error; - } - - name_json = grammar_json->operator[]("name"); - if (name_json.type != json_string) { - error_message = "Name must be a string"; - goto error; - } - - name = name_json.u.string.ptr; - - rules_json = grammar_json->operator[]("rules"); - if (rules_json.type != json_object) { - error_message = "Rules must be an object"; - goto error; - } - - for (size_t i = 0, length = rules_json.u.object.length; i < length; i++) { - json_object_entry entry_json = rules_json.u.object.values[i]; - auto result = parse_rule_json(entry_json.value); - if (!result.ok()) { - error_message = result.error; - goto error; - } - grammar.variables.push_back(Variable{ - string(entry_json.name), - VariableTypeNamed, - result.value - }); - } - - extras_json = grammar_json->operator[]("extras"); - if (extras_json.type != json_none) { - if (extras_json.type != json_array) { - error_message = "Extras must be an array"; - goto error; - } - - for (size_t i = 0, length = extras_json.u.array.length; i < length; i++) { - json_value *extra_json = extras_json.u.array.values[i]; - auto result = parse_rule_json(extra_json); - if (!result.ok()) { - error_message = "Invalid extra token: " + result.error; - goto error; - } - grammar.extra_tokens.push_back(result.value); - } - } - - conflicts_json = grammar_json->operator[]("conflicts"); - if (conflicts_json.type != json_none) { - if (conflicts_json.type != json_array) { - error_message = "Conflicts must be an array"; - goto error; - } - - for (size_t i = 0, length = conflicts_json.u.array.length; i < length; i++) { - json_value *conflict_json = conflicts_json.u.array.values[i]; - if (conflict_json->type != json_array) { - error_message = "Each conflict entry must be an array"; - goto error; - } - - unordered_set conflict; - for (size_t j = 0, conflict_length = conflict_json->u.array.length; - j < conflict_length; j++) { - json_value *conflict_entry_json = conflict_json->u.array.values[j]; - if (conflict_entry_json->type != json_string) { - error_message = "Each conflict entry must be an array of strings"; - goto error; - } - - conflict.insert(rules::NamedSymbol{ - string(conflict_entry_json->u.string.ptr) - }); - } - - grammar.expected_conflicts.push_back(conflict); - } - } - - inline_rules_json = grammar_json->operator[]("inline"); - if (inline_rules_json.type != json_none) { - if (inline_rules_json.type != json_array) { - error_message = "Inline rules must be an array"; - goto error; - } - - for (size_t i = 0, length = inline_rules_json.u.array.length; i < length; i++) { - json_value *inline_rule_json = inline_rules_json.u.array.values[i]; - if (inline_rule_json->type != json_string) { - 
error_message = "Inline rules must be an array of rule names"; - goto error; - } - - grammar.variables_to_inline.insert(rules::NamedSymbol{string(inline_rule_json->u.string.ptr)}); - } - } - - external_tokens_json = grammar_json->operator[]("externals"); - if (external_tokens_json.type != json_none) { - if (external_tokens_json.type != json_array) { - error_message = "External tokens must be an array"; - goto error; - } - - for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) { - json_value *external_token_json = external_tokens_json.u.array.values[i]; - auto result = parse_rule_json(external_token_json); - if (!result.ok()) { - error_message = "Invalid external token: " + result.error; - goto error; - } - grammar.external_tokens.push_back(result.value); - } - } - - word_rule_json = grammar_json->operator[]("word"); - if (word_rule_json.type != json_none) { - if (word_rule_json.type != json_string) { - error_message = "Invalid word property"; - goto error; - } - - grammar.word_token = NamedSymbol { word_rule_json.u.string.ptr }; - } - - json_value_free(grammar_json); - return { name, grammar, "" }; - -error: - if (grammar_json) json_value_free(grammar_json); - return { "", InputGrammar(), error_message }; -} - -Result parse_property_rule_json(json_value *rule_json) { - PropertyRule result; - - if (rule_json->type != json_object) return "Rule must be an object"; - - json_value selectors_json = rule_json->operator[]("selectors"); - if (selectors_json.type != json_array) return "Selectors must be an array"; - - for (unsigned i = 0; i < selectors_json.u.array.length; i++) { - PropertySelector selector; - json_value *selector_json = selectors_json.u.array.values[i]; - if (selector_json->type != json_array) return "Each selector must be an array"; - - for (unsigned j = 0; j < selector_json->u.array.length; j++) { - json_value *selector_step_json = selector_json->u.array.values[j]; - if (selector_step_json->type != json_object) return "Each selector must be an array of objects"; - PropertySelectorStep step; - step.type = selector_step_json->operator[]("type").u.string.ptr; - step.named = selector_step_json->operator[]("named").u.boolean; - step.is_immediate = selector_step_json->operator[]("immediate").u.boolean; - - json_value index_json = selector_step_json->operator[]("index"); - if (index_json.type == json_integer) { - step.index = index_json.u.integer; - } else { - step.index = -1; - } - - json_value text_pattern_json = selector_step_json->operator[]("text"); - if (text_pattern_json.type == json_string) { - step.text_pattern = text_pattern_json.u.string.ptr; - } - - selector.push_back(step); - } - - result.selectors.push_back(selector); - } - - json_value properties_json = rule_json->operator[]("properties"); - if (properties_json.type != json_object) return "Properties must be an object"; - - for (unsigned i = 0; i < properties_json.u.object.length; i++) { - json_object_entry entry_json = properties_json.u.object.values[i]; - json_value *value_json = entry_json.value; - if (value_json->type != json_string) return "Property values must be strings"; - result.properties[entry_json.name] = value_json->u.string.ptr; - } - - return result; -} - -Result parse_property_sheet_json(const string &input) { - PropertySheet sheet; - string error_message; - char parse_error[json_error_max]; - json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 }; - json_value *sheet_json = json_parse_ex(&settings, input.c_str(), input.size(), parse_error); - if (!sheet_json) { - 
error_message = string("Invalid JSON at ") + parse_error; - goto error; - } - - if (sheet_json->type != json_array) { - error_message = "Property sheet must be an array"; - goto error; - } - - for (unsigned i = 0; i < sheet_json->u.array.length; i++) { - json_value *rule_json = sheet_json->u.array.values[i]; - auto result = parse_property_rule_json(rule_json); - if (!result.ok()) { - error_message = "Invalid external token: " + result.error; - goto error; - } - sheet.push_back(result.value); - } - - return sheet; - -error: - if (sheet_json) json_value_free(sheet_json); - return error_message.c_str(); -} - -} // namespace tree_sitter diff --git a/src/compiler/parse_json.h b/src/compiler/parse_json.h deleted file mode 100644 index fda7378c..00000000 --- a/src/compiler/parse_json.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef COMPILER_PARSE_JSON_H_ -#define COMPILER_PARSE_JSON_H_ - -#include -#include -#include "tree_sitter/compiler.h" -#include "compiler/grammar.h" -#include "compiler/property_sheet.h" -#include "compiler/util/result.h" - -namespace tree_sitter { - -struct ParseGrammarResult { - std::string name; - InputGrammar grammar; - std::string error_message; -}; - -struct ParsePropertySheetResult { - PropertySheet property_sheet; - std::string error_message; -}; - -ParseGrammarResult parse_grammar_json(const std::string &); -util::Result parse_property_sheet_json(const std::string &); - -} // namespace tree_sitter - -#endif // COMPILER_PARSE_JSON_H_ diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc deleted file mode 100644 index 252185f4..00000000 --- a/src/compiler/parse_table.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "compiler/parse_table.h" -#include -#include "compiler/precedence_range.h" -#include "compiler/rule.h" - -namespace tree_sitter { - -using std::string; -using std::to_string; -using std::vector; -using std::function; -using rules::Symbol; - -ParseAction::ParseAction() : - type(ParseActionTypeError), - state_index(-1), - symbol(rules::NONE()), - consumed_symbol_count(0), - precedence(0), - dynamic_precedence(0), - associativity(rules::AssociativityNone), - alias_sequence_id(0), - extra(false), - repetition(false) {} - -ParseAction ParseAction::Error() { - return ParseAction(); -} - -ParseAction ParseAction::Accept() { - ParseAction action; - action.type = ParseActionTypeAccept; - return action; -} - -ParseAction ParseAction::Shift(ParseStateId state_index) { - ParseAction result; - result.type = ParseActionTypeShift; - result.state_index = state_index; - return result; -} - -ParseAction ParseAction::Recover() { - ParseAction result; - result.type = ParseActionTypeRecover; - return result; -} - -ParseAction ParseAction::ShiftExtra() { - ParseAction action; - action.type = ParseActionTypeShift; - action.extra = true; - return action; -} - -ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, - int precedence, int dynamic_precedence, - rules::Associativity associativity, unsigned alias_sequence_id) { - ParseAction result; - result.type = ParseActionTypeReduce; - result.symbol = symbol; - result.consumed_symbol_count = consumed_symbol_count; - result.precedence = precedence; - result.dynamic_precedence = dynamic_precedence; - result.associativity = associativity; - result.alias_sequence_id = alias_sequence_id; - return result; -} - -bool ParseAction::operator==(const ParseAction &other) const { - return - type == other.type && - state_index == other.state_index && - symbol == other.symbol && - consumed_symbol_count == 
other.consumed_symbol_count && - precedence == other.precedence && - dynamic_precedence == other.dynamic_precedence && - associativity == other.associativity && - alias_sequence_id == other.alias_sequence_id && - extra == other.extra && - repetition == other.repetition; -} - -bool ParseAction::operator<(const ParseAction &other) const { - if (type < other.type) return true; - if (other.type < type) return false; - if (state_index < other.state_index) return true; - if (other.state_index < state_index) return false; - if (symbol < other.symbol) return true; - if (other.symbol < symbol) return false; - if (consumed_symbol_count < other.consumed_symbol_count) return true; - if (other.consumed_symbol_count < consumed_symbol_count) return false; - if (precedence < other.precedence) return true; - if (other.precedence < precedence) return false; - if (dynamic_precedence < other.dynamic_precedence) return true; - if (other.dynamic_precedence < dynamic_precedence) return false; - if (associativity < other.associativity) return true; - if (other.associativity < associativity) return false; - if (extra && !other.extra) return true; - if (other.extra && !extra) return false; - if (repetition && !other.repetition) return true; - if (other.repetition && !repetition) return false; - return alias_sequence_id < other.alias_sequence_id; -} - -ParseTableEntry::ParseTableEntry() : reusable(true) {} - -ParseTableEntry::ParseTableEntry(const vector &actions, bool reusable) - : actions(actions), - reusable(reusable) {} - -bool ParseTableEntry::operator==(const ParseTableEntry &other) const { - return actions == other.actions && reusable == other.reusable; -} - -ParseState::ParseState() : lex_state_id(-1) {} - -bool ParseState::has_shift_action() const { - for (const auto &pair : terminal_entries) - if (pair.second.actions.size() > 0 && - pair.second.actions.back().type == ParseActionTypeShift) - return true; - return (!nonterminal_entries.empty()); -} - -bool ParseState::has_terminal_entry(rules::Symbol symbol) const { - return terminal_entries.find(symbol) != terminal_entries.end(); -} - -void ParseState::each_referenced_state(function fn) { - for (auto &entry : terminal_entries) - for (ParseAction &action : entry.second.actions) - if (action.type == ParseActionTypeShift && !action.extra) - fn(&action.state_index); - for (auto &entry : nonterminal_entries) - fn(&entry.second); -} - -bool ParseState::operator==(const ParseState &other) const { - return terminal_entries == other.terminal_entries && - nonterminal_entries == other.nonterminal_entries; -} - -ParseAction &ParseTable::add_terminal_action(ParseStateId state_id, - Symbol lookahead, - ParseAction action) { - ParseTableEntry &entry = states[state_id].terminal_entries[lookahead]; - entry.actions.push_back(action); - return *entry.actions.rbegin(); -} - -void ParseTable::set_nonterminal_action(ParseStateId state_id, - Symbol::Index lookahead, - ParseStateId next_state_id) { - symbols.insert(Symbol::non_terminal(lookahead)); - states[state_id].nonterminal_entries[lookahead] = next_state_id; -} - -} // namespace tree_sitter diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h deleted file mode 100644 index bf85c4b7..00000000 --- a/src/compiler/parse_table.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef COMPILER_PARSE_TABLE_H_ -#define COMPILER_PARSE_TABLE_H_ - -#include -#include -#include -#include -#include "compiler/lex_table.h" -#include "compiler/rule.h" -#include "compiler/precedence_range.h" -#include "compiler/syntax_grammar.h" - -namespace 
tree_sitter { - -typedef size_t ParseStateId; - -enum ParseActionType { - ParseActionTypeError, - ParseActionTypeShift, - ParseActionTypeReduce, - ParseActionTypeAccept, - ParseActionTypeRecover, -}; - -struct ParseAction { - ParseAction(); - static ParseAction Accept(); - static ParseAction Error(); - static ParseAction Shift(ParseStateId state_index); - static ParseAction Recover(); - static ParseAction Reduce(rules::Symbol symbol, size_t child_count, - int precedence, int dynamic_precedence, rules::Associativity, - unsigned alias_sequence_id); - static ParseAction ShiftExtra(); - bool operator==(const ParseAction &) const; - bool operator<(const ParseAction &) const; - - ParseActionType type; - ParseStateId state_index; - rules::Symbol symbol; - unsigned consumed_symbol_count; - int precedence; - int dynamic_precedence; - rules::Associativity associativity; - unsigned alias_sequence_id; - bool extra; - bool repetition; -}; - -struct ParseTableEntry { - ParseTableEntry(); - ParseTableEntry(const std::vector &, bool); - bool operator==(const ParseTableEntry &other) const; - inline bool operator!=(const ParseTableEntry &other) const { - return !operator==(other); - } - - std::vector actions; - bool reusable; -}; - -struct ParseState { - ParseState(); - bool operator==(const ParseState &) const; - bool merge(const ParseState &); - void each_referenced_state(std::function); - bool has_shift_action() const; - bool has_terminal_entry(rules::Symbol) const; - - std::map terminal_entries; - std::map nonterminal_entries; - LexStateId lex_state_id; -}; - -using AliasSequence = std::vector; - -struct ParseTable { - ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction); - void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId); - - std::vector states; - std::set symbols; - std::vector alias_sequences; - unsigned max_alias_sequence_length = 0; -}; - -} // namespace tree_sitter - -#endif // COMPILER_PARSE_TABLE_H_ diff --git a/src/compiler/precedence_range.cc b/src/compiler/precedence_range.cc deleted file mode 100644 index 2f6b7ecf..00000000 --- a/src/compiler/precedence_range.cc +++ /dev/null @@ -1,45 +0,0 @@ -#include "compiler/precedence_range.h" - -namespace tree_sitter { - -PrecedenceRange::PrecedenceRange() : min(0), max(0), empty(true) {} - -PrecedenceRange::PrecedenceRange(int min, int max) - : min(min), max(max), empty(false) {} - -PrecedenceRange::PrecedenceRange(int value) - : min(value), max(value), empty(false) {} - -void PrecedenceRange::add(int new_value) { - if (empty) { - min = new_value; - max = new_value; - empty = false; - } else { - if (new_value < min) - min = new_value; - else if (new_value > max) - max = new_value; - } -} - -void PrecedenceRange::add(const PrecedenceRange &other) { - if (!other.empty) { - add(other.min); - add(other.max); - } -} - -bool PrecedenceRange::operator<(const PrecedenceRange &other) const { - if (empty) - return !other.empty; - else - return (min < other.min && max <= other.min) || - (min == other.min && max < other.max); -} - -bool PrecedenceRange::operator==(const PrecedenceRange &other) const { - return (empty == other.empty) && (min == other.min) && (max == other.max); -} - -} // namespace tree_sitter diff --git a/src/compiler/precedence_range.h b/src/compiler/precedence_range.h deleted file mode 100644 index f2f52de4..00000000 --- a/src/compiler/precedence_range.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef COMPILER_PRECEDENCE_RANGE_H_ -#define COMPILER_PRECEDENCE_RANGE_H_ - -namespace tree_sitter { 
- -struct PrecedenceRange { - PrecedenceRange(); - explicit PrecedenceRange(int value); - PrecedenceRange(int min, int max); - - void add(int value); - void add(const PrecedenceRange &); - bool operator==(const PrecedenceRange &other) const; - bool operator<(const PrecedenceRange &other) const; - - int min; - int max; - bool empty; -}; - -} // namespace tree_sitter - -#endif // COMPILER_PRECEDENCE_RANGE_H_ diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc deleted file mode 100644 index 101beb41..00000000 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ /dev/null @@ -1,114 +0,0 @@ -#include "compiler/prepare_grammar/expand_repeats.h" -#include -#include -#include -#include -#include "compiler/grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::string; -using std::vector; -using std::pair; -using std::to_string; -using rules::Rule; -using rules::Symbol; - -class ExpandRepeats { - string rule_name; - size_t offset; - size_t repeat_count; - vector> existing_repeats; - - Rule apply(Rule rule) { - return rule.match( - [&](const rules::Blank &blank) -> Rule { return blank; }, - [&](const rules::Symbol &symbol) { return symbol; }, - - [&](const rules::Choice &choice) { - vector elements; - for (const auto &element : choice.elements) { - elements.push_back(apply(element)); - } - return Rule::choice(elements); - }, - - [&](const rules::Seq &sequence) { - auto left = apply(*sequence.left); - auto right = apply(*sequence.right); - return rules::Seq{left, right}; - }, - - [&](const rules::Repeat &repeat) { - for (const auto pair : existing_repeats) { - if (pair.first == rule) { - return pair.second; - } - } - - Rule inner_rule = apply(*repeat.rule); - size_t index = aux_rules.size(); - string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count); - Symbol repeat_symbol = Symbol::non_terminal(offset + index); - existing_repeats.push_back({repeat, repeat_symbol}); - aux_rules.push_back({ - helper_rule_name, - VariableTypeAuxiliary, - rules::Choice{{ - rules::Seq{repeat_symbol, repeat_symbol}, - inner_rule, - }} - }); - return repeat_symbol; - }, - - [&](const rules::Metadata &metadata) { - return rules::Metadata{apply(*metadata.rule), metadata.params}; - }, - - [](auto) { - assert(!"Unexpected rule type"); - return rules::Blank{}; - } - ); - } - - public: - explicit ExpandRepeats(size_t offset) : offset(offset) {} - - Rule expand(const Rule &rule, const string &name) { - rule_name = name; - repeat_count = 0; - return apply(rule); - } - - vector aux_rules; -}; - -InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { - InitialSyntaxGrammar result; - result.variables = grammar.variables; - result.extra_tokens = grammar.extra_tokens; - result.expected_conflicts = grammar.expected_conflicts; - result.external_tokens = grammar.external_tokens; - result.variables_to_inline = grammar.variables_to_inline; - - ExpandRepeats expander(result.variables.size()); - for (auto &variable : result.variables) { - variable.rule = expander.expand(variable.rule, variable.name); - } - - result.variables.insert( - result.variables.end(), - expander.aux_rules.begin(), - expander.aux_rules.end() - ); - - result.word_token = grammar.word_token; - return result; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.h b/src/compiler/prepare_grammar/expand_repeats.h deleted file mode 100644 index 
57e1474f..00000000 --- a/src/compiler/prepare_grammar/expand_repeats.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_REPEATS_H_ -#define COMPILER_PREPARE_GRAMMAR_EXPAND_REPEATS_H_ - -#include "compiler/prepare_grammar/initial_syntax_grammar.h" - -namespace tree_sitter { -namespace prepare_grammar { - -InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_EXPAND_REPEATS_H_ diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc deleted file mode 100644 index 82fb9302..00000000 --- a/src/compiler/prepare_grammar/expand_tokens.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "compiler/prepare_grammar/expand_tokens.h" -#include -#include -#include -#include "compiler/lexical_grammar.h" -#include "compiler/rule.h" -#include "compiler/prepare_grammar/parse_regex.h" -#include "utf8proc.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::string; -using std::vector; -using rules::Rule; - -ExpandTokenResult expand_token(const rules::Rule &rule) { - return rule.match( - [](const rules::Blank &blank) -> ExpandTokenResult { return Rule(blank); }, - - [](const rules::String &string) { - vector elements; - const uint8_t *iter = reinterpret_cast(string.value.data()); - const uint8_t *end = iter + string.value.size(); - - while (iter < end) { - int32_t el; - size_t size = utf8proc_iterate(iter, (end - iter), &el); - if (!size) - break; - iter += size; - - elements.push_back(rules::CharacterSet().include(el)); - } - - return Rule::seq(elements); - }, - - [](const rules::Pattern &pattern) -> ExpandTokenResult { - auto result = parse_regex(pattern.value); - if (result.second) return result.second; - return result.first; - }, - - [](const rules::Repeat &rule) -> ExpandTokenResult { - auto result = expand_token(*rule.rule); - if (result.error) return result.error; - return Rule::repeat(result.rule); - }, - - [](const rules::Metadata &rule) -> ExpandTokenResult { - auto result = expand_token(*rule.rule); - if (result.error) return result.error; - return Rule(rules::Metadata{result.rule, rule.params}); - }, - - [](const rules::Seq &rule) -> ExpandTokenResult { - auto left_result = expand_token(*rule.left); - if (left_result.error) return left_result.error; - auto right_result = expand_token(*rule.right); - if (right_result.error) return right_result.error; - return Rule(rules::Seq{left_result.rule, right_result.rule}); - }, - - [](const rules::Choice &rule) -> ExpandTokenResult { - std::vector elements; - for (const auto &element : rule.elements) { - auto result = expand_token(element); - if (result.error) return result.error; - elements.push_back(result.rule); - } - return Rule(rules::Choice{elements}); - }, - - [](auto) { return CompileError(TSCompileErrorTypeInvalidTokenContents, "Symbols inside tokens are not allowed."); } - ); -}; - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_tokens.h b/src/compiler/prepare_grammar/expand_tokens.h deleted file mode 100644 index d1545cca..00000000 --- a/src/compiler/prepare_grammar/expand_tokens.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ -#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ - -#include -#include "compiler/rule.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { -namespace prepare_grammar { - -struct ExpandTokenResult { - rules::Rule rule; - 
CompileError error; - - ExpandTokenResult(const rules::Rule &rule) : rule(rule) {} - ExpandTokenResult(const CompileError &error) : error(error) {} -}; - -ExpandTokenResult expand_token(const rules::Rule &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_ diff --git a/src/compiler/prepare_grammar/extract_choices.cc b/src/compiler/prepare_grammar/extract_choices.cc deleted file mode 100644 index 3b471538..00000000 --- a/src/compiler/prepare_grammar/extract_choices.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "compiler/prepare_grammar/extract_choices.h" -#include -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::vector; -using rules::Rule; - -vector extract_choices(const Rule &rule) { - return rule.match( - [](const rules::Seq &sequence) { - vector result; - for (auto &left_entry : extract_choices(*sequence.left)) { - for (auto &right_entry : extract_choices(*sequence.right)) { - result.push_back(rules::Rule::seq({left_entry, right_entry})); - } - } - return result; - }, - - [](const rules::Metadata &rule) { - vector result; - for (auto &entry : extract_choices(*rule.rule)) { - result.push_back(rules::Metadata{entry, rule.params}); - } - return result; - }, - - [](const rules::Choice &choice) { - vector result; - for (auto &element : choice.elements) { - for (auto &entry : extract_choices(element)) { - result.push_back(entry); - } - } - return result; - }, - - [](const auto &rule) { - return vector({rule}); - } - ); -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_choices.h b/src/compiler/prepare_grammar/extract_choices.h deleted file mode 100644 index 3b0d12db..00000000 --- a/src/compiler/prepare_grammar/extract_choices.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ -#define COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ - -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -std::vector extract_choices(const rules::Rule &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ diff --git a/src/compiler/prepare_grammar/extract_simple_aliases.cc b/src/compiler/prepare_grammar/extract_simple_aliases.cc deleted file mode 100644 index 208fe6f4..00000000 --- a/src/compiler/prepare_grammar/extract_simple_aliases.cc +++ /dev/null @@ -1,111 +0,0 @@ -#include "compiler/prepare_grammar/extract_simple_aliases.h" -#include "compiler/lexical_grammar.h" -#include "compiler/syntax_grammar.h" -#include -#include - -namespace tree_sitter { -namespace prepare_grammar { - -using std::pair; -using std::vector; -using std::unordered_map; -using rules::Alias; -using rules::Symbol; - -template -static void apply_alias(T *variable, Alias alias) { - if (!alias.value.empty()) { - variable->name = alias.value; - variable->type = alias.is_named ? 
VariableTypeNamed : VariableTypeAnonymous; - } -} - -std::unordered_map -extract_simple_aliases(SyntaxGrammar *syntax_grammar, LexicalGrammar *lexical_grammar) { - struct SymbolStatus { - Alias alias; - bool eligible = true; - }; - - vector terminal_status_list(lexical_grammar->variables.size()); - vector non_terminal_status_list(syntax_grammar->variables.size()); - vector external_status_list(syntax_grammar->external_tokens.size()); - - for (const SyntaxVariable &variable : syntax_grammar->variables) { - for (const Production &production : variable.productions) { - for (const ProductionStep &step : production.steps) { - SymbolStatus *status; - if (step.symbol.is_built_in()) { - continue; - } else if (step.symbol.is_external()) { - status = &external_status_list[step.symbol.index]; - } else if (step.symbol.is_terminal()) { - status = &terminal_status_list[step.symbol.index]; - } else { - status = &non_terminal_status_list[step.symbol.index]; - } - - if (step.alias.value.empty()) { - status->alias = Alias(); - status->eligible = false; - } - - if (status->eligible) { - if (status->alias.value.empty()) { - status->alias = step.alias; - } else if (status->alias != step.alias) { - status->alias = Alias(); - status->eligible = false; - } - } - } - } - } - - for (SyntaxVariable &variable : syntax_grammar->variables) { - for (Production &production : variable.productions) { - for (ProductionStep &step : production.steps) { - SymbolStatus *status; - if (step.symbol.is_built_in()) { - continue; - } else if (step.symbol.is_external()) { - status = &external_status_list[step.symbol.index]; - } else if (step.symbol.is_terminal()) { - status = &terminal_status_list[step.symbol.index]; - } else { - status = &non_terminal_status_list[step.symbol.index]; - } - - if (!status->alias.value.empty()) { - step.alias = Alias(); - } - } - } - } - - unordered_map result; - - for (unsigned i = 0, n = terminal_status_list.size(); i < n; i++) { - if (!terminal_status_list[i].alias.value.empty()) { - result[Symbol::terminal(i)] = terminal_status_list[i].alias; - } - } - - for (unsigned i = 0, n = non_terminal_status_list.size(); i < n; i++) { - if (!non_terminal_status_list[i].alias.value.empty()) { - result[Symbol::non_terminal(i)] = non_terminal_status_list[i].alias; - } - } - - for (unsigned i = 0, n = external_status_list.size(); i < n; i++) { - if (!external_status_list[i].alias.value.empty()) { - result[Symbol::external(i)] = external_status_list[i].alias; - } - } - - return result; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_simple_aliases.h b/src/compiler/prepare_grammar/extract_simple_aliases.h deleted file mode 100644 index 9970ad1a..00000000 --- a/src/compiler/prepare_grammar/extract_simple_aliases.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ -#define COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ - -#include "compiler/rules/symbol.h" -#include "compiler/rules/metadata.h" -#include - -namespace tree_sitter { - -struct SyntaxGrammar; -struct LexicalGrammar; - -namespace prepare_grammar { - -std::unordered_map -extract_simple_aliases(SyntaxGrammar *, LexicalGrammar *); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc deleted file mode 100644 index bf01e722..00000000 --- 
a/src/compiler/prepare_grammar/extract_tokens.cc
+++ /dev/null
@@ -1,350 +0,0 @@
-#include "compiler/prepare_grammar/extract_tokens.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include "tree_sitter/compiler.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/rule.h"
-#include "compiler/prepare_grammar/token_description.h"
-#include "compiler/prepare_grammar/expand_tokens.h"
-
-namespace tree_sitter {
-namespace prepare_grammar {
-
-using std::make_tuple;
-using std::map;
-using std::set;
-using std::string;
-using std::tuple;
-using std::vector;
-using rules::Symbol;
-using rules::Rule;
-
-class SymbolReplacer {
- public:
-  map<Symbol, Symbol> replacements;
-
-  Rule apply(const Rule &rule) {
-    return rule.match(
-      [](const rules::Blank &blank) -> Rule {
-        return blank;
-      },
-
-      [this](const rules::Symbol &symbol) {
-        return replace_symbol(symbol);
-      },
-
-      [this](const rules::Choice &choice) {
-        vector<Rule> elements;
-        for (const auto &element : choice.elements) {
-          elements.push_back(apply(element));
-        }
-        return Rule::choice(elements);
-      },
-
-      [this](const rules::Seq &sequence) {
-        return rules::Seq{
-          apply(*sequence.left),
-          apply(*sequence.right)
-        };
-      },
-
-      [this](const rules::Repeat &repeat) {
-        return Rule::repeat(apply(*repeat.rule));
-      },
-
-      [this](const rules::Metadata &metadata) {
-        return rules::Metadata{apply(*metadata.rule), metadata.params};
-      },
-
-      [](auto) {
-        assert(!"Unexpected rule type");
-        return rules::Blank{};
-      }
-    );
-  }
-
-  Symbol replace_symbol(const Symbol &symbol) {
-    if (!symbol.is_non_terminal()) return symbol;
-
-    auto replacement_pair = replacements.find(symbol);
-    if (replacement_pair != replacements.end()) {
-      return replacement_pair->second;
-    }
-
-    int new_index = symbol.index;
-    for (const auto &pair : replacements) {
-      if (pair.first.index < symbol.index) {
-        new_index--;
-      }
-    }
-
-    return Symbol::non_terminal(new_index);
-  }
-};
-
-class TokenExtractor {
-  Symbol extract_token(const rules::Rule &input, VariableType entry_type) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-      if (tokens[i].rule == input) {
-        token_usage_counts[i]++;
-        return Symbol::terminal(i);
-      }
-    }
-
-    size_t index = tokens.size();
-    tokens.push_back({
-      token_description(input),
-      entry_type,
-      input
-    });
-    token_usage_counts.push_back(1);
-
-    return Symbol::terminal(index);
-  }
-
- public:
-  Rule apply(const rules::Rule &rule) {
-    return rule.match(
-      [](const rules::Blank &blank) -> Rule { return blank; },
-
-      [this](const rules::Metadata &rule) -> Rule {
-        if (rule.params.is_token) {
-          rules::Metadata metadata{*rule.rule, rule.params};
-          metadata.params.is_token = false;
-          if (metadata.params == rules::MetadataParams{}) {
-            return extract_token(*metadata.rule, VariableTypeAuxiliary);
-          } else if (metadata.rule->is<rules::String>()) {
-            return extract_token(metadata, VariableTypeAnonymous);
-          } else {
-            return extract_token(metadata, VariableTypeAuxiliary);
-          }
-        } else {
-          return rules::Metadata{apply(*rule.rule), rule.params};
-        }
-      },
-
-      [this](const rules::String &rule) {
-        return extract_token(rule, VariableTypeAnonymous);
-      },
-
-      [this](const rules::Pattern &rule) {
-        return extract_token(rule, VariableTypeAuxiliary);
-      },
-
-      [this](const rules::Repeat &rule) {
-        return Rule::repeat(apply(*rule.rule));
-      },
-
-      [this](const rules::Seq &rule) {
-        return Rule::seq({apply(*rule.left), apply(*rule.right)});
-      },
-
-      [this](const rules::Choice &rule) {
-        std::vector<Rule> elements;
-        for (const auto &element : rule.elements) {
-
elements.push_back(apply(element)); - } - return Rule::choice(elements); - }, - - [](const rules::Symbol &symbol) { - return symbol; - }, - - [](auto) { - assert(!"Unexpected rule type"); - return rules::Blank{}; - } - ); - } - - vector token_usage_counts; - vector tokens; -}; - -tuple extract_tokens( - const InternedGrammar &grammar -) { - InitialSyntaxGrammar syntax_grammar; - LexicalGrammar lexical_grammar; - SymbolReplacer symbol_replacer; - TokenExtractor extractor; - - // Extract all of the grammar's tokens into the lexical grammar. - vector processed_variables; - for (const auto &variable : grammar.variables) { - processed_variables.push_back({ - variable.name, - variable.type, - extractor.apply(variable.rule) - }); - } - - vector processed_external_tokens; - for (const auto &external_token : grammar.external_tokens) { - processed_external_tokens.push_back(Variable{ - external_token.name, - external_token.type, - extractor.apply(external_token.rule), - }); - } - - for (const auto &extracted_token : extractor.tokens) { - auto expansion = expand_token(extracted_token.rule); - if (expansion.error) return make_tuple( - syntax_grammar, - lexical_grammar, - expansion.error - ); - lexical_grammar.variables.push_back({ - extracted_token.name, - extracted_token.type, - expansion.rule, - extracted_token.type == VariableTypeAnonymous - }); - } - - // If a variable's entire rule was extracted as a token and that token didn't - // appear within any other rule, then remove that variable from the syntax - // grammar, giving its name to the token in the lexical grammar. Any symbols - // that pointed to that variable will need to be updated to point to the - // variable in the lexical grammar. Symbols that pointed to later variables - // will need to have their indices decremented. - size_t i = -1; - for (const auto &variable : processed_variables) { - i++; - if (i > 0 && variable.rule.is()) { - auto symbol = variable.rule.get_unchecked(); - if (symbol.is_terminal() && extractor.token_usage_counts[symbol.index] == 1) { - lexical_grammar.variables[symbol.index].type = variable.type; - lexical_grammar.variables[symbol.index].name = variable.name; - symbol_replacer.replacements[Symbol::non_terminal(i)] = symbol; - continue; - } - } - syntax_grammar.variables.push_back(variable); - } - - // Perform any replacements of symbols needed based on the previous step. - for (auto &variable : syntax_grammar.variables) { - variable.rule = symbol_replacer.apply(variable.rule); - } - - for (const auto &conflict_set : grammar.expected_conflicts) { - set new_conflict_set; - for (const Symbol &symbol : conflict_set) { - new_conflict_set.insert(symbol_replacer.replace_symbol(symbol)); - } - syntax_grammar.expected_conflicts.insert(new_conflict_set); - } - - for (const Symbol &symbol : grammar.variables_to_inline) { - syntax_grammar.variables_to_inline.insert(symbol_replacer.replace_symbol(symbol)); - } - - // The grammar's extra tokens can be either token rules or symbols - // pointing to token rules. If they are symbols, then they'll be handled by - // the parser; add them to the syntax grammar's extra tokens. If they - // are anonymous rules, they can be handled by the lexer; add them to the - // lexical grammar's separator rules. 
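A minimal standalone sketch of the partitioning that the comment above describes, using simplified stand-in types: `ExtraRule`, `is_symbol`, and the two output vectors are illustrative names only, not this codebase's real `Rule`/`Symbol` classes.

#include <iostream>
#include <string>
#include <vector>

// Stand-in for one entry of a grammar's "extras": either a reference to a
// named rule (a symbol) or an anonymous token rule such as a whitespace
// pattern. Only that distinction matters for the partitioning shown here.
struct ExtraRule {
  bool is_symbol;
  std::string source;  // rule name, or the token's pattern text
};

int main() {
  std::vector<ExtraRule> extras = {
    {true, "comment"},  // symbol: the parser treats it as an extra token
    {false, "\\s+"},    // anonymous rule: the lexer skips it as a separator
  };

  std::vector<std::string> parser_extra_tokens;  // ~ syntax_grammar.extra_tokens
  std::vector<std::string> lexer_separators;     // ~ lexical_grammar.separators

  for (const ExtraRule &extra : extras) {
    if (extra.is_symbol) {
      parser_extra_tokens.push_back(extra.source);
    } else {
      lexer_separators.push_back(extra.source);
    }
  }

  std::cout << "parser extras: " << parser_extra_tokens.size()
            << ", lexer separators: " << lexer_separators.size() << "\n";
  return 0;
}

The deleted loop that follows is stricter than this sketch: it rejects extras that still name non-terminals, and before adding a separator it expands the rule and reuses an existing lexical variable's terminal symbol when the expansion already matches one, so the same token rule is never registered twice.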
- for (const auto &rule : grammar.extra_tokens) { - CompileError error = rule.match( - [&](const Symbol &symbol) { - Symbol new_symbol = symbol_replacer.replace_symbol(symbol); - if (new_symbol.is_non_terminal()) { - return CompileError( - TSCompileErrorTypeInvalidExtraToken, - "Non-token symbol " + syntax_grammar.variables[new_symbol.index].name + " can't be used as an extra token" - ); - } else { - syntax_grammar.extra_tokens.insert(new_symbol); - return CompileError::none(); - } - }, - - [&](auto non_symbol) { - auto expansion = expand_token(non_symbol); - if (expansion.error) return CompileError( - TSCompileErrorTypeInvalidExtraToken, - "Non-token rule expression can't be used as an extra token" - ); - int i = 0; - for (const LexicalVariable &variable : lexical_grammar.variables) { - if (variable.rule == expansion.rule) { - syntax_grammar.extra_tokens.insert(Symbol::terminal(i)); - return CompileError::none(); - } - i++; - } - - lexical_grammar.separators.push_back(expansion.rule); - return CompileError::none(); - } - ); - - if (error) return make_tuple(syntax_grammar, lexical_grammar, error); - } - - for (const auto &external_token : processed_external_tokens) { - Rule new_rule = symbol_replacer.apply(external_token.rule); - - if (!new_rule.is()) { - return make_tuple( - syntax_grammar, - lexical_grammar, - CompileError( - TSCompileErrorTypeInvalidExternalToken, - "Non-symbol rule expressions can't be used as external tokens" - ) - ); - } - - Symbol symbol = new_rule.get_unchecked(); - if (symbol.is_non_terminal()) { - return make_tuple( - syntax_grammar, - lexical_grammar, - CompileError( - TSCompileErrorTypeInvalidExternalToken, - "Name '" + external_token.name + "' cannot be used for both an external token and a non-terminal rule" - ) - ); - } - - if (symbol.is_external()) { - syntax_grammar.external_tokens.push_back(ExternalToken{ - external_token.name, - external_token.type, - rules::NONE(), - }); - } else { - syntax_grammar.external_tokens.push_back(ExternalToken{ - lexical_grammar.variables[symbol.index].name, - external_token.type, - symbol, - }); - } - } - - syntax_grammar.word_token = symbol_replacer.replace_symbol(grammar.word_token); - if (syntax_grammar.word_token.is_non_terminal()) { - return make_tuple( - syntax_grammar, - lexical_grammar, - CompileError( - TSCompileErrorTypeInvalidWordRule, - "Word rules must be tokens" - ) - ); - } - - return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_tokens.h b/src/compiler/prepare_grammar/extract_tokens.h deleted file mode 100644 index 73da39fd..00000000 --- a/src/compiler/prepare_grammar/extract_tokens.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_ -#define COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_ - -#include -#include "compiler/compile_error.h" -#include "compiler/lexical_grammar.h" -#include "compiler/prepare_grammar/interned_grammar.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" - -namespace tree_sitter { -namespace prepare_grammar { - -std::tuple extract_tokens( - const InternedGrammar & -); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_ diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc deleted file mode 100644 index fad02a23..00000000 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ /dev/null @@ 
-1,170 +0,0 @@ -#include "compiler/prepare_grammar/flatten_grammar.h" -#include -#include -#include -#include -#include -#include "compiler/prepare_grammar/extract_choices.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include "compiler/grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::find; -using std::pair; -using std::string; -using std::vector; -using rules::Rule; -using rules::Symbol; - -class FlattenRule { - private: - vector precedence_stack; - vector associativity_stack; - vector alias_stack; - Production production; - - void apply(const Rule &rule, bool at_end) { - rule.match( - [&](const rules::Symbol &symbol) { - production.steps.push_back(ProductionStep{ - symbol, - precedence_stack.back(), - associativity_stack.back(), - alias_stack.back() - }); - }, - - [&](const rules::Metadata &metadata) { - if (metadata.params.has_precedence) { - precedence_stack.push_back(metadata.params.precedence); - } - - if (metadata.params.has_associativity) { - associativity_stack.push_back(metadata.params.associativity); - } - - if (!metadata.params.alias.value.empty()) { - alias_stack.push_back(metadata.params.alias); - } - - if (abs(metadata.params.dynamic_precedence) > abs(production.dynamic_precedence)) { - production.dynamic_precedence = metadata.params.dynamic_precedence; - } - - apply(*metadata.rule, at_end); - - if (metadata.params.has_precedence) { - precedence_stack.pop_back(); - if (!at_end) production.back().precedence = precedence_stack.back(); - } - - if (metadata.params.has_associativity) { - associativity_stack.pop_back(); - if (!at_end) production.back().associativity = associativity_stack.back(); - } - - if (!metadata.params.alias.value.empty()) { - alias_stack.pop_back(); - } - }, - - [&](const rules::Seq &sequence) { - apply(*sequence.left, false); - apply(*sequence.right, at_end); - }, - - [&](const rules::Blank &blank) {}, - - [&](auto) { - assert(!"Unexpected rule type"); - } - ); - } - - public: - FlattenRule() : - precedence_stack({0}), - associativity_stack({rules::AssociativityNone}), - alias_stack({rules::Alias{}}) {} - - Production flatten(const Rule &rule) { - apply(rule, true); - return production; - } -}; - -SyntaxVariable flatten_rule(const Variable &variable) { - vector productions; - - for (const Rule &rule_component : extract_choices(variable.rule)) { - Production production = FlattenRule().flatten(rule_component); - auto end = productions.end(); - if (find(productions.begin(), end, production) == end) { - productions.push_back(production); - } - } - - return SyntaxVariable{variable.name, variable.type, productions}; -} - -static bool variable_is_used(const SyntaxGrammar &grammar, Symbol::Index symbol_index) { - for (const SyntaxVariable &variable : grammar.variables) { - for (const Production &production : variable.productions) { - for (const auto &step : production) { - if (step.symbol == Symbol::non_terminal(symbol_index)) { - return true; - } - } - } - } - return false; -} - -pair flatten_grammar(const InitialSyntaxGrammar &grammar) { - SyntaxGrammar result; - result.external_tokens = grammar.external_tokens; - result.variables_to_inline = grammar.variables_to_inline; - - for (const auto &expected_conflict : grammar.expected_conflicts) { - result.expected_conflicts.insert({ - expected_conflict.begin(), - expected_conflict.end(), - }); - } - - for (const rules::Symbol &extra_token : grammar.extra_tokens) { - result.extra_tokens.insert(extra_token); - } - - for (const auto &variable 
: grammar.variables) { - result.variables.push_back(flatten_rule(variable)); - } - - Symbol::Index i = 0; - for (const auto &variable : result.variables) { - for (const Production &production : variable.productions) { - if (production.empty() && variable_is_used(result, i)) { - return { - result, - CompileError( - TSCompileErrorTypeEpsilonRule, - "The rule `" + variable.name + "` matches the empty string.\n\n" + - "Tree-sitter does not support syntactic rules that match the empty string\n" - "unless they are used only as the grammar's start rule.\n" - ) - }; - } - } - i++; - } - - result.word_token = grammar.word_token; - - return {result, CompileError::none()}; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/flatten_grammar.h b/src/compiler/prepare_grammar/flatten_grammar.h deleted file mode 100644 index 73873d61..00000000 --- a/src/compiler/prepare_grammar/flatten_grammar.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_FLATTEN_GRAMMAR_H_ -#define COMPILER_PREPARE_GRAMMAR_FLATTEN_GRAMMAR_H_ - -#include -#include "tree_sitter/compiler.h" -#include "compiler/compile_error.h" -#include "compiler/grammar.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include "compiler/syntax_grammar.h" - -namespace tree_sitter { -namespace prepare_grammar { - -SyntaxVariable flatten_rule(const Variable &variable); -std::pair flatten_grammar(const InitialSyntaxGrammar &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_FLATTEN_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h deleted file mode 100644 index 7e763d02..00000000 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ -#define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ - -#include -#include -#include "tree_sitter/compiler.h" -#include "compiler/grammar.h" -#include "compiler/syntax_grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -struct InitialSyntaxGrammar { - std::vector variables; - std::set extra_tokens; - std::set> expected_conflicts; - std::vector external_tokens; - std::set variables_to_inline; - rules::Symbol word_token; -}; - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc deleted file mode 100644 index 6cf34f30..00000000 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ /dev/null @@ -1,175 +0,0 @@ -#include "compiler/prepare_grammar/intern_symbols.h" -#include -#include -#include -#include -#include "tree_sitter/compiler.h" -#include "compiler/grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::string; -using std::vector; -using std::set; -using std::pair; -using rules::Symbol; -using rules::Rule; - -class SymbolInterner { - public: - Rule apply(const Rule &rule) { - return rule.match( - [&](const rules::Blank &blank) -> Rule { - return blank; - }, - - [&](const rules::NamedSymbol &symbol) { - return intern_symbol(symbol); - }, - - [&](const rules::String &string) { - return string; - }, - - [&](const rules::Pattern &pattern) { - return pattern; - }, - - [&](const rules::Choice &choice) { - vector elements; - for 
(const auto &element : choice.elements) {
-          elements.push_back(apply(element));
-        }
-        return rules::Choice{elements};
-      },
-
-      [&](const rules::Seq &sequence) {
-        return rules::Seq{apply(*sequence.left), apply(*sequence.right)};
-      },
-
-      [&](const rules::Repeat &repeat) {
-        return rules::Repeat{apply(*repeat.rule)};
-      },
-
-      [&](const rules::Metadata &metadata) {
-        return rules::Metadata{apply(*metadata.rule), metadata.params};
-      },
-
-      [](auto) {
-        assert(!"Unexpected rule type");
-        return rules::Blank{};
-      }
-    );
-  }
-
-  Symbol intern_symbol(rules::NamedSymbol named_symbol) {
-    for (size_t i = 0; i < grammar.variables.size(); i++) {
-      if (grammar.variables[i].name == named_symbol.value) {
-        return Symbol::non_terminal(i);
-      }
-    }
-
-    for (size_t i = 0; i < grammar.external_tokens.size(); i++) {
-      if (grammar.external_tokens[i] == named_symbol) {
-        return Symbol::external(i);
-      }
-    }
-
-    missing_rule_name = named_symbol.value;
-    return rules::NONE();
-  }
-
-  explicit SymbolInterner(const InputGrammar &grammar) : grammar(grammar) {}
-  const InputGrammar &grammar;
-  string missing_rule_name;
-};
-
-CompileError missing_rule_error(string rule_name) {
-  return CompileError(TSCompileErrorTypeUndefinedSymbol,
-                      "Undefined rule '" + rule_name + "'");
-}
-
-pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar) {
-  InternedGrammar result;
-
-  SymbolInterner interner(grammar);
-
-  for (const Rule &external_token : grammar.external_tokens) {
-    string external_token_name;
-    VariableType external_token_type = VariableTypeAnonymous;
-    external_token.match(
-      [&](rules::NamedSymbol named_symbol) {
-        external_token_name = named_symbol.value;
-        if (external_token_name[0] == '_') {
-          external_token_type = VariableTypeHidden;
-        } else {
-          external_token_type = VariableTypeNamed;
-        }
-      },
-      [](auto rule) {}
-    );
-
-    auto new_rule = interner.apply(external_token);
-    if (!interner.missing_rule_name.empty()) {
-      return { result, missing_rule_error(interner.missing_rule_name) };
-    }
-
-    result.external_tokens.push_back(Variable{
-      external_token_name,
-      external_token_type,
-      new_rule,
-    });
-  }
-
-  if (grammar.variables[0].name[0] == '_') {
-    return {result, CompileError(TSCompileErrorTypeInvalidRuleName, "A grammar's start rule must be visible.")};
-  }
-
-  for (auto &variable : grammar.variables) {
-    auto new_rule = interner.apply(variable.rule);
-    if (!interner.missing_rule_name.empty()) {
-      return { result, missing_rule_error(interner.missing_rule_name) };
-    }
-
-    result.variables.push_back(Variable{
-      variable.name,
-      variable.name[0] == '_' ?
VariableTypeHidden : VariableTypeNamed, - new_rule - }); - } - - for (auto &rule : grammar.extra_tokens) { - auto new_rule = interner.apply(rule); - if (!interner.missing_rule_name.empty()) { - return { result, missing_rule_error(interner.missing_rule_name) }; - } - result.extra_tokens.push_back(new_rule); - } - - for (auto &expected_conflict : grammar.expected_conflicts) { - set entry; - for (auto &named_symbol : expected_conflict) { - auto symbol = interner.intern_symbol(named_symbol); - if (symbol != rules::NONE()) { - entry.insert(symbol); - } - } - result.expected_conflicts.insert(entry); - } - - for (auto &named_symbol : grammar.variables_to_inline) { - auto symbol = interner.intern_symbol(named_symbol); - if (symbol != rules::NONE()) { - result.variables_to_inline.insert(symbol); - } - } - - result.word_token = interner.intern_symbol(grammar.word_token); - - return {result, CompileError::none()}; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/intern_symbols.h b/src/compiler/prepare_grammar/intern_symbols.h deleted file mode 100644 index 8e8f2abe..00000000 --- a/src/compiler/prepare_grammar/intern_symbols.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_INTERN_SYMBOLS_H_ -#define COMPILER_PREPARE_GRAMMAR_INTERN_SYMBOLS_H_ - -#include -#include -#include "compiler/compile_error.h" -#include "compiler/prepare_grammar/interned_grammar.h" - -namespace tree_sitter { - -struct InputGrammar; - -namespace prepare_grammar { - -std::pair intern_symbols(const InputGrammar &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_INTERN_SYMBOLS_H_ diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h deleted file mode 100644 index 405172b1..00000000 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ -#define COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ - -#include -#include -#include "tree_sitter/compiler.h" -#include "compiler/grammar.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -struct InternedGrammar { - std::vector variables; - std::vector extra_tokens; - std::set> expected_conflicts; - std::vector external_tokens; - std::set variables_to_inline; - rules::Symbol word_token; -}; - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/normalize_rules.cc b/src/compiler/prepare_grammar/normalize_rules.cc deleted file mode 100644 index 28602b2b..00000000 --- a/src/compiler/prepare_grammar/normalize_rules.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "compiler/prepare_grammar/normalize_rules.h" -#include "compiler/prepare_grammar/extract_choices.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::vector; -using rules::Rule; - -LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) { - LexicalGrammar result(input_grammar); - - for (LexicalVariable &variable : result.variables) { - variable.rule = Rule::choice(extract_choices(variable.rule)); - } - - return result; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/normalize_rules.h b/src/compiler/prepare_grammar/normalize_rules.h deleted file mode 100644 index 4938e116..00000000 --- a/src/compiler/prepare_grammar/normalize_rules.h +++ /dev/null @@ 
-1,14 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_NORMALIZE_RULES_H_ -#define COMPILER_PREPARE_GRAMMAR_NORMALIZE_RULES_H_ - -#include "compiler/lexical_grammar.h" - -namespace tree_sitter { -namespace prepare_grammar { - -LexicalGrammar normalize_rules(const LexicalGrammar &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_NORMALIZE_RULES_H_ diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc deleted file mode 100644 index 2b386907..00000000 --- a/src/compiler/prepare_grammar/parse_regex.cc +++ /dev/null @@ -1,337 +0,0 @@ -#include "compiler/prepare_grammar/parse_regex.h" -#include -#include -#include -#include -#include "compiler/rule.h" -#include "compiler/util/string_helpers.h" -#include "utf8proc.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::string; -using std::vector; -using std::pair; -using std::iswdigit; -using rules::CharacterSet; -using rules::Blank; -using rules::Rule; - -class PatternParser { - public: - explicit PatternParser(const string &input) - : input(input), - iter((const uint8_t *)input.data()), - end(iter + input.size()) { - next(); - } - - pair rule(bool nested) { - vector choices; - do { - if (!choices.empty()) { - if (peek() == '|') { - next(); - } else { - break; - } - } - auto pair = term(nested); - if (pair.second.type) { - return {Blank{}, pair.second }; - } - choices.push_back(pair.first); - } while (has_more_input()); - return {Rule::choice(choices), CompileError::none()}; - } - - private: - pair term(bool nested) { - Rule result; - do { - if (peek() == '|') - break; - if (nested && peek() == ')') - break; - auto pair = factor(); - if (pair.second) { - return {Blank{}, pair.second}; - } - result = Rule::seq({result, pair.first}); - } while (has_more_input()); - return { result, CompileError::none() }; - } - - pair factor() { - auto pair = atom(); - if (pair.second.type) { - return {Blank{}, pair.second}; - } - - Rule result = pair.first; - if (has_more_input()) { - switch (peek()) { - case '*': - next(); - result = Rule::choice({ - Rule::repeat(result), - Blank{} - }); - break; - case '+': - next(); - result = Rule::repeat(result); - break; - case '?': - next(); - result = Rule::choice({result, Blank{}}); - break; - case '{': { - Checkpoint checkpoint = get_checkpoint(); - next(); - - string min_repeat_string; - while (iswdigit(peek())) { - min_repeat_string += (char)peek(); - next(); - } - - bool has_comma = false; - string max_repeat_string; - if (peek() == ',') { - next(); - has_comma = true; - while (iswdigit(peek())) { - max_repeat_string += (char)peek(); - next(); - } - } - - if (peek() == '}' && (!min_repeat_string.empty() || has_comma)) { - next(); - if (min_repeat_string.size()) { - unsigned min_count = std::stoi(min_repeat_string); - vector entries(min_count, result); - if (max_repeat_string.size()) { - unsigned max_count = std::stoi(max_repeat_string); - if (max_count < min_count) { - return error("numbers out of order in {} quantifier"); - } - vector optional_entries(max_count - min_count, Rule::choice({result, Blank{}})); - entries.insert(entries.end(), optional_entries.begin(), optional_entries.end()); - } else if (has_comma) { - entries.push_back(Rule::choice({Rule::repeat(result), Blank{} })); - } - result = Rule::seq(entries); - } else if (max_repeat_string.size()) { - unsigned max_count = std::stoi(max_repeat_string); - vector optional_entries(max_count, Rule::choice({result, Blank{}})); - result = 
Rule::seq(optional_entries); - } else { - result = Rule::repeat(result); - } - } else { - revert(checkpoint); - } - - break; - } - } - } - - return {result, CompileError::none()}; - } - - pair atom() { - switch (peek()) { - case '(': { - next(); - auto pair = rule(true); - if (pair.second.type) { - return {Blank{}, pair.second}; - } - if (peek() != ')') { - return error("unmatched open paren"); - } - next(); - return {pair.first, CompileError::none()}; - } - - case '[': { - next(); - auto pair = char_set(); - if (pair.second.type) { - return {Blank{}, pair.second}; - } - if (peek() != ']') { - return error("unmatched open square bracket"); - } - next(); - return {pair.first, CompileError::none()}; - } - - case ')': { - return error("unmatched close paren"); - } - - case ']': { - return error("unmatched close square bracket"); - } - - case '.': { - next(); - return { - CharacterSet().include_all().exclude('\n'), - CompileError::none() - }; - } - - default: { - return {single_char(), CompileError::none()}; - } - } - } - - pair char_set() { - CharacterSet result; - bool is_affirmative = true; - if (peek() == '^') { - next(); - is_affirmative = false; - result.include_all(); - } - - while (has_more_input() && (peek() != ']')) { - auto characters = single_char(); - - if (peek() == '-') { - next(); - if (!characters.includes_all && characters.included_chars.size() == 1 && peek() != ']') { - auto next_characters = single_char(); - if (!next_characters.includes_all && next_characters.included_chars.size() == 1) { - characters.include( - *characters.included_chars.begin(), - *next_characters.included_chars.begin() - ); - } else { - characters.include('-'); - characters.add_set(next_characters); - } - } else { - characters.include('-'); - } - } - - if (is_affirmative) - result.add_set(characters); - else - result.remove_set(characters); - } - - return { result, CompileError::none() }; - } - - CharacterSet single_char() { - CharacterSet value; - if (peek() == '\\') { - next(); - value = escaped_char(peek()); - next(); - } else { - value = CharacterSet().include(peek()); - next(); - } - return value; - } - - CharacterSet escaped_char(uint32_t value) { - switch (value) { - case 'w': - return CharacterSet() - .include('a', 'z') - .include('A', 'Z') - .include('0', '9') - .include('_'); - case 'W': - return CharacterSet() - .include_all() - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_'); - case 'd': - return CharacterSet().include('0', '9'); - case 'D': - return CharacterSet().include_all().exclude('0', '9'); - case 's': - return CharacterSet() - .include(' ') - .include('\t') - .include('\n') - .include('\r'); - case 'S': - return CharacterSet() - .include_all() - .exclude(' ') - .exclude('\t') - .exclude('\n') - .exclude('\r'); - case '0': - return CharacterSet().include('\0'); - case 't': - return CharacterSet().include('\t'); - case 'n': - return CharacterSet().include('\n'); - case 'r': - return CharacterSet().include('\r'); - default: - return CharacterSet().include(value); - } - } - - void next() { - size_t lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead); - if (!lookahead_size) - lookahead = 0; - iter += lookahead_size; - } - - struct Checkpoint { - const uint8_t *iter; - int32_t lookahead; - }; - - Checkpoint get_checkpoint() { - return Checkpoint{iter, lookahead}; - } - - void revert(Checkpoint checkpoint) { - iter = checkpoint.iter; - lookahead = checkpoint.lookahead; - } - - uint32_t peek() { - return lookahead; - } - - bool has_more_input() { - 
return lookahead && iter <= end; - } - - pair error(string msg) { - return { Blank{}, CompileError(TSCompileErrorTypeInvalidRegex, msg) }; - } - - string input; - const uint8_t *iter; - const uint8_t *end; - int32_t lookahead; -}; - -pair parse_regex(const std::string &input) { - return PatternParser(input.c_str()).rule(false); -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/parse_regex.h b/src/compiler/prepare_grammar/parse_regex.h deleted file mode 100644 index b1c03f30..00000000 --- a/src/compiler/prepare_grammar/parse_regex.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ -#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ - -#include -#include -#include "compiler/rule.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { -namespace prepare_grammar { - -std::pair parse_regex(const std::string &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_ diff --git a/src/compiler/prepare_grammar/prepare_grammar.cc b/src/compiler/prepare_grammar/prepare_grammar.cc deleted file mode 100644 index aef16846..00000000 --- a/src/compiler/prepare_grammar/prepare_grammar.cc +++ /dev/null @@ -1,66 +0,0 @@ -#include "compiler/prepare_grammar/prepare_grammar.h" -#include "compiler/prepare_grammar/expand_repeats.h" -#include "compiler/prepare_grammar/expand_tokens.h" -#include "compiler/prepare_grammar/extract_tokens.h" -#include "compiler/prepare_grammar/extract_simple_aliases.h" -#include "compiler/prepare_grammar/intern_symbols.h" -#include "compiler/prepare_grammar/flatten_grammar.h" -#include "compiler/prepare_grammar/normalize_rules.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/syntax_grammar.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::get; -using std::move; - -PrepareGrammarResult prepare_grammar(const InputGrammar &input_grammar) { - PrepareGrammarResult result; - - // Convert all string-based `NamedSymbols` into numerical `Symbols` - auto intern_result = intern_symbols(input_grammar); - CompileError error = intern_result.second; - if (error.type) { - result.error = error; - return result; - } - - // Separate grammar into lexical and syntactic components - auto extract_result = extract_tokens(intern_result.first); - error = get<2>(extract_result); - if (error.type) { - result.error = error; - return result; - } - - // Replace `Repeat` rules with pairs of recursive rules - InitialSyntaxGrammar syntax_grammar1 = expand_repeats(get<0>(extract_result)); - - // Flatten syntax rules into lists of productions. - auto flatten_result = flatten_grammar(syntax_grammar1); - SyntaxGrammar syntax_grammar = flatten_result.first; - error = flatten_result.second; - if (error.type) { - result.error = error; - return result; - } - - // Ensure all lexical rules are in a consistent format. - LexicalGrammar lexical_grammar = normalize_rules(get<1>(extract_result)); - - // Find any symbols that always have the same alias applied to them. - // Remove those aliases since they can be applied in a simpler way. 
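// A minimal, self-contained sketch of the "simple alias" idea the comment
// above describes, under assumed names (StepSketch, find_simple_aliases),
// not the real extract_simple_aliases: an alias is "simple" for a symbol
// when every production step that produces that symbol applies the same
// alias, so it can be applied once up front instead of being tracked
// through the parse table.
#include <map>
#include <string>
#include <vector>

struct StepSketch {
  int symbol;
  std::string alias;  // empty string means "no alias on this step"
};

std::map<int, std::string> find_simple_aliases(
    const std::vector<std::vector<StepSketch>> &productions) {
  std::map<int, std::string> first_alias_seen;
  std::map<int, bool> conflicting;
  for (const auto &production : productions) {
    for (const auto &step : production) {
      auto entry = first_alias_seen.find(step.symbol);
      if (entry == first_alias_seen.end()) {
        first_alias_seen[step.symbol] = step.alias;
      } else if (entry->second != step.alias) {
        conflicting[step.symbol] = true;  // two different aliases: not simple
      }
    }
  }
  std::map<int, std::string> result;
  for (const auto &entry : first_alias_seen) {
    if (!entry.second.empty() && !conflicting[entry.first]) {
      result.insert(entry);
    }
  }
  return result;
}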
- auto simple_aliases = extract_simple_aliases(&syntax_grammar, &lexical_grammar); - - return { - move(syntax_grammar), - move(lexical_grammar), - move(simple_aliases), - CompileError::none(), - }; -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/prepare_grammar.h b/src/compiler/prepare_grammar/prepare_grammar.h deleted file mode 100644 index d71beed1..00000000 --- a/src/compiler/prepare_grammar/prepare_grammar.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_ -#define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_ - -#include -#include "compiler/grammar.h" -#include "compiler/syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { -namespace prepare_grammar { - -struct PrepareGrammarResult { - SyntaxGrammar syntax_grammar; - LexicalGrammar lexical_grammar; - std::unordered_map simple_aliases; - CompileError error; -}; - -PrepareGrammarResult prepare_grammar(const InputGrammar &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/token_description.cc b/src/compiler/prepare_grammar/token_description.cc deleted file mode 100644 index f10904ba..00000000 --- a/src/compiler/prepare_grammar/token_description.cc +++ /dev/null @@ -1,83 +0,0 @@ -#include "compiler/prepare_grammar/token_description.h" -#include "compiler/rule.h" -#include "compiler/util/string_helpers.h" - -namespace tree_sitter { -namespace prepare_grammar { - -using std::string; -using rules::Rule; - -class TokenDescription { - bool is_trivial; - - string apply(const Rule &rule) { - return rule.match( - [&](const rules::Blank) -> string { - return ""; - }, - - [&](const rules::Symbol) { - return ""; - }, - - [&](const rules::Pattern &rule) { - is_trivial = false; - return rule.value; - }, - - [&](const rules::String &rule) { - return rule.value; - }, - - [&](const rules::Metadata &rule) { - return apply(*rule.rule); - }, - - [&](const rules::Seq &rule) { - is_trivial = false; - return apply(*rule.left) + apply(*rule.right); - }, - - [&](const rules::Repeat &rule) { - is_trivial = false; - return apply(*rule.rule) + "+"; - }, - - [&](const rules::Choice &rule) { - is_trivial = false; - string result = "("; - bool started = false; - for (auto &element : rule.elements) { - if (started) result += "|"; - result += apply(element); - started = true; - } - return result + ")"; - }, - - [](auto) { - return ""; - } - ); - } - - public: - string describe(const Rule &rule) { - string result = apply(rule); - if (is_trivial) { - return result; - } else { - return "/" + result + "/"; - } - } - - TokenDescription() : is_trivial(true) {} -}; - -string token_description(const Rule &rule) { - return TokenDescription().describe(rule); -} - -} // namespace prepare_grammar -} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/token_description.h b/src/compiler/prepare_grammar/token_description.h deleted file mode 100644 index 6d83f7a9..00000000 --- a/src/compiler/prepare_grammar/token_description.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_TOKEN_DESCRIPTION_H_ -#define COMPILER_PREPARE_GRAMMAR_TOKEN_DESCRIPTION_H_ - -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace prepare_grammar { - -std::string token_description(const rules::Rule &); - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // 
COMPILER_PREPARE_GRAMMAR_TOKEN_DESCRIPTION_H_ diff --git a/src/compiler/property_sheet.h b/src/compiler/property_sheet.h deleted file mode 100644 index 38427d3d..00000000 --- a/src/compiler/property_sheet.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef COMPILER_PROPERTY_SHEET_H_ -#define COMPILER_PROPERTY_SHEET_H_ - -#include -#include -#include - -namespace tree_sitter { - -struct PropertySelectorStep { - std::string type; - bool named; - bool is_immediate; - int index; - std::string text_pattern; - - inline bool operator==(const PropertySelectorStep &other) const { - return - type == other.type && - named == other.named && - is_immediate == other.is_immediate && - index == other.index; - } -}; - -typedef std::vector PropertySelector; - -typedef std::map PropertySet; - -struct PropertyRule { - std::vector selectors; - PropertySet properties; -}; - -typedef std::vector PropertySheet; - -} // namespace tree_sitter - -#endif // COMPILER_PROPERTY_SHEET_H_ diff --git a/src/compiler/property_table.h b/src/compiler/property_table.h deleted file mode 100644 index 58c17d62..00000000 --- a/src/compiler/property_table.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef COMPILER_PROPERTY_TABLE_H_ -#define COMPILER_PROPERTY_TABLE_H_ - -#include -#include -#include -#include "compiler/property_sheet.h" - -namespace tree_sitter { - -struct PropertyTransition { - std::string type; - bool named; - int index; - std::string text_pattern; - unsigned state_id; - - bool operator==(const PropertyTransition &other) const { - return - type == other.type && - named == other.named && - index == other.index && - text_pattern == other.text_pattern && - state_id == other.state_id; - } -}; - -struct PropertyState { - std::vector transitions; - unsigned default_next_state_id; - unsigned property_set_id; - - bool operator==(const PropertyState &other) const { - return - transitions == other.transitions && - default_next_state_id == other.default_next_state_id && - property_set_id == other.property_set_id; - } -}; - -struct PropertyTable { - std::vector states; - std::vector property_sets; -}; - -} // namespace tree_sitter - -#endif // COMPILER_PROPERTY_TABLE_H_ diff --git a/src/compiler/rule.cc b/src/compiler/rule.cc deleted file mode 100644 index e7277459..00000000 --- a/src/compiler/rule.cc +++ /dev/null @@ -1,296 +0,0 @@ -#include "compiler/rule.h" -#include "compiler/util/hash_combine.h" - -namespace tree_sitter { -namespace rules { - -using std::move; -using std::vector; -using util::hash_combine; - -Rule::Rule(const Rule &other) : blank_(Blank{}), type(BlankType) { - *this = other; -} - -Rule::Rule(Rule &&other) noexcept : blank_(Blank{}), type(BlankType) { - *this = move(other); -} - -static void destroy_value(Rule *rule) { - switch (rule->type) { - case Rule::BlankType: return rule->blank_.~Blank(); - case Rule::CharacterSetType: return rule->character_set_.~CharacterSet(); - case Rule::StringType: return rule->string_ .~String(); - case Rule::PatternType: return rule->pattern_ .~Pattern(); - case Rule::NamedSymbolType: return rule->named_symbol_.~NamedSymbol(); - case Rule::SymbolType: return rule->symbol_ .~Symbol(); - case Rule::ChoiceType: return rule->choice_ .~Choice(); - case Rule::MetadataType: return rule->metadata_ .~Metadata(); - case Rule::RepeatType: return rule->repeat_ .~Repeat(); - case Rule::SeqType: return rule->seq_ .~Seq(); - } -} - -Rule &Rule::operator=(const Rule &other) { - destroy_value(this); - type = other.type; - switch (type) { - case BlankType: - new (&blank_) Blank(other.blank_); - break; - case 
CharacterSetType: - new (&character_set_) CharacterSet(other.character_set_); - break; - case StringType: - new (&string_) String(other.string_); - break; - case PatternType: - new (&pattern_) Pattern(other.pattern_); - break; - case NamedSymbolType: - new (&named_symbol_) NamedSymbol(other.named_symbol_); - break; - case SymbolType: - new (&symbol_) Symbol(other.symbol_); - break; - case ChoiceType: - new (&choice_) Choice(other.choice_); - break; - case MetadataType: - new (&metadata_) Metadata(other.metadata_); - break; - case RepeatType: - new (&repeat_) Repeat(other.repeat_); - break; - case SeqType: - new (&seq_) Seq(other.seq_); - break; - } - return *this; -} - -Rule &Rule::operator=(Rule &&other) noexcept { - destroy_value(this); - type = other.type; - switch (type) { - case BlankType: - new (&blank_) Blank(move(other.blank_)); - break; - case CharacterSetType: - new (&character_set_) CharacterSet(move(other.character_set_)); - break; - case StringType: - new (&string_) String(move(other.string_)); - break; - case PatternType: - new (&pattern_) Pattern(move(other.pattern_)); - break; - case NamedSymbolType: - new (&named_symbol_) NamedSymbol(move(other.named_symbol_)); - break; - case SymbolType: - new (&symbol_) Symbol(move(other.symbol_)); - break; - case ChoiceType: - new (&choice_) Choice(move(other.choice_)); - break; - case MetadataType: - new (&metadata_) Metadata(move(other.metadata_)); - break; - case RepeatType: - new (&repeat_) Repeat(move(other.repeat_)); - break; - case SeqType: - new (&seq_) Seq(move(other.seq_)); - break; - } - other.type = BlankType; - other.blank_ = Blank{}; - return *this; -} - -Rule::~Rule() noexcept { - destroy_value(this); -} - -bool Rule::operator==(const Rule &other) const { - if (type != other.type) return false; - switch (type) { - case Rule::CharacterSetType: return character_set_ == other.character_set_; - case Rule::StringType: return string_ == other.string_; - case Rule::PatternType: return pattern_ == other.pattern_; - case Rule::NamedSymbolType: return named_symbol_ == other.named_symbol_; - case Rule::SymbolType: return symbol_ == other.symbol_; - case Rule::ChoiceType: return choice_ == other.choice_; - case Rule::MetadataType: return metadata_ == other.metadata_; - case Rule::RepeatType: return repeat_ == other.repeat_; - case Rule::SeqType: return seq_ == other.seq_; - default: return blank_ == other.blank_; - } -} - -template <> -bool Rule::is() const { return type == BlankType; } - -template <> -bool Rule::is() const { return type == SymbolType; } - -template <> -bool Rule::is() const { return type == StringType; } - -template <> -bool Rule::is() const { return type == RepeatType; } - -template <> -bool Rule::is() const { return type == MetadataType; } - -template <> -const Symbol & Rule::get_unchecked() const { return symbol_; } - -template <> -const Metadata & Rule::get_unchecked() const { return metadata_; } - -static inline void add_choice_element(std::vector *elements, const Rule &new_rule) { - new_rule.match( - [elements](Choice choice) { - for (auto &element : choice.elements) { - add_choice_element(elements, element); - } - }, - - [elements](auto rule) { - for (auto &element : *elements) { - if (element == rule) return; - } - elements->push_back(rule); - } - ); -} - -Rule Rule::choice(const vector &rules) { - vector elements; - for (auto &element : rules) { - add_choice_element(&elements, element); - } - return (elements.size() == 1) ? 
elements.front() : Choice{elements}; -} - -Rule Rule::repeat(const Rule &rule) { - return rule.is() ? rule : Repeat{rule}; -} - -Rule Rule::seq(const vector &rules) { - Rule result; - for (const auto &rule : rules) { - rule.match( - [](Blank) {}, - [&](Metadata metadata) { - if (!metadata.rule->is()) { - result = Seq{result, rule}; - } - }, - [&](auto) { - if (result.is()) { - result = rule; - } else { - result = Seq{result, rule}; - } - } - ); - } - return result; -} - -} // namespace rules -} // namespace tree_sitter - -namespace std { - -size_t hash::operator()(const Symbol &symbol) const { - auto result = hash()(symbol.index); - hash_combine(&result, hash()(symbol.type)); - return result; -} - -size_t hash::operator()(const NamedSymbol &symbol) const { - return hash()(symbol.value); -} - -size_t hash::operator()(const Pattern &symbol) const { - return hash()(symbol.value); -} - -size_t hash::operator()(const String &symbol) const { - return hash()(symbol.value); -} - -size_t hash::operator()(const CharacterSet &character_set) const { - size_t result = 0; - hash_combine(&result, character_set.includes_all); - hash_combine(&result, character_set.included_chars.size()); - for (uint32_t c : character_set.included_chars) { - hash_combine(&result, c); - } - hash_combine(&result, character_set.excluded_chars.size()); - for (uint32_t c : character_set.excluded_chars) { - hash_combine(&result, c); - } - return result; -} - -size_t hash::operator()(const Blank &blank) const { - return 0; -} - -size_t hash::operator()(const Choice &choice) const { - size_t result = 0; - for (const auto &element : choice.elements) { - symmetric_hash_combine(&result, element); - } - return result; -} - -size_t hash::operator()(const Repeat &repeat) const { - size_t result = 0; - hash_combine(&result, *repeat.rule); - return result; -} - -size_t hash::operator()(const Seq &seq) const { - size_t result = 0; - hash_combine(&result, *seq.left); - hash_combine(&result, *seq.right); - return result; -} - -size_t hash::operator()(const Metadata &metadata) const { - size_t result = 0; - hash_combine(&result, *metadata.rule); - hash_combine(&result, metadata.params.precedence); - hash_combine(&result, metadata.params.associativity); - hash_combine(&result, metadata.params.has_precedence); - hash_combine(&result, metadata.params.has_associativity); - hash_combine(&result, metadata.params.is_token); - hash_combine(&result, metadata.params.is_string); - hash_combine(&result, metadata.params.is_active); - hash_combine(&result, metadata.params.is_main_token); - return result; -} - -size_t hash::operator()(const Rule &rule) const { - size_t result = hash()(rule.type); - switch (rule.type) { - case Rule::CharacterSetType: return result ^ hash()(rule.character_set_); - case Rule::StringType: return result ^ hash()(rule.string_); - case Rule::PatternType: return result ^ hash()(rule.pattern_); - case Rule::NamedSymbolType: return result ^ hash()(rule.named_symbol_); - case Rule::SymbolType: return result ^ hash()(rule.symbol_); - case Rule::ChoiceType: return result ^ hash()(rule.choice_); - case Rule::MetadataType: return result ^ hash()(rule.metadata_); - case Rule::RepeatType: return result ^ hash()(rule.repeat_); - case Rule::SeqType: return result ^ hash()(rule.seq_); - default: return result ^ hash()(rule.blank_); - } -} - -} // namespace std diff --git a/src/compiler/rule.h b/src/compiler/rule.h deleted file mode 100644 index b66e2c63..00000000 --- a/src/compiler/rule.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef 
COMPILER_RULE_H_ -#define COMPILER_RULE_H_ - -#include -#include -#include "compiler/util/make_visitor.h" -#include "compiler/util/hash_combine.h" -#include "compiler/rules/blank.h" -#include "compiler/rules/character_set.h" -#include "compiler/rules/choice.h" -#include "compiler/rules/metadata.h" -#include "compiler/rules/named_symbol.h" -#include "compiler/rules/pattern.h" -#include "compiler/rules/repeat.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/string.h" -#include "compiler/rules/symbol.h" - -namespace tree_sitter { -namespace rules { - -struct Rule { - union { - Blank blank_; - CharacterSet character_set_; - String string_; - Pattern pattern_; - NamedSymbol named_symbol_; - Symbol symbol_; - Choice choice_; - Metadata metadata_; - Repeat repeat_; - Seq seq_; - }; - - enum { - BlankType, - CharacterSetType, - StringType, - PatternType, - NamedSymbolType, - SymbolType, - ChoiceType, - MetadataType, - RepeatType, - SeqType, - } type; - - Rule() : blank_(Blank{}), type(BlankType) {}; - Rule(const Blank &value) : blank_(value), type(BlankType) {}; - Rule(const CharacterSet &value) : character_set_(value), type(CharacterSetType) {}; - Rule(const String &value) : string_(value), type(StringType) {}; - Rule(const Pattern &value) : pattern_(value), type(PatternType) {}; - Rule(const NamedSymbol &value) : named_symbol_(value), type(NamedSymbolType) {}; - Rule(const Symbol &value) : symbol_(value), type(SymbolType) {}; - Rule(const Choice &value) : choice_(value), type(ChoiceType) {}; - Rule(const Metadata &value) : metadata_(value), type(MetadataType) {}; - Rule(const Repeat &value) : repeat_(value), type(RepeatType) {}; - Rule(const Seq &value) : seq_(value), type(SeqType) {}; - - Rule(const Rule &other); - Rule(Rule &&other) noexcept; - Rule &operator=(const Rule &other); - Rule &operator=(Rule &&other) noexcept; - ~Rule() noexcept; - - static Rule choice(const std::vector &rules); - static Rule seq(const std::vector &rules); - static Rule repeat(const Rule &rule); - - template - bool is() const; - - template - const RuleType & get_unchecked() const; - - template - inline auto accept(FunctionType function) const -> decltype(function(blank_)) { - switch (type) { - case CharacterSetType: return function(character_set_); - case StringType: return function(string_); - case PatternType: return function(pattern_); - case NamedSymbolType: return function(named_symbol_); - case SymbolType: return function(symbol_); - case ChoiceType: return function(choice_); - case MetadataType: return function(metadata_); - case RepeatType: return function(repeat_); - case SeqType: return function(seq_); - default: return function(blank_); - } - } - - template - inline auto match(FunctionTypes && ...functions) const -> decltype(accept(util::make_visitor(std::forward(functions)...))) { - return accept(util::make_visitor(std::forward(functions)...)); - } - - bool operator==(const Rule &other) const; -}; - -} // namespace rules -} // namespace tree_sitter - -namespace std { - -using namespace tree_sitter::rules; -using namespace tree_sitter::util; - -template <> -struct hash { size_t operator()(const Symbol &) const; }; - -template <> -struct hash { size_t operator()(const NamedSymbol &) const; }; - -template <> -struct hash { size_t operator()(const Pattern &) const; }; - -template <> -struct hash { size_t operator()(const String &) const; }; - -template <> -struct hash { size_t operator()(const CharacterSet &) const; }; - -template <> -struct hash { size_t operator()(const Blank &) const; }; - 
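// The std::hash specializations here follow one pattern: hash each member,
// then fold the results into a seed. Note that hash<Choice> above uses
// symmetric_hash_combine (a plain XOR fold), making the hash independent of
// element order, while the other rules use the order-sensitive boost-style
// hash_combine from util/hash_combine.h later in this diff. A small
// self-contained sketch of the difference (combine/combine_symmetric are
// stand-in names):
#include <cstddef>
#include <iostream>

void combine(std::size_t *seed, std::size_t value) {  // order-sensitive mix
  *seed ^= value + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

void combine_symmetric(std::size_t *seed, std::size_t value) {  // XOR fold
  *seed ^= value;
}

int main() {
  std::size_t ab = 0, ba = 0;
  combine_symmetric(&ab, 1); combine_symmetric(&ab, 2);
  combine_symmetric(&ba, 2); combine_symmetric(&ba, 1);
  std::cout << (ab == ba) << "\n";  // 1: XOR folding ignores element order

  std::size_t cd = 0, dc = 0;
  combine(&cd, 1); combine(&cd, 2);
  combine(&dc, 2); combine(&dc, 1);
  std::cout << (cd == dc) << "\n";  // 0 in practice: mixing makes order matter
}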
-template <> -struct hash { size_t operator()(const Choice &) const; }; - -template <> -struct hash { size_t operator()(const Repeat &) const; }; - -template <> -struct hash { size_t operator()(const Seq &) const; }; - -template <> -struct hash { size_t operator()(const Metadata &) const; }; - -template <> -struct hash { size_t operator()(const Rule &) const; }; - -} // namespace std - -#endif // COMPILER_RULE_H_ diff --git a/src/compiler/rules/blank.h b/src/compiler/rules/blank.h deleted file mode 100644 index aa7ed5c9..00000000 --- a/src/compiler/rules/blank.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef COMPILER_RULES_BLANK_H_ -#define COMPILER_RULES_BLANK_H_ - -namespace tree_sitter { -namespace rules { - -struct Blank { - inline bool operator==(const Blank &other) const { - return true; - } -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_BLANK_H_ \ No newline at end of file diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc deleted file mode 100644 index b0064cbb..00000000 --- a/src/compiler/rules/character_set.cc +++ /dev/null @@ -1,176 +0,0 @@ -#include "compiler/rules/character_set.h" - -using std::set; -using std::vector; - -namespace tree_sitter { -namespace rules { - -static void add_range(set *characters, uint32_t min, uint32_t max) { - for (uint32_t c = min; c <= max; c++) { - characters->insert(c); - } -} - -static void remove_range(set *characters, uint32_t min, uint32_t max) { - for (uint32_t c = min; c <= max; c++) { - characters->erase(c); - } -} - -static set remove_chars(set *left, const set &right) { - set result; - for (uint32_t c : right) { - if (left->erase(c)) { - result.insert(c); - } - } - return result; -} - -static set add_chars(set *left, const set &right) { - set result; - for (uint32_t c : right) { - if (left->insert(c).second) { - result.insert(c); - } - } - return result; -} - -static vector consolidate_ranges(const set &characters) { - vector result; - for (uint32_t c : characters) { - if (!result.empty() && result.back().max == c - 1) { - result.back().max = c; - } else { - result.push_back(CharacterRange(c)); - } - } - return result; -} - -CharacterSet::CharacterSet() : includes_all(false) {} - -CharacterSet::CharacterSet(const set &chars) : included_chars(chars), includes_all(false) {} - -bool CharacterSet::operator==(const CharacterSet &other) const { - return includes_all == other.includes_all && - included_chars == other.included_chars && - excluded_chars == other.excluded_chars; -} - -bool CharacterSet::operator<(const CharacterSet &other) const { - if (!includes_all && other.includes_all) return true; - if (includes_all && !other.includes_all) return false; - if (includes_all) { - if (excluded_chars.size() > other.excluded_chars.size()) return true; - if (excluded_chars.size() < other.excluded_chars.size()) return false; - return excluded_chars < other.excluded_chars; - } else { - if (included_chars.size() < other.included_chars.size()) return true; - if (included_chars.size() > other.included_chars.size()) return false; - return included_chars < other.included_chars; - } -} - -CharacterSet &CharacterSet::include_all() { - includes_all = true; - included_chars = {}; - excluded_chars = { 0 }; - return *this; -} - -CharacterSet &CharacterSet::include(uint32_t min, uint32_t max) { - if (includes_all) - remove_range(&excluded_chars, min, max); - else - add_range(&included_chars, min, max); - return *this; -} - -CharacterSet &CharacterSet::exclude(uint32_t min, uint32_t max) { - if 
(includes_all) - add_range(&excluded_chars, min, max); - else - remove_range(&included_chars, min, max); - return *this; -} - -CharacterSet &CharacterSet::include(uint32_t c) { - return include(c, c); -} - -CharacterSet &CharacterSet::exclude(uint32_t c) { - return exclude(c, c); -} - -bool CharacterSet::is_empty() const { - return !includes_all && included_chars.empty(); -} - -void CharacterSet::add_set(const CharacterSet &other) { - if (includes_all) { - if (other.includes_all) { - excluded_chars = remove_chars(&excluded_chars, other.excluded_chars); - } else { - remove_chars(&excluded_chars, other.included_chars); - } - } else { - if (other.includes_all) { - includes_all = true; - for (uint32_t c : other.excluded_chars) - if (!included_chars.count(c)) - excluded_chars.insert(c); - included_chars.clear(); - } else { - included_chars.insert(other.included_chars.begin(), other.included_chars.end()); - } - } -} - -CharacterSet CharacterSet::remove_set(const CharacterSet &other) { - CharacterSet result; - if (includes_all) { - if (other.includes_all) { - result.includes_all = true; - result.excluded_chars = excluded_chars; - included_chars = add_chars(&result.excluded_chars, other.excluded_chars); - excluded_chars = {}; - includes_all = false; - } else { - result.included_chars = add_chars(&excluded_chars, other.included_chars); - } - } else { - if (other.includes_all) { - result.included_chars = included_chars; - included_chars = - remove_chars(&result.included_chars, other.excluded_chars); - } else { - result.included_chars = - remove_chars(&included_chars, other.included_chars); - } - } - return result; -} - -bool CharacterSet::intersects(const CharacterSet &other) const { - CharacterSet copy(*this); - return !copy.remove_set(other).is_empty(); -} - -CharacterSet CharacterSet::intersection(const CharacterSet &other) const { - CharacterSet copy(*this); - return copy.remove_set(other); -} - -vector CharacterSet::included_ranges() const { - return consolidate_ranges(included_chars); -} - -vector CharacterSet::excluded_ranges() const { - return consolidate_ranges(excluded_chars); -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/character_set.h b/src/compiler/rules/character_set.h deleted file mode 100644 index c49b0d1d..00000000 --- a/src/compiler/rules/character_set.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef COMPILER_RULES_CHARACTER_SET_H_ -#define COMPILER_RULES_CHARACTER_SET_H_ - -#include -#include -#include - -namespace tree_sitter { -namespace rules { - -struct CharacterRange { - uint32_t min; - uint32_t max; - - inline explicit CharacterRange(uint32_t value) : min{value}, max{value} {} - inline CharacterRange(uint32_t min, uint32_t max) : min{min}, max{max} {} - - inline bool operator==(const CharacterRange &other) const { - return min == other.min && max == other.max; - } -}; - -struct CharacterSet { - CharacterSet(); - CharacterSet(const std::set &); - - CharacterSet &include_all(); - CharacterSet &include(uint32_t c); - CharacterSet &include(uint32_t min, uint32_t max); - CharacterSet &exclude(uint32_t c); - CharacterSet &exclude(uint32_t min, uint32_t max); - - bool operator==(const CharacterSet &) const; - bool operator<(const CharacterSet &) const; - - void add_set(const CharacterSet &other); - CharacterSet remove_set(const CharacterSet &other); - CharacterSet intersection(const CharacterSet &other) const; - bool intersects(const CharacterSet &other) const; - bool is_empty() const; - - std::vector included_ranges() const; - std::vector 
excluded_ranges() const; - - std::set included_chars; - std::set excluded_chars; - bool includes_all; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_CHARACTER_SET_H_ diff --git a/src/compiler/rules/choice.cc b/src/compiler/rules/choice.cc deleted file mode 100644 index 1b3be56c..00000000 --- a/src/compiler/rules/choice.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "compiler/rules/choice.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace rules { - -bool Choice::operator==(const Choice &other) const { - return elements == other.elements; -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/choice.h b/src/compiler/rules/choice.h deleted file mode 100644 index 6365a565..00000000 --- a/src/compiler/rules/choice.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef COMPILER_RULES_CHOICE_H_ -#define COMPILER_RULES_CHOICE_H_ - -#include -#include - -namespace tree_sitter { -namespace rules { - -struct Rule; - -struct Choice { - std::vector elements; - - bool operator==(const Choice &other) const; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_CHOICE_H_ \ No newline at end of file diff --git a/src/compiler/rules/metadata.cc b/src/compiler/rules/metadata.cc deleted file mode 100644 index c54d29cd..00000000 --- a/src/compiler/rules/metadata.cc +++ /dev/null @@ -1,162 +0,0 @@ -#include "compiler/rules/metadata.h" -#include -#include -#include "compiler/rule.h" - -namespace tree_sitter { -namespace rules { - -using std::move; -using std::string; - -bool Alias::operator==(const Alias &other) const { - return value == other.value && is_named == other.is_named; -} - -bool Alias::operator!=(const Alias &other) const { - return !operator==(other); -} - -bool Alias::operator<(const Alias &other) const { - if (value < other.value) return true; - if (other.value < value) return false; - return is_named < other.is_named; -} - -Metadata::Metadata(const Rule &rule, MetadataParams params) : - rule(std::make_shared(rule)), params(params) {} - -bool Metadata::operator==(const Metadata &other) const { - return rule->operator==(*other.rule) && params == other.params; -} - -template -static Metadata add_metadata(Rule &&rule, T &&callback) { - if (rule.is()) { - Metadata metadata = rule.get_unchecked(); - callback(metadata.params); - return metadata; - } else { - MetadataParams params; - callback(params); - return Metadata{move(rule), params}; - } -} - -Metadata Metadata::merge(Rule &&rule, MetadataParams new_params) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (new_params.has_precedence && !params.has_precedence) { - params.has_precedence = true; - params.precedence = new_params.precedence; - } - - if (new_params.has_associativity && !params.has_associativity) { - params.has_associativity = true; - params.associativity = new_params.associativity; - } - - if (new_params.dynamic_precedence != 0) { - params.dynamic_precedence = new_params.dynamic_precedence; - } - - if (new_params.is_string) params.is_string = true; - if (new_params.is_active) params.is_active = true; - if (new_params.is_main_token) params.is_main_token = true; - - if (!new_params.alias.value.empty()) { - params.alias = new_params.alias; - } - }); -} - -Metadata Metadata::token(Rule &&rule) { - return add_metadata(move(rule), [](MetadataParams ¶ms) { - params.is_token = true; - }); -} - -Metadata Metadata::immediate_token(Rule &&rule) { - return add_metadata(move(rule), [](MetadataParams ¶ms) { - params.is_token = true; - 
params.is_main_token = true; - }); -} - -Metadata Metadata::active_prec(int precedence, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - params.has_precedence = true; - params.precedence = precedence; - params.is_active = true; - }); -} - -Metadata Metadata::prec(int precedence, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (!params.has_precedence) { - params.has_precedence = true; - params.precedence = precedence; - } - }); -} - -Metadata Metadata::prec_left(int precedence, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (!params.has_precedence) { - params.has_precedence = true; - params.precedence = precedence; - } - if (!params.has_associativity) { - params.has_associativity = true; - params.associativity = AssociativityLeft; - } - }); -} - -Metadata Metadata::prec_right(int precedence, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (!params.has_precedence) { - params.has_precedence = true; - params.precedence = precedence; - } - if (!params.has_associativity) { - params.has_associativity = true; - params.associativity = AssociativityRight; - } - }); -} - -Metadata Metadata::prec_dynamic(int dynamic_precedence, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - params.dynamic_precedence = dynamic_precedence; - }); -} - -Metadata Metadata::separator(Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (!params.has_precedence) { - params.has_precedence = true; - params.precedence = INT_MIN; - } - params.is_active = true; - }); -} - -Metadata Metadata::main_token(Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - if (!params.has_precedence) { - params.has_precedence = true; - params.precedence = 0; - } - params.is_main_token = true; - }); -} - -Metadata Metadata::alias(string &&value, bool is_named, Rule &&rule) { - return add_metadata(move(rule), [&](MetadataParams ¶ms) { - params.alias.value = move(value); - params.alias.is_named = is_named; - }); -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h deleted file mode 100644 index 3c023b3e..00000000 --- a/src/compiler/rules/metadata.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef COMPILER_RULES_METADATA_H_ -#define COMPILER_RULES_METADATA_H_ - -#include -#include - -namespace tree_sitter { -namespace rules { - -enum Associativity { - AssociativityNone, - AssociativityLeft, - AssociativityRight, -}; - -struct Alias { - std::string value = ""; - bool is_named = false; - bool operator==(const Alias &) const; - bool operator!=(const Alias &) const; - bool operator<(const Alias &) const; -}; - -struct MetadataParams { - int precedence; - int dynamic_precedence; - Associativity associativity; - bool has_precedence; - bool has_associativity; - bool is_token; - bool is_string; - bool is_active; - bool is_main_token; - Alias alias; - - inline MetadataParams() : - precedence{0}, dynamic_precedence{0}, associativity{AssociativityNone}, - has_precedence{false}, has_associativity{false}, is_token{false}, is_string{false}, - is_active{false}, is_main_token{false} {} - - inline bool operator==(const MetadataParams &other) const { - return ( - precedence == other.precedence && - associativity == other.associativity && - has_precedence == other.has_precedence && - has_associativity == other.has_associativity && - dynamic_precedence == other.dynamic_precedence && - is_token == 
other.is_token && - is_string == other.is_string && - is_active == other.is_active && - is_main_token == other.is_main_token && - alias == other.alias - ); - } -}; - -struct Rule; - -struct Metadata { - std::shared_ptr rule; - MetadataParams params; - - Metadata(const Rule &rule, MetadataParams params); - - static Metadata merge(Rule &&rule, MetadataParams params); - static Metadata token(Rule &&rule); - static Metadata immediate_token(Rule &&rule); - static Metadata active_prec(int precedence, Rule &&rule); - static Metadata prec(int precedence, Rule &&rule); - static Metadata prec_left(int precedence, Rule &&rule); - static Metadata prec_right(int precedence, Rule &&rule); - static Metadata prec_dynamic(int precedence, Rule &&rule); - static Metadata separator(Rule &&rule); - static Metadata main_token(Rule &&rule); - static Metadata alias(std::string &&value, bool is_named, Rule &&rule); - - bool operator==(const Metadata &other) const; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_METADATA_H_ diff --git a/src/compiler/rules/named_symbol.h b/src/compiler/rules/named_symbol.h deleted file mode 100644 index dd668aea..00000000 --- a/src/compiler/rules/named_symbol.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_RULES_NAMED_SYMBOL_H_ -#define COMPILER_RULES_NAMED_SYMBOL_H_ - -#include - -namespace tree_sitter { -namespace rules { - -struct NamedSymbol { - std::string value; - - inline bool operator==(const NamedSymbol &other) const { - return value == other.value; - } -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_NAMED_SYMBOL_H_ \ No newline at end of file diff --git a/src/compiler/rules/pattern.h b/src/compiler/rules/pattern.h deleted file mode 100644 index 60c773f6..00000000 --- a/src/compiler/rules/pattern.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_RULES_PATTERN_H_ -#define COMPILER_RULES_PATTERN_H_ - -#include - -namespace tree_sitter { -namespace rules { - -struct Pattern { - std::string value; - - inline bool operator==(const Pattern &other) const { - return value == other.value; - } -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_PATTERN_H_ \ No newline at end of file diff --git a/src/compiler/rules/repeat.cc b/src/compiler/rules/repeat.cc deleted file mode 100644 index 87cc19cd..00000000 --- a/src/compiler/rules/repeat.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "compiler/rules/repeat.h" -#include "compiler/rule.h" - -namespace tree_sitter { -namespace rules { - -Repeat::Repeat(const Rule &rule) : - rule(std::make_shared(rule)) {} - -bool Repeat::operator==(const Repeat &other) const { - return rule->operator==(*other.rule); -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/repeat.h b/src/compiler/rules/repeat.h deleted file mode 100644 index b9770a51..00000000 --- a/src/compiler/rules/repeat.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef COMPILER_RULES_REPEAT_H_ -#define COMPILER_RULES_REPEAT_H_ - -#include - -namespace tree_sitter { -namespace rules { - -struct Rule; - -struct Repeat { - std::shared_ptr rule; - - explicit Repeat(const Rule &rule); - bool operator==(const Repeat &other) const; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_REPEAT_H_ \ No newline at end of file diff --git a/src/compiler/rules/seq.cc b/src/compiler/rules/seq.cc deleted file mode 100644 index cf898e0e..00000000 --- a/src/compiler/rules/seq.cc +++ /dev/null @@ -1,16 +0,0 @@ -#include "compiler/rules/seq.h" 
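// Repeat above (and Seq just below) hold their children through
// std::shared_ptr<Rule> while Rule is still an incomplete type; only
// `struct Rule;` is visible in those headers. That pointer indirection is
// what breaks the otherwise-infinite size recursion of a self-referential
// rule tree. A minimal sketch of the same layout, with hypothetical names:
#include <memory>
#include <string>

struct Node;  // incomplete here, like the forward-declared `struct Rule;`

struct SeqSketch {
  std::shared_ptr<Node> left;   // a shared_ptr member only needs pointer size
  std::shared_ptr<Node> right;  // so the element type may be incomplete
};

struct Node {  // Node is now complete and may contain SeqSketch by value
  SeqSketch seq;
  std::string leaf_value;
};

SeqSketch make_seq_sketch(const Node &left, const Node &right) {
  // make_shared needs the complete type, so it comes after Node's definition
  return SeqSketch{std::make_shared<Node>(left), std::make_shared<Node>(right)};
}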
-#include "compiler/rule.h" - -namespace tree_sitter { -namespace rules { - -Seq::Seq(const Rule &left, const Rule &right) : - left(std::make_shared(left)), - right(std::make_shared(right)) {} - -bool Seq::operator==(const Seq &other) const { - return left->operator==(*other.left) && right->operator==(*other.right); -} - -} // namespace rules -} // namespace tree_sitter diff --git a/src/compiler/rules/seq.h b/src/compiler/rules/seq.h deleted file mode 100644 index 6ddbb003..00000000 --- a/src/compiler/rules/seq.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef COMPILER_RULES_SEQ_H_ -#define COMPILER_RULES_SEQ_H_ - -#include -#include - -namespace tree_sitter { -namespace rules { - -struct Rule; - -struct Seq { - std::shared_ptr left; - std::shared_ptr right; - - Seq(const Rule &left, const Rule &right); - bool operator==(const Seq &other) const; -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_SEQ_H_ \ No newline at end of file diff --git a/src/compiler/rules/string.h b/src/compiler/rules/string.h deleted file mode 100644 index 9fbacd34..00000000 --- a/src/compiler/rules/string.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef COMPILER_RULES_STRING_H_ -#define COMPILER_RULES_STRING_H_ - -#include - -namespace tree_sitter { -namespace rules { - -struct String { - std::string value; - - inline bool operator==(const String &other) const { - return value == other.value; - } -}; - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_STRING_H_ \ No newline at end of file diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h deleted file mode 100644 index adfe5f2c..00000000 --- a/src/compiler/rules/symbol.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef COMPILER_RULES_SYMBOL_H_ -#define COMPILER_RULES_SYMBOL_H_ - -namespace tree_sitter { -namespace rules { - -struct Symbol { - using Index = int; - enum Type { - External, - Terminal, - NonTerminal, - }; - - inline bool operator==(const Symbol &other) const { - return index == other.index && type == other.type; - } - - inline bool operator!=(const Symbol &other) const { - return !operator==(other); - } - - inline bool operator<(const Symbol &other) const { - if (type < other.type) return true; - if (type > other.type) return false; - return index < other.index; - } - - Index index; - Type type; - - static Symbol terminal(Index index) { - return Symbol{index, Type::Terminal}; - } - - static Symbol external(Index index) { - return Symbol{index, Type::External}; - } - - static Symbol non_terminal(Index index) { - return Symbol{index, Type::NonTerminal}; - } - - bool is_non_terminal() const { - return type == Type::NonTerminal; - } - - bool is_terminal() const { - return type == Type::Terminal; - } - - bool is_external() const { - return type == Type::External; - } - - bool is_built_in() const { - return index < 0; - } -}; - -inline Symbol END_OF_INPUT() { - return Symbol{-1, Symbol::Terminal}; -} - -inline Symbol START() { - return Symbol{-2, Symbol::NonTerminal}; -} - -inline Symbol NONE() { - return Symbol{-3, Symbol::Type(-1)}; -} - -} // namespace rules -} // namespace tree_sitter - -#endif // COMPILER_RULES_SYMBOL_H_ diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc deleted file mode 100644 index 4c75173a..00000000 --- a/src/compiler/syntax_grammar.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include "compiler/syntax_grammar.h" - -namespace tree_sitter { - -bool ProductionStep::operator==(const ProductionStep &other) const { - return symbol == other.symbol && - precedence == 
other.precedence && - associativity == other.associativity && - alias == other.alias; -} - -bool ProductionStep::operator!=(const ProductionStep &other) const { - return !operator==(other); -} - -bool ProductionStep::operator<(const ProductionStep &other) const { - if (symbol < other.symbol) return true; - if (other.symbol < symbol) return false; - if (precedence < other.precedence) return true; - if (other.precedence < precedence) return false; - if (associativity < other.associativity) return true; - if (other.associativity < associativity) return false; - return alias < other.alias; -} - -bool Production::operator==(const Production &other) const { - return steps == other.steps && dynamic_precedence == other.dynamic_precedence; -} - -bool ExternalToken::operator==(const ExternalToken &other) const { - return name == other.name && - type == other.type && - corresponding_internal_token == other.corresponding_internal_token; -} - -} // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h deleted file mode 100644 index 182bed3d..00000000 --- a/src/compiler/syntax_grammar.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef COMPILER_SYNTAX_GRAMMAR_H_ -#define COMPILER_SYNTAX_GRAMMAR_H_ - -#include -#include -#include -#include "compiler/rule.h" -#include "compiler/grammar.h" - -namespace tree_sitter { - -struct ProductionStep { - rules::Symbol symbol; - int precedence; - rules::Associativity associativity; - rules::Alias alias; - - bool operator==(const ProductionStep &) const; - bool operator!=(const ProductionStep &) const; - bool operator<(const ProductionStep &) const; -}; - -struct Production { - std::vector steps; - int dynamic_precedence; - - inline Production() : dynamic_precedence(0) {} - - inline Production(std::vector &&steps, int dynamic_precedence = 0) : - steps(move(steps)), dynamic_precedence(dynamic_precedence) {} - - bool operator==(const Production &) const; - inline ProductionStep &back() { return steps.back(); } - inline const ProductionStep &back() const { return steps.back(); } - inline bool empty() const { return steps.empty(); } - inline size_t size() const { return steps.size(); } - inline const ProductionStep &operator[](int i) const { return steps[i]; } - inline const ProductionStep &at(int i) const { return steps[i]; } - inline std::vector::const_iterator begin() const { return steps.begin(); } - inline std::vector::const_iterator end() const { return steps.end(); } -}; - -struct SyntaxVariable { - std::string name; - VariableType type; - std::vector productions; -}; - -struct ExternalToken { - std::string name; - VariableType type; - rules::Symbol corresponding_internal_token; - - bool operator==(const ExternalToken &) const; -}; - -struct SyntaxGrammar { - std::vector variables; - std::set extra_tokens; - std::set> expected_conflicts; - std::vector external_tokens; - std::set variables_to_inline; - rules::Symbol word_token; -}; - -} // namespace tree_sitter - -#endif // COMPILER_SYNTAX_GRAMMAR_H_ diff --git a/src/compiler/util/hash_combine.h b/src/compiler/util/hash_combine.h deleted file mode 100644 index 9cc3ad17..00000000 --- a/src/compiler/util/hash_combine.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef COMPILER_UTIL_HASH_COMBINE_H_ -#define COMPILER_UTIL_HASH_COMBINE_H_ - -#include - -namespace tree_sitter { -namespace util { - -template -inline void hash_combine(std::size_t *seed, const T &new_value) { - std::hash hasher; - *seed ^= hasher(new_value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); -} - -template -inline void 
symmetric_hash_combine(std::size_t *seed, const T &new_value) { - std::hash hasher; - *seed ^= hasher(new_value); -} - -} // namespace util -} // namespace tree_sitter - -#endif // COMPILER_UTIL_HASH_COMBINE_H_ diff --git a/src/compiler/util/make_visitor.h b/src/compiler/util/make_visitor.h deleted file mode 100644 index 6de51dc4..00000000 --- a/src/compiler/util/make_visitor.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef COMPILER_UTIL_MAKE_VISITOR_H_ -#define COMPILER_UTIL_MAKE_VISITOR_H_ - -namespace tree_sitter { -namespace util { - -template -struct visitor; - -template -struct visitor : Fn { - using Fn::operator(); - visitor(Fn fn) : Fn(fn) {} -}; - -template -struct visitor : Fn, visitor { - using Fn::operator(); - using visitor::operator(); - visitor(Fn fn, Fns... fns) : Fn(fn), visitor(fns...) {} -}; - -template -visitor make_visitor(Fns... fns) { - return visitor(fns...); -} - -} // namespace util -} // namespace tree_sitter - -#endif // COMPILER_UTIL_MAKE_VISITOR_H_ diff --git a/src/compiler/util/result.h b/src/compiler/util/result.h deleted file mode 100644 index 448ad1af..00000000 --- a/src/compiler/util/result.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef COMPILER_UTIL_RESULT_H_ -#define COMPILER_UTIL_RESULT_H_ - -#include - -namespace tree_sitter { -namespace util { - -template -struct Result { - Value value; - std::string error; - - inline Result() : error("Empty") {} - inline Result(const Value &v) : value(v) {} - inline Result(Value &&v) : value(std::move(v)) {} - inline Result(const std::string &message) : error(message) {} - inline Result(const char *message) : error(message) {} - inline bool ok() const { return error.empty(); } -}; - -} // namespace util -} // namespace tree_sitter - -#endif // COMPILER_UTIL_RESULT_H_ diff --git a/src/compiler/util/string_helpers.cc b/src/compiler/util/string_helpers.cc deleted file mode 100644 index df1f00b4..00000000 --- a/src/compiler/util/string_helpers.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "compiler/util/string_helpers.h" -#include - -namespace tree_sitter { -namespace util { - -using std::string; -using std::vector; -using std::set; -using std::to_string; - -void str_replace(string *input, const string &search, const string &replace) { - size_t pos = 0; - while (1) { - pos = input->find(search, pos); - if (pos == string::npos) - break; - input->erase(pos, search.length()); - input->insert(pos, replace); - pos += replace.length(); - } -} - -string escape_string(string input) { - str_replace(&input, "\"", "\\\""); - str_replace(&input, "\n", "\\n"); - return input; -} - -string escape_char(uint32_t character) { - switch (character) { - case '"': - return "'\\\"'"; - case '\'': - return "'\\''"; - case '\n': - return "'\\n'"; - case '\r': - return "'\\r'"; - case '\t': - return "'\\t'"; - case '\\': - return "'\\\\'"; - default: - if (character >= ' ' && character <= '~') { - return string("'") + static_cast(character) + "'"; - } else { - return to_string(character); - } - } -} - -} // namespace util -} // namespace tree_sitter diff --git a/src/compiler/util/string_helpers.h b/src/compiler/util/string_helpers.h deleted file mode 100644 index f9fed146..00000000 --- a/src/compiler/util/string_helpers.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef COMPILER_UTIL_STRING_HELPERS_H_ -#define COMPILER_UTIL_STRING_HELPERS_H_ - -#include -#include -#include - -namespace tree_sitter { -namespace util { - -void str_replace(std::string *input, const std::string &search, - const std::string &replace); -std::string escape_string(std::string input); 
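// util::make_visitor, shown just above, builds one overload set out of
// several lambdas by inheriting from each closure type and pulling every
// operator() into scope; Rule::match hands such a visitor to Rule::accept.
// A self-contained sketch of the same technique (C++14; VisitorSketch and
// make_visitor_sketch are stand-in names):
#include <iostream>
#include <string>

template <typename... Fns>
struct VisitorSketch;

template <typename Fn>
struct VisitorSketch<Fn> : Fn {
  using Fn::operator();
  VisitorSketch(Fn fn) : Fn(fn) {}
};

template <typename Fn, typename... Fns>
struct VisitorSketch<Fn, Fns...> : Fn, VisitorSketch<Fns...> {
  using Fn::operator();
  using VisitorSketch<Fns...>::operator();
  VisitorSketch(Fn fn, Fns... fns) : Fn(fn), VisitorSketch<Fns...>(fns...) {}
};

template <typename... Fns>
VisitorSketch<Fns...> make_visitor_sketch(Fns... fns) {
  return VisitorSketch<Fns...>(fns...);
}

int main() {
  auto visit = make_visitor_sketch(
      [](int n) { std::cout << "int: " << n << "\n"; },
      [](const std::string &s) { std::cout << "string: " << s << "\n"; },
      [](auto) { std::cout << "fallback\n"; });  // catch-all, as in Rule::match
  visit(7);                   // exact non-template match wins: "int: 7"
  visit(std::string("seq"));  // "string: seq"
  visit(3.14);                // generic lambda is the exact match: "fallback"
}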
-std::string escape_char(uint32_t character); - -} // namespace util -} // namespace tree_sitter - -#endif // COMPILER_UTIL_STRING_HELPERS_H_ diff --git a/test/benchmarks.cc b/test/benchmarks.cc deleted file mode 100644 index 6612444e..00000000 --- a/test/benchmarks.cc +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include -#include -#include "tree_sitter/runtime.h" -#include "helpers/load_language.h" -#include "helpers/stderr_logger.h" -#include "helpers/read_test_entries.h" - -using std::map; -using std::vector; -using std::string; - -vector language_names({ - "c", - "cpp", - "javascript", - "python", - "bash", -}); - -size_t mean(const vector &values) { - if (values.empty()) return 0; - size_t result = 0; - for (size_t value : values) { - result += value; - } - return result / values.size(); -} - -size_t min(const vector &values) { - size_t result = 0; - for (unsigned i = 0; i < values.size(); i++) { - size_t value = values[i]; - if (i == 0 || value < result) result = value; - } - return result; -} - -int main(int argc, char *arg[]) { - map> example_entries_by_language_name; - vector error_speeds; - vector non_error_speeds; - - TSParser *parser = ts_parser_new(); - - if (getenv("TREE_SITTER_BENCHMARK_SVG")) { - ts_parser_print_dot_graphs(parser, stderr); - } else if (getenv("TREE_SITTER_BENCHMARK_LOG")) { - ts_parser_set_logger(parser, stderr_logger_new(false)); - } - - auto language_filter = getenv("TREE_SITTER_BENCHMARK_LANGUAGE"); - auto file_name_filter = getenv("TREE_SITTER_BENCHMARK_FILE_NAME"); - - for (auto &language_name : language_names) { - example_entries_by_language_name[language_name] = examples_for_language(language_name); - } - - for (auto &language_name : language_names) { - if (language_filter && language_name != language_filter) continue; - - ts_parser_set_language(parser, load_real_language(language_name)); - - printf("%s\n", language_name.c_str()); - - for (auto &example : example_entries_by_language_name[language_name]) { - if (file_name_filter && example.file_name != file_name_filter) continue; - if (example.input.size() < 256) continue; - - clock_t start_time = clock(); - TSTree *tree = ts_parser_parse_string(parser, nullptr, example.input.c_str(), example.input.size()); - clock_t end_time = clock(); - - assert(!ts_node_has_error(ts_tree_root_node(tree))); - ts_tree_delete(tree); - - size_t duration = (end_time - start_time) * 1000 / CLOCKS_PER_SEC; - size_t speed = static_cast(example.input.size()) / duration; - printf(" %-30s\t%lu ms\t\t%lu bytes/ms\n", example.file_name.c_str(), duration, speed); - if (speed != 0) non_error_speeds.push_back(speed); - } - - for (auto &other_language_name : language_names) { - if (other_language_name == language_name) continue; - - for (auto &example : example_entries_by_language_name[other_language_name]) { - if (file_name_filter && example.file_name != file_name_filter) continue; - if (example.input.size() < 256) continue; - - clock_t start_time = clock(); - TSTree *tree = ts_parser_parse_string(parser, nullptr, example.input.c_str(), example.input.size()); - clock_t end_time = clock(); - - ts_tree_delete(tree); - - size_t duration = (end_time - start_time) * 1000 / CLOCKS_PER_SEC; - size_t speed = static_cast(example.input.size()) / duration; - printf(" %-30s\t%lu ms\t\t%lu bytes/ms\n", example.file_name.c_str(), duration, speed); - if (speed != 0) error_speeds.push_back(speed); - } - } - - puts(""); - } - - puts("without errors:"); - printf(" %-30s\t%lu bytes/ms\n", "average speed", mean(non_error_speeds)); 
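// [Editor's aside, not part of the diff: the benchmark above reports throughput as
// input bytes divided by elapsed CPU milliseconds from clock(). A self-contained
// sketch of that measurement pattern, with a checksum loop standing in for
// ts_parser_parse_string; the zero-duration guard is this sketch's own addition,
// not something the deleted code did.]
#include <cstdio>
#include <ctime>
#include <string>

int main() {
  std::string input(1 << 20, 'x');          // stand-in for an example source file
  std::clock_t start = std::clock();
  std::size_t checksum = 0;
  for (char c : input) checksum += static_cast<unsigned char>(c);  // stand-in workload
  std::clock_t end = std::clock();
  std::size_t duration_ms = (end - start) * 1000 / CLOCKS_PER_SEC;
  if (duration_ms == 0) duration_ms = 1;    // avoid dividing by zero on very fast runs
  std::printf("%zu bytes/ms (checksum %zu)\n", input.size() / duration_ms, checksum);
}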
- printf(" %-30s\t%lu bytes/ms\n", "worst speed", min(non_error_speeds)); - puts(""); - - puts("with errors:"); - printf(" %-30s\t%lu bytes/ms\n", "average speed", mean(error_speeds)); - printf(" %-30s\t%lu bytes/ms\n", "worst speed", min(error_speeds)); - - ts_parser_delete(parser); - return 0; -} diff --git a/test/compiler/build_tables/lex_item_test.cc b/test/compiler/build_tables/lex_item_test.cc deleted file mode 100644 index 71d5555f..00000000 --- a/test/compiler/build_tables/lex_item_test.cc +++ /dev/null @@ -1,515 +0,0 @@ -#include "test_helper.h" -#include "compiler/build_tables/lex_item.h" -#include "compiler/rule.h" -#include "helpers/stream_methods.h" - -using namespace rules; -using namespace build_tables; -typedef LexItemSet::Transition Transition; - -START_TEST - -describe("LexItem", []() { - describe("completion_status()", [&]() { - it("indicates whether the item is done and its precedence", [&]() { - LexItem item1(Symbol::terminal(0), CharacterSet({'a', 'b', 'c'})); - AssertThat(item1.completion_status().is_done, IsFalse()); - AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange())); - - MetadataParams params; - params.precedence = 3; - params.has_precedence = true; - params.is_string = 1; - LexItem item2(Symbol::terminal(0), Choice{{ - Metadata{Blank{}, params}, - CharacterSet({'a', 'b', 'c'}), - }}); - - AssertThat(item2.completion_status().is_done, IsTrue()); - AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3))); - - LexItem item3(Symbol::terminal(0), Choice{{ - Blank{}, - Repeat{CharacterSet({' ', '\t'})}, - }}); - AssertThat(item3.completion_status().is_done, IsTrue()); - AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange())); - }); - }); -}); - -describe("LexItemSet::transitions()", [&]() { - it("handles single characters", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), CharacterSet({'x'})), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet{{'x'}}, - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Blank{}), - }), - PrecedenceRange(), - false - } - } - }))); - }); - - it("marks transitions that are within the main token (as opposed to separators)", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Metadata::main_token(CharacterSet{{'x'}})), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'x'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(0, Metadata::main_token(Blank{}))), - }), - PrecedenceRange(), - true - } - } - }))); - }); - - it("handles sequences", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - CharacterSet({'w'}), - CharacterSet({'x'}), - CharacterSet({'y'}), - CharacterSet({'z'}), - })), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'w'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - CharacterSet({'x'}), - CharacterSet({'y'}), - CharacterSet({'z'}), - })), - }), - PrecedenceRange(), - false - } - } - }))); - }); - - it("handles sequences with nested precedence", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Metadata::prec(3, Rule::seq({ - CharacterSet({'v'}), - Metadata::prec(4, Rule::seq({ - CharacterSet({'w'}), - CharacterSet({'x'}) })), - CharacterSet({'y'}) })), - CharacterSet({'z'}), - })), - }); - - auto transitions = 
item_set.transitions(); - - AssertThat( - transitions, - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'v'}), - Transition{ - // The outer precedence is now 'active', because we are within its - // contained rule. - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Metadata::active_prec(3, Rule::seq({ - Metadata::prec(4, Rule::seq({ - CharacterSet({'w'}), - CharacterSet({'x'}) - })), - CharacterSet({'y'}) - })), - CharacterSet({'z'}), - })), - }), - - // No precedence is applied upon entering a rule. - PrecedenceRange(), - false - } - } - }))); - - LexItemSet item_set2 = transitions[CharacterSet({'v'})].destination; - transitions = item_set2.transitions(); - - AssertThat( - transitions, - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'w'}), - Transition{ - // The inner precedence is now 'active' - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Metadata::active_prec(3, Rule::seq({ - Metadata::active_prec(4, CharacterSet{{'x'}}), - CharacterSet{{'y'}} - })), - CharacterSet{{'z'}}, - })), - }), - - // The outer precedence is applied. - PrecedenceRange(3), - false - } - } - }))); - - LexItemSet item_set3 = transitions[CharacterSet({'w'})].destination; - transitions = item_set3.transitions(); - - AssertThat( - transitions, - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'x'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Metadata::active_prec(3, CharacterSet{{'y'}}), - CharacterSet{{'z'}}, - })), - }), - - // The inner precedence is applied. - PrecedenceRange(4), - false - } - } - }))); - - LexItemSet item_set4 = transitions[CharacterSet({'x'})].destination; - transitions = item_set4.transitions(); - - AssertThat( - transitions, - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'y'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'z'})), - }), - PrecedenceRange(3), - false - } - } - }))); - }); - - it("handles sequences where the left hand side can be blank", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Rule::choice({ - CharacterSet({'x'}), - Blank{}, - }), - CharacterSet({'y'}), - CharacterSet({'z'}), - })), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'x'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - CharacterSet({'y'}), - CharacterSet({'z'}), - })), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'y'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'z'})), - }), - PrecedenceRange(), - false - } - } - }))); - }); - - it("handles blanks", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Blank{}), - }); - - AssertThat(item_set.transitions(), IsEmpty()); - }); - - it("handles repeats", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Repeat{Rule::seq({ - CharacterSet({'a'}), - CharacterSet({'b'}), - })}), - LexItem(Symbol::non_terminal(2), Repeat{CharacterSet{{'c'}}}), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'a'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - CharacterSet({'b'}), - Repeat{Rule::seq({ - CharacterSet({'a'}), - CharacterSet({'b'}), - })} - })), - LexItem(Symbol::non_terminal(1), CharacterSet({'b'})), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'c'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(2), 
Repeat{CharacterSet({'c'})}), - LexItem(Symbol::non_terminal(2), Blank{}), - }), - PrecedenceRange(), - false - } - } - }))); - }); - - it("handles repeats with precedence", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(-1, Repeat{CharacterSet({'a'})})) - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'a'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(-1, Repeat{CharacterSet({'a'})})), - LexItem(Symbol::non_terminal(1), Metadata::active_prec(-1, Blank{})), - }), - PrecedenceRange(-1), - false - } - } - }))); - }); - - it("handles choices between overlapping character sets", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::choice({ - Metadata::active_prec(2, Rule::seq({ - CharacterSet({'a', 'b', 'c', 'd'}), - CharacterSet({'x'}), - })), - Metadata::active_prec(3, Rule::seq({ - CharacterSet({'c', 'd', 'e', 'f'}), - CharacterSet({'y'}), - })), - })) - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'a', 'b'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(2, CharacterSet({'x'}))), - }), - PrecedenceRange(2), - false - } - }, - { - CharacterSet({'c', 'd'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(2, CharacterSet({'x'}))), - LexItem(Symbol::non_terminal(1), Metadata::active_prec(3, CharacterSet({'y'}))), - }), - PrecedenceRange(2, 3), - false - } - }, - { - CharacterSet({'e', 'f'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Metadata::active_prec(3, CharacterSet({'y'}))), - }), - PrecedenceRange(3), - false - } - }, - }))); - }); - - it("handles choices between a subset and a superset of characters", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::choice({ - Rule::seq({ - CharacterSet({'b', 'c', 'd'}), - CharacterSet({'x'}), - }), - Rule::seq({ - CharacterSet({'a', 'b', 'c', 'd', 'e', 'f'}), - CharacterSet({'y'}), - }), - })), - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'a', 'e', 'f'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'y'})), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'b', 'c', 'd'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'x'})), - LexItem(Symbol::non_terminal(1), CharacterSet({'y'})), - }), - PrecedenceRange(), - false - } - }, - }))); - }); - - it("handles choices between whitelisted and blacklisted character sets", [&]() { - LexItemSet item_set({ - LexItem(Symbol::non_terminal(1), Rule::seq({ - Rule::choice({ - CharacterSet().include_all().exclude('/'), - Rule::seq({ - CharacterSet({'\\'}), - CharacterSet({'/'}), - }), - }), - CharacterSet({'/'}), - })) - }); - - AssertThat( - item_set.transitions(), - Equals(LexItemSet::TransitionMap({ - { - CharacterSet().include_all().exclude('/').exclude('\\'), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'/'})), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'\\'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), CharacterSet({'/'})), - LexItem(Symbol::non_terminal(1), Rule::seq({CharacterSet({'/'}), CharacterSet({'/'})})), - }), - PrecedenceRange(), - false - } - }, - }))); - }); - - it("handles different items with overlapping character sets", [&]() { - LexItemSet set1({ - 
LexItem(Symbol::non_terminal(1), CharacterSet({'a', 'b', 'c', 'd', 'e', 'f'})), - LexItem(Symbol::non_terminal(2), CharacterSet({'e', 'f', 'g', 'h', 'i'})) - }); - - AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({ - { - CharacterSet({'a', 'b', 'c', 'd'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Blank{}), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'e', 'f'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(1), Blank{}), - LexItem(Symbol::non_terminal(2), Blank{}), - }), - PrecedenceRange(), - false - } - }, - { - CharacterSet({'g', 'h', 'i'}), - Transition{ - LexItemSet({ - LexItem(Symbol::non_terminal(2), Blank{}), - }), - PrecedenceRange(), - false - } - }, - }))); - }); -}); - -END_TEST diff --git a/test/compiler/build_tables/parse_item_set_builder_test.cc b/test/compiler/build_tables/parse_item_set_builder_test.cc deleted file mode 100644 index 6c41c3ca..00000000 --- a/test/compiler/build_tables/parse_item_set_builder_test.cc +++ /dev/null @@ -1,136 +0,0 @@ -#include "test_helper.h" -#include "helpers/stream_methods.h" -#include "compiler/syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/build_tables/parse_item_set_builder.h" -#include "compiler/build_tables/lookahead_set.h" - -using namespace build_tables; -using namespace rules; - -START_TEST - -describe("ParseItemSetBuilder", []() { - vector lexical_variables; - - for (size_t i = 0; i < 20; i++) { - lexical_variables.push_back({ - "token_" + to_string(i), - VariableTypeNamed, - Blank{}, - false - }); - } - - LexicalGrammar lexical_grammar{lexical_variables, {}}; - - it("adds items at the beginnings of referenced rules", [&]() { - SyntaxGrammar grammar; - grammar.variables = { - SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ - {Symbol::non_terminal(1), 0, AssociativityNone, Alias{}}, - {Symbol::terminal(11), 0, AssociativityNone, Alias{}}, - }, 0), - }}, - SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ - {Symbol::terminal(12), 0, AssociativityNone, Alias{}}, - {Symbol::terminal(13), 0, AssociativityNone, Alias{}}, - }, 0), - Production({ - {Symbol::non_terminal(2), 0, AssociativityNone, Alias{}}, - }, 0) - }}, - SyntaxVariable{"rule2", VariableTypeNamed, { - Production({ - {Symbol::terminal(14), 0, AssociativityNone, Alias{}}, - {Symbol::terminal(15), 0, AssociativityNone, Alias{}}, - }, 0) - }}, - }; - - auto production = [&](int variable_index, int production_index) -> const Production & { - return grammar.variables[variable_index].productions[production_index]; - }; - - ParseItemSet item_set{{ - { - ParseItem(rules::START(), production(0, 0), 0), - LookaheadSet({ Symbol::terminal(10) }), - } - }}; - - ParseItemSetBuilder item_set_builder(grammar, lexical_grammar); - item_set_builder.apply_transitive_closure(&item_set); - - AssertThat(item_set, Equals(ParseItemSet{{ - { - ParseItem(rules::START(), production(0, 0), 0), - LookaheadSet({ Symbol::terminal(10) }) - }, - { - ParseItem(Symbol::non_terminal(1), production(1, 0), 0), - LookaheadSet({ Symbol::terminal(11) }) - }, - { - ParseItem(Symbol::non_terminal(1), production(1, 1), 0), - LookaheadSet({ Symbol::terminal(11) }) - }, - { - ParseItem(Symbol::non_terminal(2), production(2, 0), 0), - LookaheadSet({ Symbol::terminal(11) }) - }, - }})); - }); - - it("handles rules with empty productions", [&]() { - SyntaxGrammar grammar; - grammar.variables = { - SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ - {Symbol::non_terminal(1), 0, 
AssociativityNone, Alias{}}, - {Symbol::terminal(11), 0, AssociativityNone, Alias{}}, - }, 0), - }}, - SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ - {Symbol::terminal(12), 0, AssociativityNone, Alias{}}, - {Symbol::terminal(13), 0, AssociativityNone, Alias{}}, - }, 0), - Production{{}, 0} - }}, - }; - - auto production = [&](int variable_index, int production_index) -> const Production & { - return grammar.variables[variable_index].productions[production_index]; - }; - - ParseItemSet item_set{{ - { - ParseItem(rules::START(), production(0, 0), 0), - LookaheadSet({ Symbol::terminal(10) }), - } - }}; - - ParseItemSetBuilder item_set_builder(grammar, lexical_grammar); - item_set_builder.apply_transitive_closure(&item_set); - - AssertThat(item_set, Equals(ParseItemSet{{ - { - ParseItem(rules::START(), production(0, 0), 0), - LookaheadSet({ Symbol::terminal(10) }) - }, - { - ParseItem(Symbol::non_terminal(1), production(1, 0), 0), - LookaheadSet({ Symbol::terminal(11) }) - }, - { - ParseItem(Symbol::non_terminal(1), production(1, 1), 0), - LookaheadSet({ Symbol::terminal(11) }) - }, - }})); - }); -}); - -END_TEST diff --git a/test/compiler/build_tables/rule_can_be_blank_test.cc b/test/compiler/build_tables/rule_can_be_blank_test.cc deleted file mode 100644 index 92dffa01..00000000 --- a/test/compiler/build_tables/rule_can_be_blank_test.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "test_helper.h" -#include "compiler/build_tables/rule_can_be_blank.h" -#include "compiler/rule.h" - -using namespace rules; -using build_tables::rule_can_be_blank; - -START_TEST - -describe("rule_can_be_blank", [&]() { - Rule rule; - - it("returns false for basic rules", [&]() { - AssertThat(rule_can_be_blank(CharacterSet{{'x'}}), IsFalse()); - }); - - it("returns true for blanks", [&]() { - AssertThat(rule_can_be_blank(Blank{}), IsTrue()); - }); - - it("returns true for repeats iff the content can be blank", [&]() { - AssertThat(rule_can_be_blank(Repeat{CharacterSet{{'x'}}}), IsFalse()); - AssertThat(rule_can_be_blank(Repeat{Blank{}}), IsTrue()); - }); - - it("returns true for choices iff one or more sides can be blank", [&]() { - rule = Rule::choice({ CharacterSet{{'x'}}, Blank{} }); - AssertThat(rule_can_be_blank(rule), IsTrue()); - - rule = Rule::choice({ Blank{}, CharacterSet{{'x'}} }); - AssertThat(rule_can_be_blank(rule), IsTrue()); - - rule = Rule::choice({ CharacterSet{{'x'}}, CharacterSet{{'y'}} }); - AssertThat(rule_can_be_blank(rule), IsFalse()); - }); - - it("returns true for sequences iff both sides can be blank", [&]() { - rule = Rule::seq({ Blank{}, CharacterSet{{'x'}} }); - AssertThat(rule_can_be_blank(rule), IsFalse()); - - rule = Rule::seq({ CharacterSet{{'x'}}, Blank{} }); - AssertThat(rule_can_be_blank(rule), IsFalse()); - - rule = Rule::seq({ Blank{}, Rule::choice({ CharacterSet{{'x'}}, Blank{} }) }); - AssertThat(rule_can_be_blank(rule), IsTrue()); - }); - - it("ignores metadata rules", [&]() { - rule = Metadata::prec(1, Blank{}); - AssertThat(rule_can_be_blank(rule), IsTrue()); - - rule = Metadata::prec(1, CharacterSet{{'x'}}); - AssertThat(rule_can_be_blank(rule), IsFalse()); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/expand_repeats_test.cc b/test/compiler/prepare_grammar/expand_repeats_test.cc deleted file mode 100644 index f7aaa8fe..00000000 --- a/test/compiler/prepare_grammar/expand_repeats_test.cc +++ /dev/null @@ -1,160 +0,0 @@ -#include "test_helper.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include 
"compiler/prepare_grammar/expand_repeats.h" -#include "helpers/stream_methods.h" - -using namespace rules; -using prepare_grammar::InitialSyntaxGrammar; -using prepare_grammar::expand_repeats; - -START_TEST - -describe("expand_repeats", []() { - it("replaces repeat rules with pairs of recursive rules", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - Variable{"rule0", VariableTypeNamed, Repeat{Symbol::terminal(0)}}, - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Symbol::non_terminal(1)}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(1), Symbol::non_terminal(1) }), - Symbol::terminal(0), - })}, - })); - }); - - it("replaces repeats inside of sequences", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - Variable{"rule0", VariableTypeNamed, Rule::seq({ - Symbol::terminal(10), - Repeat{Symbol::terminal(11)}, - })}, - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Rule::seq({ - Symbol::terminal(10), - Symbol::non_terminal(1), - })}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(1), Symbol::non_terminal(1) }), - Symbol::terminal(11) - })}, - })); - }); - - it("replaces repeats inside of choices", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - Variable{"rule0", VariableTypeNamed, Rule::choice({ - Symbol::terminal(10), - Repeat{Symbol::terminal(11)} - })}, - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Rule::choice({ - Symbol::terminal(10), - Symbol::non_terminal(1), - })}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(1), Symbol::non_terminal(1) }), - Symbol::terminal(11), - })}, - })); - }); - - it("does not create redundant auxiliary rules", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - Variable{"rule0", VariableTypeNamed, Rule::choice({ - Rule::seq({ Symbol::terminal(1), Repeat{Symbol::terminal(4)} }), - Rule::seq({ Symbol::terminal(2), Repeat{Symbol::terminal(4)} }), - })}, - Variable{"rule1", VariableTypeNamed, Rule::seq({ - Symbol::terminal(3), - Repeat{Symbol::terminal(4)} - })}, - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Rule::choice({ - Rule::seq({ Symbol::terminal(1), Symbol::non_terminal(2) }), - Rule::seq({ Symbol::terminal(2), Symbol::non_terminal(2) }), - })}, - Variable{"rule1", VariableTypeNamed, Rule::seq({ - Symbol::terminal(3), - Symbol::non_terminal(2), - })}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(2), Symbol::non_terminal(2) }), - Symbol::terminal(4), - })}, - })); - }); - - it("can replace multiple repeats in the same rule", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - { - Variable{"rule0", VariableTypeNamed, Rule::seq({ - Repeat{Symbol::terminal(10)}, - Repeat{Symbol::terminal(11)}, - })}, - } - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Rule::seq({ - Symbol::non_terminal(1), - Symbol::non_terminal(2), - })}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(1), 
Symbol::non_terminal(1) }), - Symbol::terminal(10), - })}, - Variable{"rule0_repeat2", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(2), Symbol::non_terminal(2) }), - Symbol::terminal(11), - })}, - })); - }); - - it("can replace repeats in multiple rules", [&]() { - InitialSyntaxGrammar grammar; - grammar.variables = { - Variable{"rule0", VariableTypeNamed, Repeat{Symbol::terminal(10)}}, - Variable{"rule1", VariableTypeNamed, Repeat{Symbol::terminal(11)}}, - }; - - auto result = expand_repeats(grammar); - - AssertThat(result.variables, Equals(vector{ - Variable{"rule0", VariableTypeNamed, Symbol::non_terminal(2)}, - Variable{"rule1", VariableTypeNamed, Symbol::non_terminal(3)}, - Variable{"rule0_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(2), Symbol::non_terminal(2) }), - Symbol::terminal(10), - })}, - Variable{"rule1_repeat1", VariableTypeAuxiliary, Rule::choice({ - Rule::seq({ Symbol::non_terminal(3), Symbol::non_terminal(3) }), - Symbol::terminal(11), - })}, - })); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/expand_tokens_test.cc b/test/compiler/prepare_grammar/expand_tokens_test.cc deleted file mode 100644 index cb39cfa1..00000000 --- a/test/compiler/prepare_grammar/expand_tokens_test.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "test_helper.h" -#include "compiler/lexical_grammar.h" -#include "compiler/prepare_grammar/expand_tokens.h" -#include "helpers/stream_methods.h" - -START_TEST - -using namespace rules; -using prepare_grammar::expand_token; -using prepare_grammar::ExpandTokenResult; - -describe("expand_tokens", []() { - MetadataParams string_token_params; - string_token_params.is_string = true; - string_token_params.is_token = true; - - describe("string rules", [&]() { - it("replaces strings with sequences of character sets", [&]() { - AssertThat( - expand_token(Rule::seq({ - String{"a"}, - String{"bcd"}, - String{"e"} - })).rule, - Equals(Rule::seq({ - CharacterSet{{ 'a' }}, - Rule::seq({ - CharacterSet{{ 'b' }}, - CharacterSet{{ 'c' }}, - CharacterSet{{ 'd' }}, - }), - CharacterSet{{ 'e' }}, - }))); - }); - - it("handles strings containing non-ASCII UTF8 characters", [&]() { - AssertThat( - expand_token(String{"\xCE\xB1 \xCE\xB2"}).rule, - Equals(Rule::seq({ - CharacterSet{{ 945 }}, - CharacterSet{{ ' ' }}, - CharacterSet{{ 946 }}, - })) - ); - }); - }); - - describe("regexp rules", [&]() { - it("replaces regexps with the equivalent rule tree", [&]() { - AssertThat( - expand_token(Rule::seq({ - String{"a"}, - Pattern{"x+"}, - String{"b"}, - })).rule, - Equals(Rule::seq({ - CharacterSet{{'a'}}, - Repeat{CharacterSet{{ 'x' }}}, - CharacterSet{{'b'}}, - })) - ); - }); - - it("handles regexps containing non-ASCII UTF8 characters", [&]() { - AssertThat( - expand_token(Pattern{"[^\xCE\xB1-\xCE\xB4]+"}).rule, - Equals(Rule(Repeat{ - CharacterSet().include_all().exclude(945, 948) - })) - ); - }); - - it("returns an error when the grammar contains an invalid regex", [&]() { - AssertThat( - expand_token(Rule::seq({ - Pattern{"("}, - String{"xyz"}, - Pattern{"["}, - })).error, - Equals(CompileError( - TSCompileErrorTypeInvalidRegex, - "unmatched open paren" - )) - ); - }); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/extract_choices_test.cc b/test/compiler/prepare_grammar/extract_choices_test.cc deleted file mode 100644 index d1c859e7..00000000 --- a/test/compiler/prepare_grammar/extract_choices_test.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "test_helper.h" -#include 
"helpers/stream_methods.h" -#include "compiler/prepare_grammar/extract_choices.h" - -START_TEST - -using namespace rules; -using prepare_grammar::extract_choices; - -describe("extract_choices", []() { - it("expands rules containing choices into multiple rules", [&]() { - auto rule = Rule::seq({ - Symbol::terminal(1), - Rule::choice({ - Symbol::terminal(2), - Symbol::terminal(3), - Symbol::terminal(4) - }), - Symbol::terminal(5) - }); - - auto result = extract_choices(rule); - - AssertThat(result, Equals(vector({ - Rule::seq({Symbol::terminal(1), Symbol::terminal(2), Symbol::terminal(5)}), - Rule::seq({Symbol::terminal(1), Symbol::terminal(3), Symbol::terminal(5)}), - Rule::seq({Symbol::terminal(1), Symbol::terminal(4), Symbol::terminal(5)}), - }))); - }); - - it("handles metadata rules", [&]() { - auto rule = Metadata::prec(5, Rule::choice({ - Symbol::terminal(2), - Symbol::terminal(3), - Symbol::terminal(4) - })); - - AssertThat(extract_choices(rule), Equals(vector({ - Metadata::prec(5, Symbol::terminal(2)), - Metadata::prec(5, Symbol::terminal(3)), - Metadata::prec(5, Symbol::terminal(4)), - }))); - }); - - it("handles nested choices", [&]() { - auto rule = Rule::choice({ - Rule::seq({ - Rule::choice({ - Symbol::terminal(1), - Symbol::terminal(2) - }), - Symbol::terminal(3) - }), - Symbol::terminal(4) - }); - - AssertThat(extract_choices(rule), Equals(vector({ - Rule::seq({Symbol::terminal(1), Symbol::terminal(3)}), - Rule::seq({Symbol::terminal(2), Symbol::terminal(3)}), - Symbol::terminal(4), - }))); - }); - - it("handles single symbols", [&]() { - AssertThat(extract_choices(Symbol::terminal(2)), Equals(vector({ - Symbol::terminal(2) - }))); - }); - - it("handles blank rules", [&]() { - AssertThat(extract_choices(Blank{}), Equals(vector({ - Blank{}, - }))); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/extract_tokens_test.cc b/test/compiler/prepare_grammar/extract_tokens_test.cc deleted file mode 100644 index 5ea6e469..00000000 --- a/test/compiler/prepare_grammar/extract_tokens_test.cc +++ /dev/null @@ -1,459 +0,0 @@ -#include "test_helper.h" -#include "compiler/lexical_grammar.h" -#include "compiler/prepare_grammar/interned_grammar.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include "compiler/prepare_grammar/extract_tokens.h" -#include "helpers/stream_methods.h" - -START_TEST - -using namespace rules; -using prepare_grammar::extract_tokens; -using prepare_grammar::InternedGrammar; -using prepare_grammar::InitialSyntaxGrammar; - -describe("extract_tokens", []() { - it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_A", - VariableTypeNamed, - Repeat{Rule::seq({ - String{"ab"}, - Pattern{"cd+"}, - Rule::choice({ - Symbol::non_terminal(1), - Symbol::non_terminal(2), - Metadata::token(Repeat{Rule::choice({ - String{"ef"}, - String{"g"} - })}), - }), - })} - }, - Variable{ - "rule_B", - VariableTypeNamed, - Pattern{"h+"} - }, - Variable{ - "rule_C", - VariableTypeNamed, - Rule::choice({ String{"i"}, Blank{} }) - }, - Variable{ - "rule_D", - VariableTypeNamed, - Repeat{Symbol::non_terminal(3)} - }, - }, - {}, {}, {}, {}, {} - }); - - InitialSyntaxGrammar &syntax_grammar = get<0>(result); - LexicalGrammar &lexical_grammar = get<1>(result); - CompileError error = get<2>(result); - - AssertThat(error, Equals(CompileError::none())); - - AssertThat(syntax_grammar.variables, Equals(vector{ - Variable{ - "rule_A", - VariableTypeNamed, - 
Repeat{Rule::seq({ - - // This string is now the first token in the lexical grammar. - Symbol::terminal(0), - - // This pattern is now the second rule in the lexical grammar. - Symbol::terminal(1), - - Rule::choice({ - // Rule 1, which this symbol pointed to, has been moved to the - // lexical grammar. - Symbol::terminal(3), - - // This symbol's index has been decremented, because a previous rule - // was moved to the lexical grammar. - Symbol::non_terminal(1), - - // This token rule is now the third rule in the lexical grammar. - Symbol::terminal(2), - }), - })} - }, - - Variable{ - "rule_C", - VariableTypeNamed, - Rule::choice({Symbol::terminal(4), Blank{}}) - }, - - Variable{ - "rule_D", - VariableTypeNamed, - Repeat{Symbol::non_terminal(2)} - }, - })); - - AssertThat(lexical_grammar.variables, Equals(vector({ - // Strings become anonymous rules. - LexicalVariable{ - "ab", - VariableTypeAnonymous, - Seq{CharacterSet{{'a'}}, CharacterSet{{'b'}}}, - true - }, - - // Patterns become hidden rules. - LexicalVariable{ - "/cd+/", - VariableTypeAuxiliary, - Seq{CharacterSet{{'c'}}, Repeat{CharacterSet{{'d'}}}}, - false - }, - - // Rules marked as tokens become hidden rules. - LexicalVariable{ - "/(ef|g)+/", - VariableTypeAuxiliary, - Repeat{Rule::choice({ - Seq{CharacterSet{{'e'}}, CharacterSet{{'f'}}}, - CharacterSet{{'g'}}, - })}, - false - }, - - // This named rule was moved wholesale to the lexical grammar. - LexicalVariable{ - "rule_B", - VariableTypeNamed, - Repeat{CharacterSet{{'h'}}}, - false - }, - - // Strings become anonymous rules. - LexicalVariable{ - "i", - VariableTypeAnonymous, - CharacterSet{{'i'}}, - true - }, - }))); - }); - - it("does not create duplicate tokens in the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - { - "rule_A", - VariableTypeNamed, - Rule::seq({ - String{"ab"}, - Symbol::non_terminal(1), - String{"ab"}, - }) - }, - }, - {}, {}, {}, {}, {} - }); - - InitialSyntaxGrammar &syntax_grammar = get<0>(result); - LexicalGrammar &lexical_grammar = get<1>(result); - - AssertThat(syntax_grammar.variables, Equals(vector { - Variable{ - "rule_A", - VariableTypeNamed, - Rule::seq({ - Symbol::terminal(0), - Symbol::non_terminal(1), - Symbol::terminal(0) - }) - }, - })); - - AssertThat(lexical_grammar.variables, Equals(vector { - LexicalVariable{ - "ab", - VariableTypeAnonymous, - Seq{CharacterSet{{'a'}}, CharacterSet{{'b'}}}, - true - }, - })); - }); - - it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_A", - VariableTypeNamed, - Rule::seq({ Symbol::non_terminal(1), String{"ab"} }) - }, - Variable{ - "rule_B", - VariableTypeNamed, - String{"cd"} - }, - Variable{ - "rule_C", - VariableTypeNamed, - Rule::seq({ String{"ef"}, String{"cd"} }) - }, - }, - {}, {}, {}, {}, {} - }); - - InitialSyntaxGrammar &syntax_grammar = get<0>(result); - LexicalGrammar &lexical_grammar = get<1>(result); - - AssertThat(syntax_grammar.variables, Equals(vector({ - Variable{ - "rule_A", - VariableTypeNamed, - Rule::seq({ Symbol::non_terminal(1), Symbol::terminal(0) }) - }, - Variable{ - "rule_B", - VariableTypeNamed, - Symbol::terminal(1) - }, - Variable{ - "rule_C", - VariableTypeNamed, - Rule::seq({ Symbol::terminal(2), Symbol::terminal(1) }) - }, - }))); - - AssertThat(lexical_grammar.variables, Equals(vector { - LexicalVariable{ - "ab", - VariableTypeAnonymous, - Seq{CharacterSet{{'a'}}, CharacterSet{{'b'}}}, - true - }, - 
LexicalVariable{ - "cd", - VariableTypeAnonymous, - Seq{CharacterSet{{'c'}}, CharacterSet{{'d'}}}, - true - }, - LexicalVariable{ - "ef", - VariableTypeAnonymous, - Seq{CharacterSet{{'e'}}, CharacterSet{{'f'}}}, - true - }, - })); - }); - - it("does not move the start rule into the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_a", - VariableTypeNamed, - String{"a"} - }, - }, - {}, {}, {}, {}, {} - }); - - InitialSyntaxGrammar &syntax_grammar = get<0>(result); - LexicalGrammar &lexical_grammar = get<1>(result); - - AssertThat(syntax_grammar.variables.size(), Equals(1u)); - AssertThat(lexical_grammar.variables.size(), Equals(1u)); - }); - - it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_a", - VariableTypeNamed, - Symbol::non_terminal(2) - }, - Variable{ - "rule_b", - VariableTypeNamed, - String{"ok"} - }, - Variable{ - "rule_c", - VariableTypeNamed, - Repeat{Symbol::non_terminal(1)} - }, - Variable{ - "rule_d", - VariableTypeNamed, - Repeat{Seq{Symbol::non_terminal(1), Symbol::non_terminal(1)}} - }, - }, - { - String{" "} - }, - { - { Symbol::non_terminal(2), Symbol::non_terminal(3) } - }, - {}, {}, {} - }); - - InitialSyntaxGrammar &syntax_grammar = get<0>(result); - - AssertThat(syntax_grammar.variables.size(), Equals(3)); - AssertThat(syntax_grammar.expected_conflicts, Equals(set>({ - { Symbol::non_terminal(1), Symbol::non_terminal(2) }, - }))); - }); - - describe("handling extra tokens", [&]() { - it("adds inline extra tokens to the lexical grammar's separators", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{"rule_A", VariableTypeNamed, String{"x"}}, - }, - { - String{"y"}, - Pattern{" "}, - }, - {}, {}, {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError::none())); - - AssertThat(get<1>(result).separators.size(), Equals(2)); - AssertThat(get<1>(result).separators[0], Equals(Rule(CharacterSet{{'y'}}))); - AssertThat(get<1>(result).separators[1], Equals(Rule(CharacterSet{{' '}}))); - - AssertThat(get<0>(result).extra_tokens, IsEmpty()); - }); - - it("handles inline extra tokens that match tokens in the grammar", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{"rule_A", VariableTypeNamed, String{"x"}}, - Variable{"rule_B", VariableTypeNamed, String{"y"}}, - }, - { - String{"y"}, - }, - {}, {}, {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError::none())); - AssertThat(get<1>(result).separators.size(), Equals(0)); - AssertThat(get<0>(result).extra_tokens, Equals(set({ Symbol::terminal(1) }))); - }); - - it("updates extra symbols according to the new symbol numbers", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_A", - VariableTypeNamed, - Rule::seq({ String{"w"}, String{"x"}, Symbol::non_terminal(1) }) - }, - Variable{ - "rule_B", - VariableTypeNamed, - String{"y"} - }, - Variable{ - "rule_C", - VariableTypeNamed, - String{"z"} - }, - }, - { - Symbol::non_terminal(2), - }, - {}, {}, {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError::none())); - - AssertThat(get<0>(result).extra_tokens, Equals(set({ - { Symbol::terminal(3) }, - }))); - - AssertThat(get<1>(result).separators, IsEmpty()); - }); - - it("returns an error if any extra tokens are non-token symbols", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - Variable{ - "rule_A", - VariableTypeNamed, - Rule::seq({ String{"x"}, 
Symbol::non_terminal(1) }) - }, - Variable{ - "rule_B", - VariableTypeNamed, - Rule::seq({ String{"y"}, String{"z"} }) - }, - }, - { - Symbol::non_terminal(1) - }, - {}, {}, {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError( - TSCompileErrorTypeInvalidExtraToken, - "Non-token symbol rule_B can't be used as an extra token" - ))); - }); - - it("returns an error if any extra tokens are non-token rules", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - {"rule_A", VariableTypeNamed, String{"x"}}, - {"rule_B", VariableTypeNamed, String{"y"}}, - }, - { - Rule::choice({ Symbol::non_terminal(1), Blank{} }) - }, - {}, {}, {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError( - TSCompileErrorTypeInvalidExtraToken, - "Non-token rule expression can't be used as an extra token" - ))); - }); - }); - - it("returns an error if an external token has the same name as a non-terminal rule", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - { - "rule_A", - VariableTypeNamed, - Rule::seq({ String{"x"}, Symbol::non_terminal(1) }) - }, - { - "rule_B", - VariableTypeNamed, - Rule::seq({ String{"y"}, String{"z"} }) - }, - }, - {}, - {}, - { - Variable{"rule_A", VariableTypeNamed, Symbol::non_terminal(0)} - }, - {}, {} - }); - - AssertThat(get<2>(result), Equals(CompileError( - TSCompileErrorTypeInvalidExternalToken, - "Name 'rule_A' cannot be used for both an external token and a non-terminal rule" - ))); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/flatten_grammar_test.cc b/test/compiler/prepare_grammar/flatten_grammar_test.cc deleted file mode 100644 index 1a42df2b..00000000 --- a/test/compiler/prepare_grammar/flatten_grammar_test.cc +++ /dev/null @@ -1,130 +0,0 @@ -#include "test_helper.h" -#include "compiler/prepare_grammar/flatten_grammar.h" -#include "compiler/prepare_grammar/initial_syntax_grammar.h" -#include "compiler/syntax_grammar.h" -#include "helpers/stream_methods.h" - -START_TEST - -using namespace rules; -using prepare_grammar::flatten_rule; - -describe("flatten_grammar", []() { - it("associates each symbol with the precedence and associativity binding it to its successor", [&]() { - SyntaxVariable result = flatten_rule({ - "test", - VariableTypeNamed, - Rule::seq({ - Symbol::non_terminal(1), - Metadata::prec_left(101, Rule::seq({ - Symbol::non_terminal(2), - Rule::choice({ - Metadata::prec_right(102, Rule::seq({ - Symbol::non_terminal(3), - Symbol::non_terminal(4) - })), - Symbol::non_terminal(5), - }), - Symbol::non_terminal(6), - })), - Symbol::non_terminal(7), - }) - }); - - AssertThat(result.name, Equals("test")); - AssertThat(result.type, Equals(VariableTypeNamed)); - AssertThat(result.productions, Equals(vector({ - Production({ - {Symbol::non_terminal(1), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(2), 101, AssociativityLeft, Alias{}}, - {Symbol::non_terminal(3), 102, AssociativityRight, Alias{}}, - {Symbol::non_terminal(4), 101, AssociativityLeft, Alias{}}, - {Symbol::non_terminal(6), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(7), 0, AssociativityNone, Alias{}}, - }, 0), - Production({ - {Symbol::non_terminal(1), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(2), 101, AssociativityLeft, Alias{}}, - {Symbol::non_terminal(5), 101, AssociativityLeft, Alias{}}, - {Symbol::non_terminal(6), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(7), 0, AssociativityNone, Alias{}}, - }, 0) - }))); - }); - - it("stores the maximum dynamic precedence specified in each production", [&]() { - 
SyntaxVariable result = flatten_rule({ - "test", - VariableTypeNamed, - Rule::seq({ - Symbol::non_terminal(1), - Metadata::prec_dynamic(101, Rule::seq({ - Symbol::non_terminal(2), - Rule::choice({ - Metadata::prec_dynamic(102, Rule::seq({ - Symbol::non_terminal(3), - Symbol::non_terminal(4) - })), - Symbol::non_terminal(5), - }), - Symbol::non_terminal(6), - })), - Symbol::non_terminal(7), - }) - }); - - AssertThat(result.name, Equals("test")); - AssertThat(result.type, Equals(VariableTypeNamed)); - AssertThat(result.productions, Equals(vector<Production>({ - Production({ - {Symbol::non_terminal(1), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(2), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(3), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(4), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(6), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(7), 0, AssociativityNone, Alias{}}, - }, 102), - Production({ - {Symbol::non_terminal(1), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(2), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(5), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(6), 0, AssociativityNone, Alias{}}, - {Symbol::non_terminal(7), 0, AssociativityNone, Alias{}}, - }, 101), - }))); - }); - - it("uses the last assigned precedence", [&]() { - SyntaxVariable result = flatten_rule({ - "test1", - VariableTypeNamed, - Metadata::prec_left(101, Rule::seq({ - Symbol::non_terminal(1), - Symbol::non_terminal(2), - })) - }); - - AssertThat(result.productions, Equals(vector<Production>({ - Production({ - {Symbol::non_terminal(1), 101, AssociativityLeft, Alias{}}, - {Symbol::non_terminal(2), 101, AssociativityLeft, Alias{}}, - }, 0) - }))); - - result = flatten_rule({ - "test2", - VariableTypeNamed, - Metadata::prec_left(101, Rule::seq({ - Symbol::non_terminal(1), - })) - }); - - AssertThat(result.productions, Equals(vector<Production>({ - Production({ - {Symbol::non_terminal(1), 101, AssociativityLeft, Alias{}}, - }, 0) - }))); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/intern_symbols_test.cc b/test/compiler/prepare_grammar/intern_symbols_test.cc deleted file mode 100644 index 6816dad4..00000000 --- a/test/compiler/prepare_grammar/intern_symbols_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "test_helper.h" -#include "compiler/prepare_grammar/intern_symbols.h" -#include "compiler/grammar.h" -#include "compiler/rule.h" -#include "helpers/stream_methods.h" - -START_TEST - -using namespace rules; -using prepare_grammar::intern_symbols; - -describe("intern_symbols", []() { - it("replaces named symbols with numerically-indexed symbols", [&]() { - InputGrammar grammar; - grammar.variables = { - {"x", VariableTypeNamed, Rule::choice({ NamedSymbol{"y"}, NamedSymbol{"_z"} })}, - {"y", VariableTypeNamed, NamedSymbol{"_z"}}, - {"_z", VariableTypeNamed, String{"stuff"}} - }; - - auto result = intern_symbols(grammar); - - AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.variables, Equals(vector<Variable>{ - {"x", VariableTypeNamed, Rule::choice({ Symbol::non_terminal(1), Symbol::non_terminal(2) })}, - {"y", VariableTypeNamed, Symbol::non_terminal(2)}, - {"_z", VariableTypeHidden, String{"stuff"}}, - })); - }); - - describe("when there are symbols that reference undefined rules", [&]() { - it("returns an error", []() { - InputGrammar grammar; - grammar.variables = { - {"x", VariableTypeNamed, NamedSymbol{"y"}}, - }; - - auto result = intern_symbols(grammar); - - AssertThat(result.second.message, Equals("Undefined rule
'y'")); - }); - }); - - it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() { - InputGrammar grammar; - grammar.variables = { - {"x", VariableTypeNamed, Rule::choice({ NamedSymbol{"y"}, NamedSymbol{"z"} })}, - {"y", VariableTypeNamed, NamedSymbol{"z"}}, - {"z", VariableTypeNamed, String{"stuff"}} - }; - grammar.extra_tokens = { - NamedSymbol{"z"} - }; - - auto result = intern_symbols(grammar); - - AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.extra_tokens.size(), Equals(1)); - AssertThat(result.first.extra_tokens, Equals(vector({ Symbol::non_terminal(2) }))); - }); - - it("records any rule names that match external token names", [&]() { - InputGrammar grammar; - grammar.variables = { - {"x", VariableTypeNamed, Rule::choice({ NamedSymbol{"y"}, NamedSymbol{"z"} })}, - {"y", VariableTypeNamed, NamedSymbol{"z"}}, - {"z", VariableTypeNamed, String{"stuff"}}, - }; - grammar.external_tokens = { - NamedSymbol{"w"}, - NamedSymbol{"z"}, - }; - - auto result = intern_symbols(grammar); - - AssertThat(result.first.external_tokens, Equals(vector{ - Variable{ - "w", - VariableTypeNamed, - Symbol::external(0), - }, - Variable{ - "z", - VariableTypeNamed, - Symbol::non_terminal(2), - }, - })); - }); -}); - -END_TEST diff --git a/test/compiler/prepare_grammar/parse_regex_test.cc b/test/compiler/prepare_grammar/parse_regex_test.cc deleted file mode 100644 index 27fd43b1..00000000 --- a/test/compiler/prepare_grammar/parse_regex_test.cc +++ /dev/null @@ -1,412 +0,0 @@ -#include "test_helper.h" -#include "compiler/prepare_grammar/parse_regex.h" - -START_TEST - -using namespace rules; -using prepare_grammar::parse_regex; - -describe("parse_regex", []() { - struct ValidInputRow { - string description; - string pattern; - Rule rule; - }; - - vector valid_inputs = { - { - "character sets", - "[aAeE]", - CharacterSet{{ 'a', 'A', 'e', 'E' }} - }, - - { - "'.' 
characters as wildcards", - ".", - CharacterSet().include_all().exclude('\n') - }, - - { - "character classes", - "\\w-\\d-\\s-\\W-\\D-\\S", - Rule::seq({ - CharacterSet{{ - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_' }}, - CharacterSet{{ '-' }}, - CharacterSet{{ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }}, - CharacterSet{{ '-' }}, - CharacterSet{{ ' ', '\t', '\r', '\n' }}, - CharacterSet{{ '-' }}, - CharacterSet().include_all() - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_'), - CharacterSet{{ '-' }}, - CharacterSet().include_all().exclude('0', '9'), - CharacterSet{{ '-' }}, - CharacterSet().include_all() - .exclude(' ') - .exclude('\t') - .exclude('\r') - .exclude('\n') - }) - }, - - { - "choices", - "ab|cd|ef", - Rule::choice({ - Seq{ - CharacterSet{{'a'}}, - CharacterSet{{'b'}} - }, - Seq{ - CharacterSet{{'c'}}, - CharacterSet{{'d'}} - }, - Seq{ - CharacterSet{{'e'}}, - CharacterSet{{'f'}} - } - }) - }, - - { - "simple sequences", - "abc", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'b'}}, - CharacterSet{{'c'}} - }) - }, - - { - "character ranges", - "[12a-dA-D3]", - CharacterSet{{ - '1', '2', '3', - 'a', 'b', 'c', 'd', - 'A', 'B', 'C', 'D' - }} - }, - - { - "negated characters", - "[^a\\d]", - CharacterSet().include_all() - .exclude('a') - .exclude('0', '9') - }, - - { - "backslashes", - "\\\\", - CharacterSet{{'\\'}} - }, - - { - "dashes", - "a-b", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'-'}}, - CharacterSet{{'b'}} - }) - }, - - { - "literal dashes in character classes", - "[a-][\\d-a][\\S-a]", - Rule::seq({ - CharacterSet{{'a', '-'}}, - CharacterSet().include('0', '9').include('-').include('a'), - CharacterSet().include_all() - .exclude(' ') - .exclude('\t') - .exclude('\r') - .exclude('\n') - }) - }, - - { - "character groups in sequences", - "x([^x]|\\\\x)*x", - Rule::seq({ - CharacterSet{{'x'}}, - Rule::choice({ - Repeat{Rule::choice({ - CharacterSet().include_all().exclude('x'), - Rule::seq({ - CharacterSet{{'\\'}}, - CharacterSet{{'x'}} - }) - })}, - Blank{} - }), - CharacterSet{{'x'}} - }) - }, - - { - "choices in sequences", - "(a|b)cd", - Rule::seq({ - Rule::choice({ - CharacterSet{{'a'}}, - CharacterSet{{'b'}} }), - CharacterSet{{'c'}}, - CharacterSet{{'d'}} }) - }, - - { - "escaped parentheses", - "a\\(b", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'('}}, - CharacterSet{{'b'}}, - }) - }, - - { - "escaped brackets", - "\\[\\]", - Rule::seq({ - CharacterSet{{'['}}, - CharacterSet{{']'}}, - }) - }, - - { - "escaped brackets in choice", - "[\\[\\]]", - CharacterSet{{'[', ']'}} - }, - - { - "escaped brackets in range", - "[\\[-\\]]", - CharacterSet{{'[', '\\', ']'}} - }, - - { - "escaped characters in ranges", - "[\\0-\\n]", - CharacterSet().include(0, '\n') - }, - - { - "escaped periods", - "a\\.", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'.'}}, - }) - }, - - { - "escaped characters", - "\\t\\n\\r", - Rule::seq({ - CharacterSet{{'\t'}}, - CharacterSet{{'\n'}}, - CharacterSet{{'\r'}}, - }) - }, - - { - "plus repeats", - "(ab)+(cd)+", - Rule::seq({ - Repeat{Rule::seq({ CharacterSet{{'a'}}, CharacterSet{{'b'}} })}, - Repeat{Rule::seq({ CharacterSet{{'c'}}, CharacterSet{{'d'}} })}, - }) - }, - - { - "asterix repeats", - 
"(ab)*(cd)*", - Rule::seq({ - Rule::choice({ - Repeat{Rule::seq({ CharacterSet{{'a'}}, CharacterSet{{'b'}} })}, - Blank{}, - }), - Rule::choice({ - Repeat{Rule::seq({ CharacterSet{{'c'}}, CharacterSet{{'d'}} })}, - Blank{}, - }), - }) - }, - - { - "optional rules", - "a(bc)?", - Rule::seq({ - CharacterSet{{'a'}}, - Rule::choice({ - Rule::seq({ - CharacterSet{{'b'}}, - CharacterSet{{'c'}}, - }), - Blank{} - }), - }) - }, - - { - "choices containing negated character classes", - "/([^/]|(\\\\/))+/", - Rule::seq({ - CharacterSet{{'/'}}, - Repeat{Rule::choice({ - CharacterSet().include_all().exclude('/'), - Rule::seq({ - CharacterSet{{'\\'}}, - CharacterSet{{'/'}}, - }), - })}, - CharacterSet{{'/'}}, - }), - }, - - { - "characters with quantifiers", - "a{3}", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'a'}}, - CharacterSet{{'a'}}, - }), - }, - - { - "character classes with quantifiers", - "[a-f]{3}", - Rule::seq({ - CharacterSet().include('a', 'f'), - CharacterSet().include('a', 'f'), - CharacterSet().include('a', 'f'), - }), - }, - - { - "characters with open range quantifiers", - "a{,} b{1,} c{,2}", - Rule::seq({ - Rule::seq({ - Repeat{CharacterSet{{'a'}}}, - }), - CharacterSet{{' '}}, - Rule::seq({ - CharacterSet{{'b'}}, - Rule::choice({ - Repeat{CharacterSet{{'b'}}}, - Blank{} - }), - }), - CharacterSet{{' '}}, - Rule::seq({ - Rule::choice({CharacterSet{{'c'}}, Blank{}}), - Rule::choice({CharacterSet{{'c'}}, Blank{}}), - }), - }), - }, - - { - "characters with closed range quantifiers", - "a{2,4}", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'a'}}, - Rule::choice({CharacterSet{{'a'}}, Blank{}}), - Rule::choice({CharacterSet{{'a'}}, Blank{}}), - }), - }, - - { - "curly braces that aren't quantifiers", - "a{1b} c{2,d}", - Rule::seq({ - CharacterSet{{'a'}}, - CharacterSet{{'{'}}, - CharacterSet{{'1'}}, - CharacterSet{{'b'}}, - CharacterSet{{'}'}}, - CharacterSet{{' '}}, - CharacterSet{{'c'}}, - CharacterSet{{'{'}}, - CharacterSet{{'2'}}, - CharacterSet{{','}}, - CharacterSet{{'d'}}, - CharacterSet{{'}'}}, - }), - } - }; - - struct InvalidInputRow { - string description; - string pattern; - const char *message; - }; - - vector invalid_inputs = { - { - "mismatched open parens", - "(a", - "unmatched open paren", - }, - { - "mismatched nested open parens", - "((a) (b)", - "unmatched open paren", - }, - { - "mismatched close parens", - "a)", - "unmatched close paren", - }, - { - "mismatched nested close parens", - "((a) b))", - "unmatched close paren", - }, - { - "mismatched brackets for character classes", - "[a", - "unmatched open square bracket", - }, - { - "mismatched brackets for character classes", - "a]", - "unmatched close square bracket", - }, - { - "numbers out of order in range quantifiers", - "a{3,1}", - "numbers out of order in {} quantifier", - }, - }; - - for (auto &row : valid_inputs) { - it(("parses " + row.description).c_str(), [&]() { - auto result = parse_regex(row.pattern); - AssertThat(result.first, Equals(row.rule)); - }); - } - - for (auto &row : invalid_inputs) { - it(("handles invalid regexes with " + row.description).c_str(), [&]() { - auto result = parse_regex(row.pattern); - AssertThat(result.second.type, Equals(TSCompileErrorTypeInvalidRegex)); - AssertThat(result.second.message, Contains(row.message)); - }); - } -}); - -END_TEST diff --git a/test/compiler/rules/character_set_test.cc b/test/compiler/rules/character_set_test.cc deleted file mode 100644 index dfe67604..00000000 --- a/test/compiler/rules/character_set_test.cc +++ /dev/null @@ -1,325 
+0,0 @@ -#include "test_helper.h" -#include "compiler/rule.h" - -using namespace rules; - -START_TEST - -describe("CharacterSet", []() { - describe("equality", [&]() { - it("returns true for identical character sets", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - AssertThat(set1, Equals(set2)); - }); - - it("returns false for character sets that include different ranges", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include('a', 'c') - .include('f', 'm'); - - AssertThat(set1, !Equals(set2)); - AssertThat(set2, !Equals(set1)); - }); - - it("returns false for character sets that exclude different ranges", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .exclude('a', 'd') - .exclude('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include_all() - .exclude('a', 'c') - .exclude('f', 'm'); - - AssertThat(set1, !Equals(set2)); - AssertThat(set2, !Equals(set1)); - }); - - it("returns false for character sets with different sign", [&]() { - CharacterSet set1 = CharacterSet().include_all(); - CharacterSet set2 = CharacterSet(); - - AssertThat(set1, !Equals(set2)); - AssertThat(set2, !Equals(set1)); - }); - }); - - describe("hashing", [&]() { - it("returns the same number for identical character sets", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - AssertThat(hash<CharacterSet>()(set1), Equals(hash<CharacterSet>()(set2))); - }); - - it("returns different numbers for character sets that include different ranges", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'd') - .include('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include('a', 'c') - .include('f', 'm'); - - AssertThat(hash<CharacterSet>()(set1), !Equals(hash<CharacterSet>()(set2))); - AssertThat(hash<CharacterSet>()(set2), !Equals(hash<CharacterSet>()(set1))); - }); - - it("returns different numbers for character sets that exclude different ranges", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .exclude('a', 'd') - .exclude('f', 'm'); - - CharacterSet set2 = CharacterSet() - .include_all() - .exclude('a', 'c') - .exclude('f', 'm'); - - AssertThat(hash<CharacterSet>()(set1), !Equals(hash<CharacterSet>()(set2))); - AssertThat(hash<CharacterSet>()(set2), !Equals(hash<CharacterSet>()(set1))); - }); - - it("returns different numbers for character sets with different sign", [&]() { - CharacterSet set1 = CharacterSet().include_all(); - CharacterSet set2 = CharacterSet(); - - AssertThat(hash<CharacterSet>()(set1), !Equals(hash<CharacterSet>()(set2))); - AssertThat(hash<CharacterSet>()(set2), !Equals(hash<CharacterSet>()(set1))); - }); - }); - - describe("::is_empty", [&]() { - it("returns true for empty character sets", [&]() { - AssertThat(CharacterSet().is_empty(), Equals(true)); - }); - - it("returns false for full character sets", [&]() { - AssertThat(CharacterSet().include_all().is_empty(), Equals(false)); - }); - - it("returns false for character sets that include some characters", [&]() { - AssertThat(CharacterSet().include('x').is_empty(), Equals(false)); - }); - }); - - describe("::include", [&]() { - describe("when the set has a whitelist of characters", [&]() { - it("adds included characters", [&]() { - CharacterSet set1 = CharacterSet().include('a', 'd'); - AssertThat(set1, Equals(CharacterSet() - .include('a') - .include('b') - .include('c') - .include('d'))); - }); - }); - - describe("when the set has a blacklist of characters", [&]() { -
it("removes excluded characters", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .exclude('a', 'g') - .include('c', 'e'); - - AssertThat(set1, Equals(CharacterSet() - .include_all() - .exclude('a') - .exclude('b') - .exclude('f') - .exclude('g'))); - }); - - it("does nothing if the character are already not excluded", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .include('a', 'c'); - - AssertThat(set1, Equals(CharacterSet().include_all())); - }); - }); - }); - - describe("::exclude", [&]() { - describe("when the set has a whitelist of characters", [&]() { - it("removes included characters", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'g') - .exclude('c', 'e'); - - AssertThat(set1, Equals(CharacterSet() - .include('a') - .include('b') - .include('f') - .include('g'))); - }); - - it("does nothing if the character's are already not included", [&]() { - CharacterSet set1 = CharacterSet().exclude('a', 'c'); - AssertThat(set1, Equals(CharacterSet())); - }); - }); - - describe("when the set has a blacklist of characters", [&]() { - it("removes excluded characters", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .exclude('a', 'd'); - - AssertThat(set1, Equals(CharacterSet() - .include_all() - .exclude('a') - .exclude('b') - .exclude('c') - .exclude('d'))); - }); - }); - }); - - describe("::remove_set", []() { - CharacterSet intersection; - - describe("for a set with whitelisted characters", [&]() { - describe("when the subtractend has whitelisted characters", [&]() { - it("removes the included characters that the other set also includes", [&]() { - CharacterSet set1 = CharacterSet().include('a', 'z'); - set1.remove_set(CharacterSet().include('d', 's')); - AssertThat(set1, Equals(CharacterSet() - .include('a', 'c') - .include('t', 'z'))); - }); - - it("returns the characters that were removed", [&]() { - CharacterSet set1 = CharacterSet().include('a', 'z'); - intersection = set1.remove_set(CharacterSet().include('d', 's')); - AssertThat(intersection, Equals(CharacterSet() - .include('d', 's'))); - }); - - it("returns the empty set when the sets are disjoint", [&]() { - CharacterSet set1 = CharacterSet().include('a', 'z'); - intersection = set1.remove_set(CharacterSet().include('A', 'Z')); - AssertThat(set1, Equals(CharacterSet().include('a', 'z'))); - AssertThat(intersection, Equals(CharacterSet())); - }); - }); - - describe("when the subtractend has blacklisted characters", [&]() { - it("removes the included characters that are not excluded by the other set", [&]() { - CharacterSet set1 = CharacterSet().include('a', 'f'); - - intersection = set1.remove_set(CharacterSet() - .include_all() - .exclude('d', 'z')); - - AssertThat(set1, Equals(CharacterSet() - .include('d', 'f'))); - AssertThat(intersection, Equals(CharacterSet() - .include('a', 'c'))); - }); - }); - }); - - describe("for a set with blacklisted characters", [&]() { - describe("when the subtractend has whitelisted characters", [&]() { - it("adds the subtractend's inclusions to the receiver's exclusions", [&]() { - CharacterSet set1 = CharacterSet() - .include_all() - .exclude('a', 'f'); - - intersection = set1.remove_set(CharacterSet() - .include('x', 'z')); - - AssertThat(set1, Equals(CharacterSet() - .include_all() - .exclude('a', 'f') - .exclude('x', 'z'))); - - AssertThat(intersection, Equals(CharacterSet().include('x', 'z'))); - }); - }); - - describe("when the subtractend has blacklisted characters", [&]() { - it("includes only the characters excluded by the 
-          CharacterSet set1 = CharacterSet()
-            .include_all()
-            .exclude('a', 'm');
-
-          set1.remove_set(CharacterSet()
-            .include_all()
-            .exclude('d', 'z'));
-
-          AssertThat(set1, Equals(CharacterSet()
-            .include('n', 'z')));
-        });
-
-        it("returns the characters excluded by neither set", [&]() {
-          CharacterSet set1 = CharacterSet()
-            .include_all()
-            .exclude('a', 'm');
-
-          intersection = set1.remove_set(CharacterSet()
-            .include_all()
-            .exclude('d', 'z'));
-
-          AssertThat(intersection, Equals(CharacterSet()
-            .include_all()
-            .exclude('a', 'z')));
-        });
-
-        it("works when the sets are disjoint", [&]() {
-          CharacterSet set1 = CharacterSet()
-            .include_all()
-            .exclude('a', 'm');
-
-          intersection = set1.remove_set(CharacterSet()
-            .include_all()
-            .exclude('d', 'z'));
-
-          AssertThat(set1, Equals(CharacterSet()
-            .include('n', 'z')));
-
-          AssertThat(intersection, Equals(CharacterSet()
-            .include_all()
-            .exclude('a', 'z')));
-        });
-      });
-    });
-  });
-
-  describe("::included_ranges", [&]() {
-    it("consolidates consecutive sequences of characters into ranges", [&]() {
-      CharacterSet set1 = CharacterSet()
-        .include('a', 'c')
-        .include('e', 'j')
-        .include('m')
-        .include('z');
-
-      AssertThat(set1.included_ranges(), Equals(vector<CharacterRange>({
-        CharacterRange{'a', 'c'},
-        CharacterRange{'e', 'j'},
-        CharacterRange('m'),
-        CharacterRange('z'),
-      })));
-    });
-  });
-});
-
-END_TEST
diff --git a/test/compiler/rules/rule_test.cc b/test/compiler/rules/rule_test.cc
deleted file mode 100644
index 389cb1dd..00000000
--- a/test/compiler/rules/rule_test.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-#include "test_helper.h"
-#include "compiler/rule.h"
-
-using namespace rules;
-
-START_TEST
-
-describe("Repeat", []() {
-  describe("constructing repeats", [&]() {
-    it("doesn't create redundant repeats", [&]() {
-      Rule symbol = Symbol::non_terminal(1);
-      Rule repeat = Rule::repeat(Rule(symbol));
-      Rule outer_repeat = Rule::repeat(Rule(repeat));
-
-      AssertThat(repeat, !Equals(symbol));
-      AssertThat(outer_repeat, Equals(repeat));
-    });
-  });
-
-  describe("adding metadata to rules", [&]() {
-    it("doesn't create redundant metadata rules", [&]() {
-      Rule symbol = Symbol::non_terminal(1);
-      Rule outer_rule = Metadata::prec(2, Metadata::prec(1, Rule(symbol)));
-      AssertThat(outer_rule, Equals(Rule(Metadata::prec(1, Rule(symbol)))));
-    });
-  });
-
-  describe("constructing choices", [&]() {
-    it("eliminates duplicate members", [&]() {
-      Rule rule = Rule::choice({
-        Rule::seq({ NamedSymbol{"one"}, NamedSymbol{"two"} }),
-        NamedSymbol{"three"},
-        Rule::seq({ NamedSymbol{"one"}, NamedSymbol{"two"} })
-      });
-
-      AssertThat(rule, Equals(Rule(Choice{{
-        Rule::seq({ NamedSymbol{"one"}, NamedSymbol{"two"} }),
-        NamedSymbol{"three"},
-      }})));
-
-      rule = Rule::choice({
-        Blank{},
-        Blank{},
-        Rule::choice({
-          Blank{},
-          NamedSymbol{"four"}
-        })
-      });
-
-      AssertThat(rule, Equals(Rule::choice({Blank{}, NamedSymbol{"four"}})));
-    });
-
-    it("eliminates duplicates within nested choices", [&]() {
-      Rule rule = Rule::choice({
-        Rule::seq({
-          NamedSymbol{"one"},
-          NamedSymbol{"two"}
-        }),
-        Rule::choice({
-          NamedSymbol{"three"},
-          Rule::seq({
-            NamedSymbol{"one"},
-            NamedSymbol{"two"}
-          })
-        })
-      });
-
-      AssertThat(rule, Equals(Rule(Choice{{
-        Rule::seq({
-          NamedSymbol{"one"},
-          NamedSymbol{"two"},
-        }),
-        NamedSymbol{"three"},
-      }})));
-    });
-
-    it("doesn't construct a choice if there's only one unique member", [&]() {
-      Rule rule = Rule::choice({
-        NamedSymbol{"one"},
-        Rule::choice({
-          NamedSymbol{"one"},
-        })
-      });
-
-      AssertThat(rule, Equals(Rule(NamedSymbol{"one"})));
-    });
-  });
-});
-
-END_TEST
diff --git a/test/compiler/util/string_helpers_test.cc b/test/compiler/util/string_helpers_test.cc
deleted file mode 100644
index 62b883de..00000000
--- a/test/compiler/util/string_helpers_test.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "test_helper.h"
-#include "compiler/util/string_helpers.h"
-
-using util::escape_char;
-
-START_TEST
-
-describe("escape_char", []() {
-  it("returns ascii characters as strings", [&]() {
-    AssertThat(escape_char('x'), Equals("'x'"));
-  });
-
-  it("escapes special characters with backslashes", [&]() {
-    AssertThat(escape_char('\\'), Equals("'\\\\'"));
-    AssertThat(escape_char('\n'), Equals("'\\n'"));
-    AssertThat(escape_char('\t'), Equals("'\\t'"));
-    AssertThat(escape_char('\r'), Equals("'\\r'"));
-    AssertThat(escape_char('\''), Equals("'\\''"));
-  });
-
-  it("prints non-ascii characters as numbers", [&]() {
-    AssertThat(escape_char(256), Equals("256"));
-  });
-});
-
-END_TEST
diff --git a/test/helpers/dedent.h b/test/helpers/dedent.h
deleted file mode 100644
index 1387acf9..00000000
--- a/test/helpers/dedent.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "compiler/util/string_helpers.h"
-#include <string>
-
-static std::string dedent(std::string input) {
-  size_t indent_level = input.find_first_not_of("\n ") - input.find_first_not_of("\n");
-  std::string whitespace = "\n" + std::string(indent_level, ' ');
-  tree_sitter::util::str_replace(&input, whitespace, "\n");
-  return input.substr(
-    input.find_first_not_of("\n "),
-    input.find_last_not_of("\n ") + 1
-  );
-}
diff --git a/test/helpers/file_helpers.cc b/test/helpers/file_helpers.cc
deleted file mode 100644
index f2da6b77..00000000
--- a/test/helpers/file_helpers.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-#include "helpers/file_helpers.h"
-#include <sys/stat.h>
-#include <fstream>
-#include <cerrno>
-
-using std::string;
-using std::ifstream;
-using std::istreambuf_iterator;
-using std::ofstream;
-using std::vector;
-
-bool file_exists(const string &path) {
-  struct stat file_stat;
-  return stat(path.c_str(), &file_stat) == 0;
-}
-
-int get_modified_time(const string &path) {
-  struct stat file_stat;
-  if (stat(path.c_str(), &file_stat) != 0) {
-    if (errno != ENOENT)
-      fprintf(stderr, "Error in stat() for path: %s\n", path.c_str());
-    return 0;
-  }
-  return file_stat.st_mtime;
-}
-
-string read_file(const string &path) {
-  struct stat file_stat;
-  if (stat(path.c_str(), &file_stat) != 0 || (file_stat.st_mode & S_IFMT) != S_IFREG) return "";
-  ifstream file(path, std::ios::binary);
-  if (!file.good()) return "";
-  istreambuf_iterator<char> file_iterator(file), end_iterator;
-  string content(file_iterator, end_iterator);
-  file.close();
-  return content;
-}
-
-void write_file(const string &path, const string &content) {
-  ofstream file(path);
-  file << content;
-  file.close();
-}
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-const char *path_separator = "\\";
-
-vector<string> list_directory(const string &path) {
-  vector<string> result;
-
-  WIN32_FIND_DATA search_data;
-  HANDLE handle = FindFirstFile((path + "\\*").c_str(), &search_data);
-  while (handle != INVALID_HANDLE_VALUE) {
-    string name(search_data.cFileName);
-    result.push_back(name);
-    if (FindNextFile(handle, &search_data) == FALSE) break;
-  }
-
-  return result;
-}
-
-#else
-
-#include <dirent.h>
-
-const char *path_separator = "/";
-
-vector<string> list_directory(const string &path) {
-  vector<string> result;
-
-  DIR *dir = opendir(path.c_str());
-  if (!dir) {
-    printf("\nTest error - no such directory '%s'", path.c_str());
-    return result;
-  }
-
-  struct dirent *dir_entry;
-  while ((dir_entry = readdir(dir))) {
-    string name(dir_entry->d_name);
-    if (name != "." && name != "..") {
-      result.push_back(name);
-    }
-  }
-
-  closedir(dir);
-  return result;
-}
-
-#endif
-
-string join_path(const vector<string> &parts) {
-  string result;
-  for (const string &part : parts) {
-    if (!result.empty()) result += path_separator;
-    result += part;
-  }
-  return result;
-}
diff --git a/test/helpers/file_helpers.h b/test/helpers/file_helpers.h
deleted file mode 100644
index 23867367..00000000
--- a/test/helpers/file_helpers.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef HELPERS_FILE_HELPERS_H_
-#define HELPERS_FILE_HELPERS_H_
-
-#include <string>
-#include <vector>
-#include <ctime>
-
-bool file_exists(const std::string &path);
-int get_modified_time(const std::string &path);
-std::string read_file(const std::string &path);
-void write_file(const std::string &path, const std::string &content);
-std::vector<std::string> list_directory(const std::string &path);
-std::string join_path(const std::vector<std::string> &parts);
-
-#endif // HELPERS_FILE_HELPERS_H_
diff --git a/test/helpers/load_language.cc b/test/helpers/load_language.cc
deleted file mode 100644
index c4bb982c..00000000
--- a/test/helpers/load_language.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-#include "helpers/load_language.h"
-#include "helpers/file_helpers.h"
-#include <map>
-#include <string>
-#include <fstream>
-#include <sstream>
-#include <vector>
-#include <cstdio>
-#include <cstdlib>
-#include "tree_sitter/compiler.h"
-
-using std::map;
-using std::string;
-using std::ifstream;
-using std::ofstream;
-using std::istreambuf_iterator;
-using std::vector;
-using std::to_string;
-
-map<string, const TSLanguage *> loaded_languages;
-int libcompiler_mtime = -1;
-int compile_result_count = 0;
-
-string get_language_function_name(string language_name) {
-  string result = "tree_sitter_";
-  for (auto c : language_name) {
-    if (c == '-') {
-      result += '_';
-    } else {
-      result += c;
-    }
-  }
-  return result;
-}
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-const char *libcompiler_path = "test\\lib\\compiler.lib";
-const char *dylib_extension = ".dll";
-
-static string get_cwd() {
-  string result(255, 0);
-  result.resize(GetCurrentDirectory(result.size(), &result[0]));
-  return result;
-}
-
-static int compile_parser(
-  string source_filename,
-  string scanner_source_filename,
-  string output_filename,
-  string header_dirname
-) {
-  CreateDirectory("out", nullptr);
-  CreateDirectory("out\\tmp", nullptr);
-
-  string command = "cl.exe";
-  command += " /nologo";
-  command += " /LD";
-  command += " /I " + header_dirname;
-  command += " /Od";
-  command += " " + source_filename;
-  command += " " + scanner_source_filename;
-  command += " /link /out:" + output_filename;
-  return system(command.c_str());
-}
-
-static void *load_function_from_library(string library_path, string function_name) {
-  HINSTANCE library = LoadLibrary(library_path.c_str());
-  if (!library) {
-    fputs(("Could not load library " + library_path).c_str(), stderr);
-    abort();
-  }
-
-  void *function = static_cast<void *>(GetProcAddress(library, function_name.c_str()));
-  if (!function) {
-    fputs(("Could not find function " + function_name).c_str(), stderr);
-    abort();
-  }
-
-  return function;
-}
-
-#else // POSIX
-
-#ifdef __linux
-
-const char *libcompiler_path = "out/Test/obj.target/libcompiler.a";
-const char *dylib_extension = ".so";
-
-#else // macOS
-
-const char *libcompiler_path = "out/Test/libcompiler.a";
-const char *dylib_extension = ".dylib";
-
-#endif
-
-#include <dlfcn.h>
-#include <sys/stat.h>
-
-static string get_cwd() {
-  return string(getenv("PWD"));
-}
-
-static int compile_parser(
-  string source_filename,
-  string scanner_source_filename,
-  string output_filename,
-  string header_dirname
-) {
-  mkdir("out", 0777);
-  mkdir("out/tmp", 0777);
-
-  const char *compiler_name = getenv("CXX");
-  if (!compiler_name) compiler_name = "c++";
-
-  string command = compiler_name;
-  command += " -shared";
-  command += " -fPIC ";
-  command += " -I " + header_dirname;
-  command += " -o " + output_filename;
-  command += " -O0";
-  command += " -xc " + source_filename;
-
-  if (!scanner_source_filename.empty()) {
-    command += " -g";
-    string extension = scanner_source_filename.substr(scanner_source_filename.rfind("."));
-    if (extension == ".c") {
-      command += " -xc " + scanner_source_filename;
-    } else {
-      command += " -xc++ " + scanner_source_filename;
-    }
-  }
-
-  return system(command.c_str());
-}
-
-static void *load_function_from_library(string library_path, string function_name) {
-  void *parser_lib = dlopen(library_path.c_str(), RTLD_NOW);
-  if (!parser_lib) {
-    fputs(dlerror(), stderr);
-    abort();
-  }
-
-  void *language_function = dlsym(parser_lib, function_name.c_str());
-  if (!language_function) {
-    fputs(dlerror(), stderr);
-    abort();
-  }
-
-  return language_function;
-}
-
-#endif
-
-static const TSLanguage *load_language(const string &source_filename,
-                                       const string &lib_filename,
-                                       const string &language_name,
-                                       string external_scanner_filename = "") {
-  string language_function_name = get_language_function_name(language_name);
-  string header_dir = join_path({get_cwd(), "include"});
-  int source_mtime = get_modified_time(source_filename);
-  int header_mtime = get_modified_time(join_path({header_dir, "tree_sitter", "parser.h"}));
-  int lib_mtime = get_modified_time(lib_filename);
-  int external_scanner_mtime = get_modified_time(external_scanner_filename);
-
-  if (!header_mtime || lib_mtime < header_mtime || lib_mtime < source_mtime ||
-      lib_mtime < external_scanner_mtime) {
-    const char *compiler_name = getenv("CXX");
-    if (!compiler_name) compiler_name = "c++";
-
-    int status_code = compile_parser(
-      source_filename,
-      external_scanner_filename,
-      lib_filename,
-      header_dir
-    );
-
-    if (status_code != 0) abort();
-  }
-
-  void *language_function = load_function_from_library(lib_filename, language_function_name);
-
-  return reinterpret_cast<const TSLanguage *(*)()>(language_function)();
-}
-
-const TSLanguage *load_test_language(const string &name,
-                                     const TSCompileResult &compile_result,
-                                     string external_scanner_path) {
-  if (compile_result.error_type != TSCompileErrorTypeNone) {
-    fputs((string("Compilation failed ") + compile_result.error_message).c_str(), stderr);
-    abort();
-  }
-
-  string source_filename = join_path({"out", "tmp", "compile-result-" + to_string(compile_result_count) + ".c"});
-  string lib_filename = source_filename + dylib_extension;
-  compile_result_count++;
-
-  ofstream source_file;
-  source_file.open(source_filename);
-  source_file << compile_result.code;
-  source_file.close();
-
-  auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
-  free(compile_result.code);
-  return language;
-}
-
-const TSLanguage *load_real_language(const string &language_name) {
-  if (loaded_languages[language_name])
-    return loaded_languages[language_name];
-
-  string language_dir = join_path({"test", "fixtures", "grammars", language_name});
-  string grammar_filename = join_path({language_dir, "src", "grammar.json"});
-  string parser_filename = join_path({language_dir, "src", "parser.c"});
-  string external_scanner_filename = join_path({language_dir, "src", "scanner.cc"});
-  if (!file_exists(external_scanner_filename)) {
-    external_scanner_filename = join_path({language_dir, "src", "scanner.c"});
-    if (!file_exists(external_scanner_filename)) {
-      external_scanner_filename = "";
-    }
-  }
-
-  int grammar_mtime = get_modified_time(grammar_filename);
-  if (!grammar_mtime) return nullptr;
-
-  if (libcompiler_mtime == -1) {
-    libcompiler_mtime = get_modified_time(libcompiler_path);
-    if (!libcompiler_mtime) return nullptr;
-  }
-
-  int parser_mtime = get_modified_time(parser_filename);
-
-  if (parser_mtime <= grammar_mtime || parser_mtime <= libcompiler_mtime) {
-    printf("\n" "Regenerating the %s parser...\n", language_name.c_str());
-
-    string grammar_json = read_file(grammar_filename);
-    TSCompileResult result = ts_compile_grammar(grammar_json.c_str(), nullptr);
-    if (result.error_type != TSCompileErrorTypeNone) {
-      fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message);
-      return nullptr;
-    }
-
-    write_file(parser_filename, result.code);
-  }
-
-  string lib_filename = join_path({"out", "tmp", language_name + dylib_extension});
-  const TSLanguage *language = load_language(parser_filename, lib_filename, language_name, external_scanner_filename);
-  loaded_languages[language_name] = language;
-  return language;
-};
diff --git a/test/helpers/load_language.h b/test/helpers/load_language.h
deleted file mode 100644
index c34a33ca..00000000
--- a/test/helpers/load_language.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef HELPERS_LOAD_LANGUAGE_H_
-#define HELPERS_LOAD_LANGUAGE_H_
-
-#include "tree_sitter/compiler.h"
-#include "tree_sitter/runtime.h"
-#include <string>
-
-const TSLanguage *load_real_language(const std::string &name);
-
-const TSLanguage *load_test_language(const std::string &name,
-                                     const TSCompileResult &compile_result,
-                                     std::string external_scanner_path = "");
-
-#endif // HELPERS_LOAD_LANGUAGE_H_
diff --git a/test/helpers/point_helpers.cc b/test/helpers/point_helpers.cc
deleted file mode 100644
index ace303a0..00000000
--- a/test/helpers/point_helpers.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "./point_helpers.h"
-#include <cassert>
-#include <ostream>
-#include <string>
-#include "runtime/length.h"
-#include "tree_sitter/runtime.h"
-
-using namespace std;
-
-bool operator==(const TSPoint &left, const TSPoint &right) {
-  return left.row == right.row && left.column == right.column;
-}
-
-bool operator==(const TSRange &left, const TSRange &right) {
-  return (
-    left.start_byte == right.start_byte &&
-    left.end_byte == right.end_byte &&
-    left.start_point == right.start_point &&
-    left.end_point == right.end_point
-  );
-}
-
-bool operator==(const Length &left, const Length &right) {
-  return left.bytes == right.bytes && left.extent == right.extent;
-}
-
-bool operator<(const TSPoint &left, const TSPoint &right) {
-  if (left.row < right.row) return true;
-  if (left.row > right.row) return false;
-
-  return left.column < right.column;
-}
-
-bool operator>(const TSPoint &left, const TSPoint &right) {
-  return right < left;
-}
-
-Length operator*(const Length &length, uint32_t factor) {
-  return {length.bytes * factor, {0, length.extent.column * factor}};
-}
-
-Length operator+(const Length &left, const Length &right) {
-  return length_add(left, right);
-}
-
-std::ostream &operator<<(std::ostream &stream, const TSPoint &point) {
-  return stream << "{" << point.row << ", " << point.column << "}";
-}
-
-std::ostream &operator<<(std::ostream &stream, const TSRange &range) {
-  return stream << "{" << range.start_point << ", " << range.end_point << "}";
-}
-
-ostream &operator<<(ostream &stream, const Length &length) {
-  return stream << "{bytes:" << length.bytes << ", extent:" << length.extent << "}";
-}
-
-TSPoint extent_for_string(const string &text, size_t end_index) {
-  if (end_index > text.size()) end_index = text.size();
-  TSPoint result = {0, 0};
-  for (size_t i = 0; i < end_index; i++) {
-    if (text[i] == '\n') {
-      result.row++;
-      result.column = 0;
-    } else {
-      result.column++;
-    }
-  }
-  return result;
-}
-
-TSRange range_for_substring(const string &text, const string &substring) {
-  size_t start = text.find(substring);
-  assert(start != string::npos);
-  size_t end = start + substring.size();
-  return TSRange {
-    extent_for_string(text, start),
-    extent_for_string(text, end),
-    static_cast<uint32_t>(start),
-    static_cast<uint32_t>(end),
-  };
-};
diff --git a/test/helpers/point_helpers.h b/test/helpers/point_helpers.h
deleted file mode 100644
index a64abc0d..00000000
--- a/test/helpers/point_helpers.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef HELPERS_POINT_HELPERS_H_
-#define HELPERS_POINT_HELPERS_H_
-
-#include "runtime/length.h"
-#include <iostream>
-
-bool operator==(const TSPoint &left, const TSPoint &right);
-
-bool operator<(const TSPoint &left, const TSPoint &right);
-
-bool operator>(const TSPoint &left, const TSPoint &right);
-
-bool operator==(const TSRange &left, const TSRange &right);
-
-bool operator==(const Length &left, const Length &right);
-
-Length operator*(const Length &length, uint32_t factor);
-
-Length operator+(const Length &left, const Length &right);
-
-std::ostream &operator<<(std::ostream &stream, const TSPoint &point);
-
-std::ostream &operator<<(std::ostream &stream, const TSRange &range);
-
-std::ostream &operator<<(std::ostream &stream, const Length &length);
-
-TSPoint extent_for_string(const std::string &text, size_t end_index = std::string::npos);
-
-TSRange range_for_substring(const std::string &text, const std::string &substring);
-
-#endif // HELPERS_POINT_HELPERS_H_
diff --git a/test/helpers/random_helpers.cc b/test/helpers/random_helpers.cc
deleted file mode 100644
index 480e18d2..00000000
--- a/test/helpers/random_helpers.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "helpers/random_helpers.h"
-#include <ctime>
-#include <random>
-#include <string>
-#include <vector>
-
-using std::string;
-using std::vector;
-
-unsigned get_time_as_seed() {
-  return time(nullptr);
-}
-
-void Generator::reseed(unsigned seed) {
-  engine.seed(seed);
-}
-
-unsigned Generator::operator()() {
-  return distribution(engine);
-}
-
-unsigned Generator::operator()(unsigned max) {
-  return distribution(engine) % max;
-}
-
-string Generator::str(char min, char max) {
-  string result;
-  size_t length = operator()(12);
-  for (size_t i = 0; i < length; i++) {
-    result += (min + operator()(max - min));
-  }
-  return result;
-}
-
-static string operator_characters = "!(){}[]<>+-=";
-
-string Generator::words(size_t count) {
-  string result;
-  bool just_inserted_word = false;
-  for (size_t i = 0; i < count; i++) {
-    if (operator()(10) < 6) {
-      result += operator_characters[operator()(operator_characters.size())];
-    } else {
-      if (just_inserted_word)
-        result += " ";
-      result += str('a', 'z');
-      just_inserted_word = true;
-    }
-  }
-  return result;
-}
-
-string Generator::select(const vector<string> &list) {
-  return list[operator()(list.size())];
-}
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-void Generator::sleep_some() {
-  Sleep(operator()(5));
-}
-
-#else
-
-#include <unistd.h>
-
-void Generator::sleep_some() {
-  usleep(operator()(5 * 1000));
-}
-
-#endif
diff --git a/test/helpers/random_helpers.h b/test/helpers/random_helpers.h
deleted file mode 100644
index f5813d29..00000000
--- a/test/helpers/random_helpers.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef HELPERS_RANDOM_HELPERS_H_
-#define HELPERS_RANDOM_HELPERS_H_
-
-#include <random>
-#include <string>
-#include <vector>
-
-unsigned get_time_as_seed();
-
-class Generator {
-  std::default_random_engine engine;
-  std::uniform_int_distribution<unsigned> distribution;
-
-public:
-  Generator(uint32_t seed) : engine{seed} {}
-
-  void reseed(unsigned);
-  unsigned operator()();
-  unsigned operator()(unsigned max);
-  std::string words(size_t count);
-  std::string str(char min, char max);
-  std::string select(const std::vector<std::string> &);
-  void sleep_some();
-};
-
-#endif // HELPERS_RANDOM_HELPERS_H_
diff --git a/test/helpers/read_test_entries.cc b/test/helpers/read_test_entries.cc
deleted file mode 100644
index fdeb99f7..00000000
--- a/test/helpers/read_test_entries.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "helpers/read_test_entries.h"
-#include <regex>
-#include <string>
-#include <vector>
-#include "helpers/file_helpers.h"
-
-using std::move;
-using std::regex;
-using std::regex_search;
-using std::regex_replace;
-using std::regex_constants::extended;
-using std::smatch;
-using std::string;
-using std::vector;
-
-static string trim_output(const string &input) {
-  string result(input);
-  result = regex_replace(result, regex("[\n\t ]+", extended), string(" "));
-  result = regex_replace(result, regex("^ ", extended), string(""));
-  result = regex_replace(result, regex(" $", extended), string(""));
-  result = regex_replace(result, regex("\\) \\)", extended), string("))"));
-  return result;
-}
-
-static vector<TestEntry> parse_test_entries(string content) {
-  regex header_pattern("(^|\n)===+\n" "([^=]+)\n" "===+\n", extended);
-  regex separator_pattern("---+\r?\n", extended);
-  vector<string> descriptions;
-  vector<string> bodies;
-
-  for (;;) {
-    smatch matches;
-    if (!regex_search(content, matches, header_pattern) || matches.empty())
-      break;
-
-    string description = matches[2].str();
-    descriptions.push_back(description);
-
-    if (!bodies.empty())
-      bodies.back().erase(matches.position());
-    content.erase(0, matches.position() + matches[0].length());
-    bodies.push_back(content);
-  }
-
-  vector<TestEntry> result;
-  for (size_t i = 0; i < descriptions.size(); i++) {
-    string body = bodies[i];
-    smatch matches;
-    if (regex_search(body, matches, separator_pattern)) {
-      result.push_back({
-        descriptions[i],
-        body.substr(0, matches.position() - 1),
-        trim_output(body.substr(matches.position() + matches[0].length()))
-      });
-    } else {
-      puts(("Invalid corpus entry with description: " + descriptions[i]).c_str());
-      abort();
-    }
-  }
-
-  return result;
-}
-
-vector<TestEntry> read_real_language_corpus(string language_name) {
-  vector<TestEntry> result;
-
-  string corpus_directory = join_path({"test", "fixtures", "grammars", language_name, "corpus"});
-  for (string &test_filename : list_directory(corpus_directory)) {
-    for (TestEntry &entry : parse_test_entries(read_file(join_path({corpus_directory, test_filename})))) {
-      result.push_back(entry);
-    }
-  }
-
-  string error_test_filename = join_path({"test", "fixtures", "error_corpus", language_name + "_errors.txt"});
-  for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) {
-    result.push_back(entry);
-  }
-
-  return result;
-}
-
-vector<TestEntry> read_test_language_corpus(string language_name) {
-  vector<TestEntry> result;
-
-  string test_directory = join_path({"test", "fixtures", "test_grammars", language_name});
-  for (string &test_filename : list_directory(test_directory)) {
-    for (TestEntry &entry : parse_test_entries(read_file(join_path({test_directory, test_filename})))) {
-      result.push_back(entry);
-    }
-  }
-
-  return result;
-}
-
-vector<ExampleEntry> examples_for_language(string language_name) {
-  vector<ExampleEntry> result;
-  string examples_directory = join_path({"test", "fixtures", "grammars", language_name, "examples"});
-  for (string &filename : list_directory(examples_directory)) {
-    auto content = read_file(join_path({examples_directory, filename}));
-    if (!content.empty()) {
-      result.push_back({filename, move(content)});
-    }
-  }
-  return result;
-}
diff --git a/test/helpers/read_test_entries.h b/test/helpers/read_test_entries.h
deleted file mode 100644
index 016b19b4..00000000
--- a/test/helpers/read_test_entries.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef HELPERS_READ_TEST_ENTRIES_H_
-#define HELPERS_READ_TEST_ENTRIES_H_
-
-#include <string>
-#include <vector>
-
-struct TestEntry {
-  std::string description;
-  std::string input;
-  std::string tree_string;
-};
-
-struct ExampleEntry {
-  std::string file_name;
-  std::string input;
-};
-
-std::vector<TestEntry> read_real_language_corpus(std::string name);
-std::vector<TestEntry> read_test_language_corpus(std::string name);
-std::vector<ExampleEntry> examples_for_language(std::string name);
-
-#endif
diff --git a/test/helpers/record_alloc.cc b/test/helpers/record_alloc.cc
deleted file mode 100644
index 2e2ea648..00000000
--- a/test/helpers/record_alloc.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-#include <map>
-#include <vector>
-#include <mutex>
-#include <cstdlib>
-
-using std::map;
-using std::vector;
-
-static bool _enabled = false;
-static size_t _allocation_count = 0;
-static map<void *, size_t> _outstanding_allocations;
-static std::mutex _outstanding_allocations_mutex;
-static bool _multi_threaded_mode = false;
-
-namespace record_alloc {
-
-void start(bool multi_threaded_mode) {
-  _enabled = true;
-  _allocation_count = 0;
-  _outstanding_allocations.clear();
-  _multi_threaded_mode = multi_threaded_mode;
-}
-
-void stop() {
-  _enabled = false;
-}
-
-vector<size_t> outstanding_allocation_indices() {
-  vector<size_t> result;
-  for (const auto &entry : _outstanding_allocations) {
-    result.push_back(entry.second);
-  }
-  return result;
-}
-
-size_t allocation_count() {
-  size_t result;
-  _outstanding_allocations_mutex.lock();
-  result = _allocation_count;
-  _outstanding_allocations_mutex.unlock();
-  return result;
-}
-
-} // namespace record_alloc
-
-extern "C" {
-
-static void *record_allocation(void *result) {
-  if (!_enabled) return result;
-  if (_multi_threaded_mode) _outstanding_allocations_mutex.lock();
-  _outstanding_allocations[result] = _allocation_count;
-  _allocation_count++;
-  if (_multi_threaded_mode) _outstanding_allocations_mutex.unlock();
-  return result;
-}
-
-static void record_deallocation(void *pointer) {
-  if (_multi_threaded_mode) _outstanding_allocations_mutex.lock();
-  auto entry = _outstanding_allocations.find(pointer);
-  if (entry != _outstanding_allocations.end()) {
-    _outstanding_allocations.erase(entry);
-  }
-  if (_multi_threaded_mode) _outstanding_allocations_mutex.unlock();
-}
-
-void *ts_record_malloc(size_t size) {
-  return record_allocation(malloc(size));
-}
-
-void *ts_record_realloc(void *pointer, size_t size) {
-  record_deallocation(pointer);
-  return record_allocation(realloc(pointer, size));
-}
-
-void *ts_record_calloc(size_t count, size_t size) {
-  return record_allocation(calloc(count, size));
-}
-
-void ts_record_free(void *pointer) {
-  record_deallocation(pointer);
-  free(pointer);
-}
-
-bool ts_record_allocations_toggle(bool value) {
-  bool previous_value = _enabled;
-  _enabled = value;
-  return previous_value;
-}
-
-}
diff --git a/test/helpers/record_alloc.h b/test/helpers/record_alloc.h
deleted file mode 100644
index f21876b4..00000000
--- a/test/helpers/record_alloc.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef HELPERS_RECORD_ALLOC_H_
-#define HELPERS_RECORD_ALLOC_H_
-
-#include <vector>
-
-namespace record_alloc {
-
-void start(bool multi_threaded_mode = false);
-void stop();
-void fail_at_allocation_index(size_t failure_index);
-std::vector<size_t> outstanding_allocation_indices();
-size_t allocation_count();
-
-} // namespace record_alloc
-
-#endif // HELPERS_RECORD_ALLOC_H_
diff --git a/test/helpers/scope_sequence.cc b/test/helpers/scope_sequence.cc
deleted file mode 100644
index 34109b76..00000000
--- a/test/helpers/scope_sequence.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-#include "./scope_sequence.h"
-
-#include "bandit/bandit.h"
-#include <sstream>
-#include "helpers/stream_methods.h"
-#include "helpers/point_helpers.h"
-
-using std::string;
-using std::cout;
-using namespace snowhouse;
-
-static void append_text_to_scope_sequence(ScopeSequence *sequence,
-                                          ScopeStack *current_scopes,
-                                          const std::string &text,
-                                          size_t length) {
-  for (size_t i = 0; i < length; i++) {
-    string character(1, text[sequence->size()]);
-    sequence->push_back(*current_scopes);
-    sequence->back().push_back("'" + character + "'");
-  }
-}
-
-static void append_to_scope_sequence(ScopeSequence *sequence,
-                                     ScopeStack *current_scopes,
-                                     TSNode node, const std::string &text) {
-  append_text_to_scope_sequence(
-    sequence, current_scopes, text, ts_node_start_byte(node) - sequence->size()
-  );
-
-  current_scopes->push_back(ts_node_type(node));
-
-  for (size_t i = 0, n = ts_node_child_count(node); i < n; i++) {
-    TSNode child = ts_node_child(node, i);
-    append_to_scope_sequence(sequence, current_scopes, child, text);
-  }
-
-  append_text_to_scope_sequence(
-    sequence, current_scopes, text, ts_node_end_byte(node) - sequence->size()
-  );
-
-  current_scopes->pop_back();
-}
-
-ScopeSequence build_scope_sequence(TSTree *tree, const std::string &text) {
-  ScopeSequence sequence;
-  ScopeStack current_scopes;
-  TSNode node = ts_tree_root_node(tree);
-  append_to_scope_sequence(&sequence, &current_scopes, node, text);
-  return sequence;
-}
-
-bool operator<=(const TSPoint &left, const TSPoint &right) {
-  if (left.row < right.row)
-    return true;
-  else if (left.row == right.row)
-    return left.column <= right.column;
-  else
-    return false;
-}
-
-void verify_changed_ranges(const ScopeSequence &old_sequence, const ScopeSequence &new_sequence,
-                           const string &text, TSRange *ranges, size_t range_count) {
-  TSPoint current_position = {0, 0};
-  for (size_t i = 0; i < old_sequence.size(); i++) {
-    if (text[i] == '\n') {
-      current_position.row++;
-      current_position.column = 0;
-      continue;
-    }
-
-    const ScopeStack &old_scopes = old_sequence[i];
-    const ScopeStack &new_scopes = new_sequence[i];
-    if (old_scopes != new_scopes) {
-      bool found_containing_range = false;
-      for (size_t j = 0; j < range_count; j++) {
-        TSRange range = ranges[j];
-        if (range.start_point <= current_position && current_position <= range.end_point) {
-          found_containing_range = true;
-          break;
-        }
-      }
-
-      if (!found_containing_range) {
-        std::stringstream message_stream;
-        message_stream << "Found changed scope outside of any invalidated range;\n";
-        message_stream << "Position: " << current_position << "\n";
-        message_stream << "Byte index: " << i << "\n";
-        size_t line_start_index = i - current_position.column;
-        size_t line_end_index = text.find_first_of('\n', i);
-        message_stream << "Line: " << text.substr(line_start_index, line_end_index - line_start_index) << "\n";
-        for (size_t j = 0; j < current_position.column + string("Line: ").size(); j++)
").size(); j++) - message_stream << " "; - message_stream << "^\n"; - message_stream << "Old scopes: " << old_scopes << "\n"; - message_stream << "New scopes: " << new_scopes << "\n"; - message_stream << "Invalidated ranges:\n"; - for (size_t j = 0; j < range_count; j++) { - message_stream << " " << ranges[j] << "\n"; - } - Assert::Failure(message_stream.str()); - } - } - - current_position.column++; - } -} diff --git a/test/helpers/scope_sequence.h b/test/helpers/scope_sequence.h deleted file mode 100644 index 2ad15117..00000000 --- a/test/helpers/scope_sequence.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef HELPERS_SCOPE_SEQUENCE_H_ -#define HELPERS_SCOPE_SEQUENCE_H_ - -#include -#include -#include "tree_sitter/runtime.h" - -typedef std::string Scope; -typedef std::vector ScopeStack; -typedef std::vector ScopeSequence; - -ScopeSequence build_scope_sequence(TSTree *tree, const std::string &text); - -void verify_changed_ranges(const ScopeSequence &old, const ScopeSequence &new_sequence, const std::string &text, TSRange *ranges, size_t range_count); - -#endif // HELPERS_SCOPE_SEQUENCE_H_ diff --git a/test/helpers/spy_input.cc b/test/helpers/spy_input.cc deleted file mode 100644 index 86fc80c5..00000000 --- a/test/helpers/spy_input.cc +++ /dev/null @@ -1,144 +0,0 @@ -#include "helpers/spy_input.h" -#include "helpers/point_helpers.h" -#include "runtime/point.h" -#include -#include -#include - -using std::pair; -using std::string; -using std::vector; - -SpyInput::SpyInput(string content, size_t chars_per_chunk) : - buffer(nullptr), - chars_per_chunk(chars_per_chunk), - content(content), - encoding(TSInputEncodingUTF8), - ranges_read({}) {} - -SpyInput::~SpyInput() { - delete[] buffer; -} - -static TSPoint operator+(TSPoint a, TSPoint b) { - if (b.row > 0) { - return TSPoint {a.row + b.row, b.column}; - } else { - return TSPoint {a.row, a.column + b.column}; - } -} - -static void add_byte_range(vector> *ranges, - uint32_t start, uint32_t count) { - uint32_t end = start + count; - for (auto &range : *ranges) { - if (range.first <= start && start <= range.second) { - if (start < range.first) range.first = start; - if (end > range.second) range.second = end; - return; - } - } - ranges->push_back({start, end}); -} - -const char *SpyInput::read(void *payload, uint32_t byte_offset, - TSPoint position, uint32_t *bytes_read) { - auto spy = static_cast(payload); - - unsigned end_byte = byte_offset + spy->chars_per_chunk; - if (end_byte > spy->content.size()) { - end_byte = spy->content.size(); - } - - *bytes_read = end_byte - byte_offset; - add_byte_range(&spy->ranges_read, byte_offset, *bytes_read); - - /* - * This class stores its entire `content` in a contiguous buffer, but we want - * to ensure that the code under test cannot accidentally read more than - * `*bytes_read` bytes past the returned pointer. To make sure that this type - * of error does not fly, we allocate a separate buffer for each request and - * return a reference to that buffer, rather than a pointer into the main - * content. The temporary buffer only fits `*bytes_read` bytes so valgrind - * can detect code reading too many bytes from the buffer. 
-   */
-  delete[] spy->buffer;
-  if (*bytes_read) {
-    spy->buffer = new char[*bytes_read]();
-    memcpy(spy->buffer, spy->content.data() + byte_offset, *bytes_read);
-  } else {
-    spy->buffer = nullptr;
-  }
-
-  return spy->buffer;
-}
-
-vector<string> SpyInput::strings_read() const {
-  vector<string> result;
-  for (auto &range : ranges_read) {
-    result.push_back(content.substr(range.first, range.second - range.first));
-  }
-  return result;
-}
-
-TSInput SpyInput::input() {
-  TSInput result;
-  result.payload = this;
-  result.encoding = encoding;
-  result.read = read;
-  return result;
-}
-
-TSInputEdit SpyInput::replace(size_t start_byte, size_t bytes_removed, string text) {
-  auto swap = swap_substr(start_byte, bytes_removed, text);
-  size_t bytes_added = text.size();
-  undo_stack.push_back(SpyInputEdit{start_byte, bytes_added, swap.first});
-  TSInputEdit result = {};
-  result.start_byte = start_byte;
-  result.old_end_byte = start_byte + bytes_removed;
-  result.new_end_byte = start_byte + bytes_added;
-  result.start_point = swap.second;
-  result.old_end_point = result.start_point + extent_for_string(swap.first);
-  result.new_end_point = result.start_point + extent_for_string(text);
-  return result;
-}
-
-bool SpyInput::can_undo() const {
-  return !undo_stack.empty();
-}
-
-TSInputEdit SpyInput::undo() {
-  SpyInputEdit entry = undo_stack.back();
-  undo_stack.pop_back();
-  auto swap = swap_substr(entry.start_byte, entry.bytes_removed, entry.text_inserted);
-  TSInputEdit result;
-  result.start_byte = entry.start_byte;
-  result.old_end_byte = entry.start_byte + entry.bytes_removed;
-  result.new_end_byte = entry.start_byte + entry.text_inserted.size();
-  result.start_point = swap.second;
-  result.old_end_point = result.start_point + extent_for_string(swap.first);
-  result.new_end_point = result.start_point + extent_for_string(entry.text_inserted);
-  return result;
-}
-
-pair<string, TSPoint> SpyInput::swap_substr(size_t start_byte, size_t bytes_removed, string text) {
-  TSPoint start_position = {0, 0};
-  for (auto i = content.begin(), n = content.begin() + start_byte; i < n; i++) {
-    if (*i == '\n') {
-      start_position.row++;
-      start_position.column = 0;
-    } else {
-      start_position.column++;
-    }
-  }
-
-  string text_removed = content.substr(start_byte, bytes_removed);
-  content.erase(start_byte, bytes_removed);
-  content.insert(start_byte, text);
-
-  return {text_removed, start_position};
-}
-
-void SpyInput::clear() {
-  ranges_read.clear();
-}
diff --git a/test/helpers/spy_input.h b/test/helpers/spy_input.h
deleted file mode 100644
index a1f67c18..00000000
--- a/test/helpers/spy_input.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef HELPERS_SPY_INPUT_H_
-#define HELPERS_SPY_INPUT_H_
-
-#include <string>
-#include <vector>
-#include "tree_sitter/runtime.h"
-
-struct SpyInputEdit {
-  size_t start_byte;
-  size_t bytes_removed;
-  std::string text_inserted;
-};
-
-class SpyInput {
-  char *buffer;
-  std::vector<SpyInputEdit> undo_stack;
-
-  static const char *read(void *, uint32_t, TSPoint, uint32_t *);
-  std::pair<std::string, TSPoint> swap_substr(size_t, size_t, std::string);
-
- public:
-  SpyInput(std::string content, size_t chars_per_chunk);
-  ~SpyInput();
-
-  TSInput input();
-  void clear();
-  TSInputEdit replace(size_t start_char, size_t chars_removed, std::string text);
-  bool can_undo() const;
-  TSInputEdit undo();
-  std::vector<std::string> strings_read() const;
-
-  uint32_t chars_per_chunk;
-  std::string content;
-  TSInputEncoding encoding;
-  std::vector<std::pair<uint32_t, uint32_t>> ranges_read;
-};
-
-#endif // HELPERS_SPY_INPUT_H_
diff --git a/test/helpers/spy_logger.cc b/test/helpers/spy_logger.cc
deleted file mode 100644
index 7e3a92a1..00000000
--- a/test/helpers/spy_logger.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "helpers/spy_logger.h"
-#include <string>
-#include <vector>
-
-using std::string;
-using std::vector;
-
-static void spy_log(void *data, TSLogType type, const char *msg) {
-  SpyLogger *logger = static_cast<SpyLogger *>(data);
-  logger->messages.push_back(msg);
-}
-
-TSLogger SpyLogger::logger() {
-  TSLogger result;
-  result.payload = (void *)this;
-  result.log = spy_log;
-  return result;
-}
-
-void SpyLogger::clear() {
-  messages.clear();
-}
diff --git a/test/helpers/spy_logger.h b/test/helpers/spy_logger.h
deleted file mode 100644
index 9f98fcb8..00000000
--- a/test/helpers/spy_logger.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef HELPERS_SPY_LOGGER_H_
-#define HELPERS_SPY_LOGGER_H_
-
-#include <string>
-#include <vector>
-#include "tree_sitter/runtime.h"
-
-class SpyLogger {
- public:
-  void clear();
-  TSLogger logger();
-  std::vector<std::string> messages;
-};
-
-#endif // HELPERS_SPY_LOGGER_H_
diff --git a/test/helpers/stderr_logger.cc b/test/helpers/stderr_logger.cc
deleted file mode 100644
index a6ebbe61..00000000
--- a/test/helpers/stderr_logger.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "tree_sitter/runtime.h"
-#include <stdio.h>
-
-static void log(void *payload, TSLogType type, const char *msg) {
-  bool include_lexing = (bool)payload;
-  switch (type) {
-    case TSLogTypeParse:
-      fprintf(stderr, "* %s\n", msg);
-      break;
-    case TSLogTypeLex:
-      if (include_lexing)
-        fprintf(stderr, "  %s\n", msg);
-      break;
-  }
-}
-
-TSLogger stderr_logger_new(bool include_lexing) {
-  TSLogger result;
-  result.payload = (void *)include_lexing;
-  result.log = log;
-  return result;
-}
diff --git a/test/helpers/stderr_logger.h b/test/helpers/stderr_logger.h
deleted file mode 100644
index 9c88b21d..00000000
--- a/test/helpers/stderr_logger.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef HELPERS_STDERR_LOGGER_H_
-#define HELPERS_STDERR_LOGGER_H_
-
-#include "tree_sitter/runtime.h"
-
-TSLogger stderr_logger_new(bool include_lexing);
-
-#endif // HELPERS_STDERR_LOGGER_H_
diff --git a/test/helpers/stream_methods.cc b/test/helpers/stream_methods.cc
deleted file mode 100644
index 3e5ed020..00000000
--- a/test/helpers/stream_methods.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-#include "helpers/stream_methods.h"
-#include "test_helper.h"
-#include "tree_sitter/compiler.h"
-#include "compiler/util/string_helpers.h"
-#include "compiler/parse_table.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/build_tables/parse_item.h"
-#include "compiler/build_tables/lex_item.h"
-#include "helpers/point_helpers.h"
-
-ostream &operator<<(ostream &stream, const TSInputEdit &edit) {
-  return stream << "{TSInputEdit start_byte: " << edit.start_byte <<
-    ", old_end_byte: " << edit.old_end_byte <<
-    ", new_end_byte: " << edit.new_end_byte <<
-    ", start_point: " << edit.start_point <<
-    ", old_end_point: " << edit.old_end_point <<
-    ", new_end_point: " << edit.new_end_point << "}";
-}
-
-namespace tree_sitter {
-
-ostream &operator<<(ostream &stream, const InputGrammar &grammar) {
-  return stream << "(InputGrammar variables: " << grammar.variables << ")";
-}
-
-ostream &operator<<(ostream &stream, const CompileError &error) {
-  if (error.type) {
-    return stream << "(CompileError " << error.message << ")";
-  } else {
-    return stream << "(No CompileError)";
-  }
-}
-
-namespace rules {
-
-ostream &operator<<(ostream &stream, Associativity associativity) {
-  switch (associativity) {
-    case AssociativityLeft:
-      return stream << "AssociativityLeft";
-    case AssociativityRight:
-      return stream << "AssociativityRight";
-    default:
-      return stream << "AssociativityNone";
-  }
-}
-
-ostream &operator<<(ostream &stream, const Blank &) {
-  return stream << "(Blank)";
-}
-
-ostream &operator<<(ostream &stream, const CharacterRange &range) {
-  if (range.min == range.max) {
-    return stream << util::escape_char(range.min);
-  } else {
-    return stream << "(" + util::escape_char(range.min) << "-" << util::escape_char(range.max) << ")";
-  }
-}
-
-ostream &operator<<(ostream &stream, const CharacterSet &rule) {
-  stream << "(CharacterSet";
-  if (rule.includes_all) {
-    if (rule.excluded_chars.empty()) {
-      stream << " all";
-    } else {
-      stream << " exclude";
-      for (const auto &range : rule.excluded_ranges()) {
-        stream << " " << range;
-      }
-    }
-  } else {
-    for (const auto &range : rule.included_ranges()) {
-      stream << " " << range;
-    }
-  }
-  return stream << ")";
-}
-
-ostream &operator<<(ostream &stream, const Symbol &rule) {
-  stream << "(Symbol ";
-  switch (rule.type) {
-    case Symbol::External:
-      stream << "external";
-      break;
-    case Symbol::Terminal:
-      stream << "terminal";
-      break;
-    case Symbol::NonTerminal:
-      stream << "non-terminal";
-      break;
-  }
-  return stream << " " << rule.index << ")";
-}
-
-ostream &operator<<(ostream &stream, const NamedSymbol &rule) {
-  return stream << "(NamedSymbol " << rule.value << ")";
-}
-
-ostream &operator<<(ostream &stream, const String &rule) {
-  return stream << "(String " << rule.value << ")";
-}
-
-ostream &operator<<(ostream &stream, const Pattern &rule) {
-  return stream << "(Pattern " << rule.value << ")";
-}
-
-ostream &operator<<(ostream &stream, const Choice &rule) {
-  stream << "(Choice";
-  for (const auto &element : rule.elements) {
-    stream << " " << element;
-  }
-  return stream << ")";
-}
-
-ostream &operator<<(ostream &stream, const Seq &rule) {
-  return stream << "(Seq " << *rule.left << " " << *rule.right << ")";
-}
-
-ostream &operator<<(ostream &stream, const Repeat &rule) {
-  return stream << "(Repeat " << *rule.rule << ")";
-}
-
-ostream &operator<<(ostream &stream, const Metadata &rule) {
-  stream << "(Metadata";
-  if (rule.params.has_precedence) stream << " prec=" << to_string(rule.params.precedence);
-  if (rule.params.has_associativity) stream << " assoc=" << rule.params.associativity;
-  if (rule.params.is_token) stream << " token";
-  if (rule.params.is_main_token) stream << " main";
-  return stream << " " << *rule.rule << ")";
-}
-
-ostream &operator<<(ostream &stream, const Rule &rule) {
-  rule.match(
-    [&stream](Blank r) { stream << r; },
-    [&stream](NamedSymbol r) { stream << r; },
-    [&stream](Symbol r) { stream << r; },
-    [&stream](String r) { stream << r; },
-    [&stream](Pattern r) { stream << r; },
-    [&stream](CharacterSet r) { stream << r; },
-    [&stream](Choice r) { stream << r; },
-    [&stream](Seq r) { stream << r; },
-    [&stream](Repeat r) { stream << r; },
-    [&stream](Metadata r) { stream << r; }
-  );
-  return stream;
-}
-
-} // namespace rules
-
-ostream &operator<<(ostream &stream, const Variable &variable) {
-  return stream << "(Variable " << variable.name << " " << variable.rule << ")";
-}
-
-ostream &operator<<(ostream &stream, const Production &production) {
-  return stream << "(Production " << production.steps << " " <<
-    to_string(production.dynamic_precedence) << ")";
-}
-
-ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
-  return stream << "(Variable " << variable.name << " " << variable.productions <<
-    " " << to_string(variable.type) << ")";
-}
-
-ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
-  return stream << "(Variable " << variable.name << " " << to_string(variable.type) <<
-    " " << variable.rule << ")";
-}
-
-ostream &operator<<(ostream &stream, const ExternalToken &external_token) {
-  return stream << "(ExternalToken " << external_token.name << " " <<
-    external_token.type << " " << external_token.corresponding_internal_token << ")";
-}
-
-ostream &operator<<(ostream &stream, const ProductionStep &step) {
-  return stream << "(ProductionStep " << step.symbol << " precedence:" <<
-    to_string(step.precedence) << " associativity:" << step.associativity << ")";
-}
-
-ostream &operator<<(ostream &stream, const PrecedenceRange &range) {
-  if (range.empty) {
-    return stream << "(PrecedenceRange)";
-  } else {
-    return stream << "(PrecedenceRange " << to_string(range.min) << " " <<
-      to_string(range.max) << ")";
-  }
-}
-
-namespace build_tables {
-
-ostream &operator<<(ostream &stream, const LexItem &item) {
-  return stream << "(LexItem " << item.lhs << " " << item.rule << ")";
-}
-
-ostream &operator<<(ostream &stream, const LexItemSet &item_set) {
-  return stream << item_set.entries;
-}
-
-ostream &operator<<(ostream &stream, const ParseItem &item) {
-  stream << "(ParseItem " << item.lhs() << " ->";
-  for (size_t i = 0; i < item.production->size(); i++) {
-    if (i == item.step_index) {
-      stream << " •";
-    }
-    stream << " " << item.production->at(i).symbol << " " << item.production->at(i).precedence <<
-      " " << (int)item.production->at(i).associativity;
-  }
-
-  if (item.step_index == item.production->size()) {
-    stream << " • ";
-  }
-
-  return stream << ")";
-}
-
-ostream &operator<<(ostream &stream, const ParseItemSet &item_set) {
-  return stream << item_set.entries;
-}
-
-ostream &operator<<(ostream &stream, const LookaheadSet &lookaheads) {
-  stream << "(LookaheadSet";
-  lookaheads.for_each([&stream](Symbol symbol) {
-    stream << " " << symbol;
-    return true;
-  });
-  return stream << ")";
-}
-
-ostream &operator<<(ostream &stream, const LexItemSet::Transition &transition) {
-  return stream << "(Transition " << transition.destination << " prec:" << transition.precedence << ")";
-}
-
-} // namespace build_tables
-
-} // namespace tree_sitter
diff --git a/test/helpers/stream_methods.h b/test/helpers/stream_methods.h
deleted file mode 100644
index 66c86d52..00000000
--- a/test/helpers/stream_methods.h
+++ /dev/null
@@ -1,156 +0,0 @@
-#ifndef HELPERS_STREAM_METHODS_H_
-#define HELPERS_STREAM_METHODS_H_
-
-#include <iostream>
-#include <map>
-#include <set>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "tree_sitter/runtime.h"
-#include "compiler/grammar.h"
-#include "compiler/prepare_grammar/interned_grammar.h"
-#include "compiler/prepare_grammar/initial_syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/rule.h"
-#include "compiler/compile_error.h"
-#include "compiler/build_tables/lex_item.h"
-
-using std::cout;
-
-namespace std {
-
-template <typename T>
-inline std::ostream& operator<<(std::ostream &stream, const std::vector<T> &vector) {
-  stream << std::string("(vector: ");
-  bool started = false;
-  for (auto item : vector) {
-    if (started) stream << std::string(", ");
-    stream << item;
-    started = true;
-  }
-  return stream << ")";
-}
-
-template <typename T>
-inline std::ostream& operator<<(std::ostream &stream, const std::set<T> &set) {
-  stream << std::string("(set: ");
-  bool started = false;
-  for (auto item : set) {
-    if (started) stream << std::string(", ");
-    stream << item;
-    started = true;
-  }
-  return stream << ")";
-}
-
-template <typename T>
-inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set<T> &set) {
-  stream << std::string("(set: ");
-  bool started = false;
-  for (auto item : set) {
-    if (started) stream << std::string(", ");
-    stream << item;
-    started = true;
-  }
-  return stream << ")";
-}
-
-template <typename TKey, typename TValue>
-inline std::ostream& operator<<(std::ostream &stream, const std::map<TKey, TValue> &map) {
-  stream << std::string("(map: ");
-  bool started = false;
-  for (auto pair : map) {
-    if (started) stream << std::string(", ");
-    stream << pair.first;
-    stream << std::string(" => ");
-    stream << pair.second;
-    started = true;
-  }
-  return stream << ")";
-}
-
-template <typename TKey, typename TValue>
-inline std::ostream& operator<<(std::ostream &stream, const std::unordered_map<TKey, TValue> &map) {
-  stream << std::string("(map: ");
-  bool started = false;
-  for (auto pair : map) {
-    if (started) stream << std::string(", ");
-    stream << pair.first;
-    stream << std::string(" => ");
-    stream << pair.second;
-    started = true;
-  }
-  return stream << ")";
-}
-
-template <typename T1, typename T2>
-inline std::ostream& operator<<(std::ostream &stream, const std::pair<T1, T2> &pair) {
-  return stream << "{" << pair.first << ", " << pair.second << "}";
-}
-
-} // namespace std
-
-std::ostream &operator<<(std::ostream &, const TSInputEdit &);
-
-namespace tree_sitter {
-
-using std::ostream;
-using std::string;
-using std::to_string;
-
-struct InputGrammar;
-struct AdvanceAction;
-struct AcceptTokenAction;
-struct ParseAction;
-struct ParseState;
-struct ExternalToken;
-struct ProductionStep;
-struct PrecedenceRange;
-
-ostream &operator<<(ostream &, const InputGrammar &);
-ostream &operator<<(ostream &, const CompileError &);
-ostream &operator<<(ostream &, const ExternalToken &);
-ostream &operator<<(ostream &, const ProductionStep &);
-ostream &operator<<(ostream &, const Production &);
-ostream &operator<<(ostream &, const PrecedenceRange &);
-ostream &operator<<(ostream &, const Variable &);
-ostream &operator<<(ostream &, const LexicalVariable &);
-
-namespace rules {
-
-ostream &operator<<(ostream &, const Blank &);
-ostream &operator<<(ostream &, const CharacterRange &);
-ostream &operator<<(ostream &, const CharacterSet &);
-ostream &operator<<(ostream &, const Symbol &);
-ostream &operator<<(ostream &, const NamedSymbol &);
-ostream &operator<<(ostream &, const String &);
-ostream &operator<<(ostream &, const Pattern &);
-ostream &operator<<(ostream &stream, const Choice &rule);
-ostream &operator<<(ostream &stream, const Seq &rule);
-ostream &operator<<(ostream &stream, const Repeat &rule);
-ostream &operator<<(ostream &stream, const Metadata &rule);
-ostream &operator<<(ostream &stream, const Rule &rule);
-
-} // namespace rules
-
-namespace build_tables {
-
-class LexItem;
-class LexItemSet;
-struct ParseItem;
-struct ParseItemSet;
-class LookaheadSet;
-
-ostream &operator<<(ostream &, const LexItem &);
-ostream &operator<<(ostream &, const LexItemSet &);
-ostream &operator<<(ostream &, const LexItemSet::Transition &);
-ostream &operator<<(ostream &, const ParseItem &);
-ostream &operator<<(ostream &, const ParseItemSet &);
-ostream &operator<<(ostream &, const LookaheadSet &);
-
-} // namespace build_tables
-} // namespace tree_sitter
-
-#endif // HELPERS_STREAM_METHODS_H_
diff --git a/test/helpers/tree_helpers.cc b/test/helpers/tree_helpers.cc
deleted file mode 100644
index cf4341bc..00000000
--- a/test/helpers/tree_helpers.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-#include "test_helper.h"
-#include "helpers/tree_helpers.h"
-#include "helpers/point_helpers.h"
-#include <cstring>
-#include <cstdlib>
-
-using std::string;
-using std::vector;
-using std::to_string;
-using std::ostream;
-
-const char *symbol_names[24] = {
-  "ERROR", "END", "two", "three", "four", "five", "six", "seven", "eight",
-  "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
-  "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "twenty-one",
-  "twenty-two", "twenty-three"
-};
-
-SubtreeArray *tree_array(vector<Subtree> trees) {
-  static SubtreeArray result;
-  result.capacity = trees.size();
-  result.size = trees.size();
-  result.contents = (Subtree *)calloc(trees.size(), sizeof(Subtree));
-  for (size_t i = 0; i < trees.size(); i++) {
-    result.contents[i] = trees[i];
-  }
-  return &result;
-}
-
-ostream &operator<<(std::ostream &stream, Subtree tree) {
-  static TSLanguage DUMMY_LANGUAGE = {};
-  DUMMY_LANGUAGE.symbol_names = symbol_names;
-  char *string = ts_subtree_string(tree, &DUMMY_LANGUAGE, false);
-  stream << string;
-  ts_free(string);
-  return stream;
-}
-
-ostream &operator<<(ostream &stream, const TSNode &node) {
-  if (ts_node_is_null(node)) {
-    return stream << "NULL";
-  } else {
-    char *string = ts_node_string(node);
-    stream << "{" << string << ", " << to_string(ts_node_start_byte(node)) << "}";
-    ts_free(string);
-    return stream;
-  }
-}
-
-bool operator==(const TSNode &left, const TSNode &right) {
-  return
-    left.id == right.id &&
-    ts_node_start_byte(left) == ts_node_start_byte(right) &&
-    ts_node_start_point(left) == ts_node_start_point(right);
-}
-
-bool operator==(const vector<Subtree> &vec, const SubtreeArray &array) {
-  return
-    vec.size() == array.size &&
-    std::memcmp(vec.data(), array.contents, array.size * sizeof(Subtree)) == 0;
-}
-
-void assert_consistent_tree_sizes(TSNode node, const vector<uint32_t> &line_starts) {
-  uint32_t child_count = ts_node_child_count(node);
-  uint32_t named_child_count = ts_node_named_child_count(node);
-  uint32_t start_byte = ts_node_start_byte(node);
-  uint32_t end_byte = ts_node_end_byte(node);
-  TSPoint start_point = ts_node_start_point(node);
-  TSPoint end_point = ts_node_end_point(node);
-
-  AssertThat(start_byte, !IsGreaterThan(end_byte));
-  AssertThat(start_point, !IsGreaterThan(end_point));
-
-  AssertThat(start_byte, Equals(line_starts[start_point.row] + start_point.column));
-  AssertThat(end_byte, Equals(line_starts[end_point.row] + end_point.column));
-
-  size_t last_child_end_byte = start_byte;
-  TSPoint last_child_end_point = start_point;
-
-  bool some_child_has_changes = false;
-  size_t actual_named_child_count = 0;
-  for (size_t i = 0; i < child_count; i++) {
-    TSNode child = ts_node_child(node, i);
-    uint32_t child_start_byte = ts_node_start_byte(child);
-    TSPoint child_start_point = ts_node_start_point(child);
-
-    AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
-    AssertThat(child_start_point, !IsLessThan(last_child_end_point));
-    assert_consistent_tree_sizes(child, line_starts);
-
-    if (ts_node_has_changes(child)) some_child_has_changes = true;
-    if (ts_node_is_named(child)) actual_named_child_count++;
-
-    last_child_end_byte = ts_node_end_byte(child);
-    last_child_end_point = ts_node_end_point(child);
-  }
-
-  AssertThat(actual_named_child_count, Equals(named_child_count));
-
-  if (child_count > 0) {
-    AssertThat(end_byte, !IsLessThan(last_child_end_byte));
-    AssertThat(end_point, !IsLessThan(last_child_end_point));
-  }
-
-  if (some_child_has_changes) {
-    AssertThat(ts_node_has_changes(node), IsTrue());
-  }
-}
-
-void assert_consistent_tree_sizes(const TSTree *tree, const string &text) {
-  vector<uint32_t> line_starts;
-  line_starts.push_back(0);
-  for (uint32_t i = 0, n = text.size(); i < n; i++) {
(uint32_t i = 0, n = text.size(); i < n; i++) { - if (text[i] == '\n') { - line_starts.push_back(i + 1); - } - } - - TSNode root_node = ts_tree_root_node(tree); - AssertThat(ts_node_end_byte(root_node), Equals(text.size())); - assert_consistent_tree_sizes(root_node, line_starts); -} - -string to_string(const TSTree *tree) { - const char *c_string = ts_node_string(ts_tree_root_node(tree)); - string result(c_string); - ts_free((void *)c_string); - return result; -} diff --git a/test/helpers/tree_helpers.h b/test/helpers/tree_helpers.h deleted file mode 100644 index 21b1d7f2..00000000 --- a/test/helpers/tree_helpers.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef HELPERS_TREE_HELPERS_H_ -#define HELPERS_TREE_HELPERS_H_ - -#include "runtime/subtree.h" -#include -#include - -extern const char *symbol_names[24]; -SubtreeArray *tree_array(std::vector trees); - -std::ostream &operator<<(std::ostream &stream, Subtree tree); -std::ostream &operator<<(std::ostream &stream, const TSNode &node); -bool operator==(const TSNode &left, const TSNode &right); -bool operator==(const std::vector &right, const SubtreeArray &array); -std::string to_string(const TSTree *); - -void assert_consistent_tree_sizes(const TSTree *, const std::string &); - -#endif // HELPERS_TREE_HELPERS_H_ diff --git a/test/integration/fuzzing-examples.cc b/test/integration/fuzzing-examples.cc deleted file mode 100644 index 555b4274..00000000 --- a/test/integration/fuzzing-examples.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include "test_helper.h" -#include "base64.c" -#include "helpers/load_language.h" -#include "helpers/tree_helpers.h" -#include "helpers/record_alloc.h" - -START_TEST - -vector> examples({ - { - "javascript", - "Bi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXGK0i0vLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXGK0i0vLS0tLS0tLS0tLS0tLS0xLS0tLTYtLfpZAA==" - }, - { - "python", - "NWNvbogsKTMsLCwsY29uiCwqLDo1Y29uLA==" - }, -}); - -describe("examples found via fuzzing", [&]() { - before_each([&]() { - record_alloc::start(); - }); - - after_each([&]() { - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - for (unsigned i = 0, n = examples.size(); i < n; i++) { - - it(("parses example number " + to_string(i)).c_str(), [&]() { - TSParser *parser = ts_parser_new(); - - if (getenv("TREE_SITTER_ENABLE_DEBUG_GRAPHS")) { - ts_parser_print_dot_graphs(parser, stderr); - } - - const string &language_name = examples[i].first; - ts_parser_set_language(parser, load_real_language(language_name)); - - string input; - const string &base64_input = examples[i].second; - input.resize(base64_input.size()); - input.resize(base64_decode( - reinterpret_cast(base64_input.c_str()), - reinterpret_cast(&input[0]), - base64_input.size() - )); - - TSTree *tree = ts_parser_parse_string(parser, nullptr, input.c_str(), input.size()); - assert_consistent_tree_sizes(tree, input); - - ts_tree_delete(tree); - ts_parser_delete(parser); - }); - - } -}); - -END_TEST diff --git a/test/integration/real_grammars.cc b/test/integration/real_grammars.cc deleted file mode 100644 index 8d0807ef..00000000 --- a/test/integration/real_grammars.cc +++ /dev/null @@ -1,178 +0,0 @@ -#include "test_helper.h" -#include "runtime/alloc.h" -#include "helpers/load_language.h" -#include "helpers/read_test_entries.h" -#include "helpers/spy_input.h" -#include "helpers/stderr_logger.h" -#include "helpers/point_helpers.h" -#include "helpers/record_alloc.h" -#include "helpers/random_helpers.h" -#include "helpers/scope_sequence.h" -#include "helpers/tree_helpers.h" -#include - -TSInputEdit 
do_random_edit(Generator &random, SpyInput &input, TSTree *tree) { - size_t choice = random(10); - - if (choice < 2) { - // Insert text at end - string inserted_text = random.words(1); - return input.replace(input.content.size(), 0, inserted_text); - } else if (choice < 5) { - // Delete text from end - size_t deletion_size = random(10); - if (deletion_size > input.content.size()) deletion_size = input.content.size(); - return input.replace(input.content.size() - deletion_size, deletion_size, ""); - } else if (choice < 8) { - // Insert at random position - size_t position = random(input.content.size() + 1); - string inserted_text = random.words(1 + random(3)); - return input.replace(position, 0, inserted_text); - } else { - // Replace at random position - size_t position = random(input.content.size() + 1); - size_t deletion_size = random(input.content.size() + 1 - position); - string inserted_text = random.words(1 + random(4)); - return input.replace(position, deletion_size, inserted_text); - } -} - -START_TEST; - -if (TREE_SITTER_SEED == -1) return; - -vector test_languages({ - "embedded-template", - "javascript", - "json", - "html", - "c", - "cpp", - "python", - "bash", -}); - -for (auto &language_name : test_languages) { - describe("the " + language_name + " language", [&]() { - TSParser *parser; - const bool debug_graphs_enabled = getenv("TREE_SITTER_ENABLE_DEBUG_GRAPHS"); - Generator random(0); - - before_each([&]() { - record_alloc::start(); - parser = ts_parser_new(); - ts_parser_set_language(parser, load_real_language(language_name)); - - // ts_parser_set_logger(parser, stderr_logger_new(true)); - if (debug_graphs_enabled) { - ts_parser_print_dot_graphs(parser, stderr); - } - }); - - after_each([&]() { - ts_parser_delete(parser); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - for (auto &entry : read_real_language_corpus(language_name)) { - it("parses " + entry.description + ": initial parse", [&]() { - SpyInput input(entry.input, 4); - if (debug_graphs_enabled) printf("%s\n\n", input.content.c_str()); - - TSTree *tree = ts_parser_parse(parser, nullptr, input.input()); - assert_consistent_tree_sizes(tree, input.content); - - TSNode root_node = ts_tree_root_node(tree); - const char *node_string = ts_node_string(root_node); - string result(node_string); - ts_free((void *)node_string); - AssertThat(result, Equals(entry.tree_string)); - - ts_tree_delete(tree); - }); - - for (unsigned i = 0; i < 20; i++) { - unsigned int seed = TREE_SITTER_SEED + i; - - it("parses " + entry.description + ": " + "edit sequence " + to_string(seed), [&]() { - random.reseed(seed); - SpyInput input(entry.input, 3); - unsigned edit_count = 1 + random(4); - - // Parse the input from the corpus. - if (debug_graphs_enabled) printf("\n%s\n", input.content.c_str()); - TSTree *tree = ts_parser_parse(parser, nullptr, input.input()); - - // Perform a random series of edits. - for (unsigned j = 0; j < edit_count; j++) { - TSInputEdit edit = do_random_edit(random, input, tree); - ts_tree_edit(tree, &edit); - if (debug_graphs_enabled) { - ts_tree_print_dot_graph(tree, stderr); - printf( - "edit: %u - %u, %u - %u\n%s\n", - edit.start_byte, edit.old_end_byte, - edit.start_byte, edit.new_end_byte, - input.content.c_str() - ); - } - } - - // Reparse the edited code incrementally. - TSTree *new_tree = ts_parser_parse(parser, tree, input.input()); - assert_consistent_tree_sizes(new_tree, input.content); - - // Verify that the correct ranges have been marked as changed. 
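// Editor's sketch (not part of the deleted file): the test above drives tree-sitter's
// incremental-parsing loop: parse, describe an edit with ts_tree_edit, reparse passing
// the old tree, then diff the two trees with ts_tree_get_changed_ranges. A minimal
// standalone version of that flow, assuming a grammar entry point `tree_sitter_json()`
// and the pre-rewrite public header name; the concrete edit offsets are illustrative only.
// The deleted test continues below.

#include <tree_sitter/runtime.h>
#include <cstring>

extern "C" const TSLanguage *tree_sitter_json();  // assumed grammar entry point

void reparse_and_diff(const char *old_source, const char *new_source) {
  TSParser *parser = ts_parser_new();
  ts_parser_set_language(parser, tree_sitter_json());

  TSTree *old_tree = ts_parser_parse_string(parser, nullptr, old_source, strlen(old_source));

  // Suppose 5 bytes were inserted at the very start of the document. Field order
  // matches the TSInputEdit literals used in these tests: start_byte, old_end_byte,
  // new_end_byte, start_point, old_end_point, new_end_point.
  TSInputEdit edit = {0, 0, 5, {0, 0}, {0, 0}, {0, 5}};
  ts_tree_edit(old_tree, &edit);

  // Reparse incrementally; unchanged subtrees are reused from `old_tree`.
  TSTree *new_tree = ts_parser_parse_string(parser, old_tree, new_source, strlen(new_source));

  // Ask which ranges of the document actually changed between the two trees.
  uint32_t range_count;
  TSRange *ranges = ts_tree_get_changed_ranges(old_tree, new_tree, &range_count);
  ts_free(ranges);

  ts_tree_delete(old_tree);
  ts_tree_delete(new_tree);
  ts_parser_delete(parser);
}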
- uint32_t range_count; - TSRange *ranges = ts_tree_get_changed_ranges(tree, new_tree, &range_count); - ScopeSequence old_scope_sequence = build_scope_sequence(tree, input.content); - ScopeSequence new_scope_sequence = build_scope_sequence(new_tree, input.content); - verify_changed_ranges( - old_scope_sequence, new_scope_sequence, - input.content, ranges, range_count - ); - ts_free(ranges); - ts_tree_delete(tree); - tree = new_tree; - - // Undo the random edits. - while (input.can_undo()) { - TSInputEdit edit = input.undo(); - ts_tree_edit(new_tree, &edit); - if (debug_graphs_enabled) { - ts_tree_print_dot_graph(tree, stderr); - printf( - "edit: %u - %u, %u - %u\n%s\n", - edit.start_byte, edit.old_end_byte, - edit.start_byte, edit.new_end_byte, - input.content.c_str() - ); - } - } - - // Reparse the restored code incrementally. - new_tree = ts_parser_parse(parser, tree, input.input()); - assert_consistent_tree_sizes(new_tree, input.content); - - // Verify that the correct ranges have been marked as changed. - ranges = ts_tree_get_changed_ranges(tree, new_tree, &range_count); - old_scope_sequence = build_scope_sequence(tree, input.content); - new_scope_sequence = build_scope_sequence(new_tree, input.content); - verify_changed_ranges( - old_scope_sequence, new_scope_sequence, - input.content, ranges, range_count - ); - ts_free(ranges); - ts_tree_delete(tree); - tree = new_tree; - - // Verify that the final tree matches the expectation from the corpus. - AssertThat(to_string(tree), Equals(entry.tree_string)); - ts_tree_delete(tree); - }); - } - } - }); -} - -END_TEST; diff --git a/test/integration/test_grammars.cc b/test/integration/test_grammars.cc deleted file mode 100644 index 7d3b6972..00000000 --- a/test/integration/test_grammars.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "test_helper.h" -#include "helpers/read_test_entries.h" -#include "helpers/load_language.h" -#include "helpers/stderr_logger.h" -#include "helpers/file_helpers.h" -#include "helpers/tree_helpers.h" -#include "runtime/alloc.h" -#include "helpers/record_alloc.h" - -START_TEST - -string grammars_dir_path = join_path({"test", "fixtures", "test_grammars"}); -vector test_languages = list_directory(grammars_dir_path); - -for (auto &language_name : test_languages) { - if (language_name == "readme.md") continue; - - describe(("test grammar: " + language_name).c_str(), [&]() { - string directory_path = join_path({grammars_dir_path, language_name}); - string grammar_path = join_path({directory_path, "grammar.json"}); - string expected_error_path = join_path({directory_path, "expected_error.txt"}); - - string grammar_json = read_file(grammar_path); - const TSLanguage *language = nullptr; - - if (file_exists(expected_error_path)) { - it("fails with the correct error message", [&]() { - TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr); - string expected_error = read_file(expected_error_path); - AssertThat((void *)compile_result.error_message, !Equals(nullptr)); - AssertThat(compile_result.error_message, Equals(expected_error)); - }); - return; - } - - for (auto &entry : read_test_language_corpus(language_name)) { - it(("parses " + entry.description).c_str(), [&]() { - record_alloc::start(); - - if (!language) { - string external_scanner_path = join_path({directory_path, "scanner.c"}); - if (!file_exists(external_scanner_path)) external_scanner_path = ""; - - TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr); - - language = load_test_language( - language_name, - 
compile_result, - external_scanner_path - ); - } - - TSParser *parser = ts_parser_new(); - ts_parser_set_language(parser, language); - - if (getenv("TREE_SITTER_ENABLE_DEBUG_GRAPHS")) { - ts_parser_print_dot_graphs(parser, stderr); - } - - TSTree *tree = ts_parser_parse_string(parser, nullptr, entry.input.c_str(), entry.input.size()); - assert_consistent_tree_sizes(tree, entry.input); - - TSNode root_node = ts_tree_root_node(tree); - const char *node_string = ts_node_string(root_node); - string result(node_string); - ts_free((void *)node_string); - AssertThat(result, Equals(entry.tree_string)); - - ts_tree_delete(tree); - ts_parser_delete(parser); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - } - }); -} - -END_TEST diff --git a/test/runtime/language_test.cc b/test/runtime/language_test.cc deleted file mode 100644 index 30861ee7..00000000 --- a/test/runtime/language_test.cc +++ /dev/null @@ -1,85 +0,0 @@ -#include "test_helper.h" -#include "runtime/alloc.h" -#include "helpers/load_language.h" - -START_TEST - -describe("Language", []() { - describe("symbol_name(TSSymbol symbol)", [&]() { - it("returns the correct name for aliased nodes", [&]() { - TSCompileResult compile_result = ts_compile_grammar(R"JSON({ - "name": "aliased_rules", - - "rules": { - "a": { - "type": "ALIAS", - "value": "c", - "named": true, - "content": { - "type": "SYMBOL", - "name": "b" - } - }, - - "b": { - "type": "STRING", - "value": "b" - } - } - })JSON", nullptr); - - TSParser *parser = ts_parser_new(); - const TSLanguage *language = load_test_language("aliased_rules", compile_result); - ts_parser_set_language(parser, language); - TSTree *tree = ts_parser_parse_string(parser, nullptr, "b", 1); - - TSNode root_node = ts_tree_root_node(tree); - char *string = ts_node_string(root_node); - AssertThat(string, Equals("(a (c))")); - - TSNode aliased_node = ts_node_child(root_node, 0); - AssertThat(ts_node_type(aliased_node), Equals("c")); - - TSSymbol aliased_symbol = ts_node_symbol(aliased_node); - AssertThat(ts_language_symbol_count(language), IsGreaterThan(aliased_symbol)); - AssertThat(ts_language_symbol_name(language, aliased_symbol), Equals("c")); - AssertThat(ts_language_symbol_type(language, aliased_symbol), Equals(TSSymbolTypeRegular)); - - ts_free(string); - ts_parser_delete(parser); - ts_tree_delete(tree); - }); - }); - - describe("symbol_for_name(name)", [&]() { - it("returns the symbol for the given name", [&]() { - const TSLanguage *language = load_real_language("javascript"); - AssertThat( - ts_language_symbol_name( - language, - ts_language_symbol_for_name(language, "if_statement") - ), - Equals("if_statement") - ); - AssertThat( - ts_language_symbol_name( - language, - ts_language_symbol_for_name(language, "ERROR") - ), - Equals("ERROR") - ); - AssertThat(ts_language_symbol_for_name(language, "non_existent_symbol"), Equals(0u)); - - TSSymbol last = ts_language_symbol_count(language)-1; - AssertThat( - ts_language_symbol_for_name( - language, - ts_language_symbol_name(language, last) - ), - Equals(last) - ); - }); - }); -}); - -END_TEST diff --git a/test/runtime/node_test.cc b/test/runtime/node_test.cc deleted file mode 100644 index 0deb311c..00000000 --- a/test/runtime/node_test.cc +++ /dev/null @@ -1,962 +0,0 @@ -#include "test_helper.h" -#include "runtime/alloc.h" -#include "helpers/tree_helpers.h" -#include "helpers/point_helpers.h" -#include "helpers/load_language.h" -#include "helpers/record_alloc.h" -#include "helpers/stream_methods.h" -#include 
"helpers/random_helpers.h" -#include "helpers/spy_input.h" - -START_TEST - -string json_string = R"JSON( - -[ - 123, - false, - { - "x": null - } -] -)JSON"; - -size_t array_index = json_string.find("[\n"); -size_t array_end_index = json_string.find("]") + 1; -size_t number_index = json_string.find("123"); -size_t number_end_index = number_index + string("123").size(); -size_t false_index = json_string.find("false"); -size_t false_end_index = false_index + string("false").size(); -size_t object_index = json_string.find("{"); -size_t object_end_index = json_string.find("}") + 1; -size_t string_index = json_string.find("\"x\""); -size_t string_end_index = string_index + 3; -size_t colon_index = json_string.find(":"); -size_t null_index = json_string.find("null"); -size_t null_end_index = null_index + string("null").size(); - -string grammar_with_aliases_and_extras = R"JSON({ - "name": "aliases_and_extras", - - "extras": [ - {"type": "PATTERN", "value": "\\s+"}, - {"type": "SYMBOL", "name": "comment"}, - ], - - "rules": { - "a": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "b"}, - { - "type": "ALIAS", - "value": "B", - "named": true, - "content": {"type": "SYMBOL", "name": "b"} - }, - { - "type": "ALIAS", - "value": "C", - "named": true, - "content": {"type": "SYMBOL", "name": "_c"} - } - ] - }, - - "b": {"type": "STRING", "value": "b"}, - - "_c": {"type": "STRING", "value": "c"}, - - "comment": {"type": "STRING", "value": "..."} - } -})JSON"; - -const TSLanguage *language_with_aliases_and_extras = load_test_language( - "aliases_and_extras", - ts_compile_grammar(grammar_with_aliases_and_extras.c_str(), nullptr) -); - -describe("Node", [&]() { - TSParser *parser; - TSTree *tree; - TSNode root_node; - TSNode NULL_NODE = {}; - - before_each([&]() { - record_alloc::start(); - - parser = ts_parser_new(); - ts_parser_set_language(parser, load_real_language("json")); - tree = ts_parser_parse_string(parser, nullptr, json_string.c_str(), json_string.size()); - root_node = ts_node_child(ts_tree_root_node(tree), 0); - }); - - after_each([&]() { - ts_parser_delete(parser); - ts_tree_delete(tree); - - record_alloc::stop(); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - it("parses the example as expected (precondition)", [&]() { - char *node_string = ts_node_string(root_node); - AssertThat(node_string, Equals( - "(array " - "(number) " - "(false) " - "(object (pair (string) (null))))")); - ts_free(node_string); - }); - - describe("named_child_count(), named_child(i)", [&]() { - it("returns the named child node at the given index", [&]() { - AssertThat(ts_node_type(root_node), Equals("array")); - - AssertThat(ts_node_named_child_count(root_node), Equals(3)); - AssertThat(ts_node_start_byte(root_node), Equals(array_index)); - AssertThat(ts_node_end_byte(root_node), Equals(array_end_index)); - AssertThat(ts_node_start_point(root_node), Equals({ 2, 0 })); - AssertThat(ts_node_end_point(root_node), Equals({ 8, 1 })); - - TSNode number_node = ts_node_named_child(root_node, 0); - TSNode false_node = ts_node_named_child(root_node, 1); - TSNode object_node = ts_node_named_child(root_node, 2); - - AssertThat(ts_node_type(number_node), Equals("number")); - AssertThat(ts_node_type(false_node), Equals("false")); - AssertThat(ts_node_type(object_node), Equals("object")); - - AssertThat(ts_node_start_byte(number_node), Equals(number_index)); - AssertThat(ts_node_end_byte(number_node), Equals(number_end_index)); - AssertThat(ts_node_start_point(number_node), Equals({ 
3, 2 })); - AssertThat(ts_node_end_point(number_node), Equals({ 3, 5 })); - - AssertThat(ts_node_start_byte(false_node), Equals(false_index)); - AssertThat(ts_node_end_byte(false_node), Equals(false_end_index)); - AssertThat(ts_node_start_point(false_node), Equals({ 4, 2 })); - AssertThat(ts_node_end_point(false_node), Equals({ 4, 7 })); - - AssertThat(ts_node_start_byte(object_node), Equals(object_index)); - AssertThat(ts_node_end_byte(object_node), Equals(object_end_index)); - AssertThat(ts_node_start_point(object_node), Equals({ 5, 2 })); - AssertThat(ts_node_end_point(object_node), Equals({ 7, 3 })); - AssertThat(ts_node_named_child_count(object_node), Equals(1)); - - TSNode pair_node = ts_node_named_child(object_node, 0); - - AssertThat(ts_node_type(pair_node), Equals("pair")); - AssertThat(ts_node_start_byte(pair_node), Equals(string_index)); - AssertThat(ts_node_end_byte(pair_node), Equals(null_end_index)); - AssertThat(ts_node_start_point(pair_node), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(pair_node), Equals({ 6, 13 })); - AssertThat(ts_node_named_child_count(pair_node), Equals(2)); - - TSNode string_node = ts_node_named_child(pair_node, 0); - TSNode null_node = ts_node_named_child(pair_node, 1); - - AssertThat(ts_node_type(string_node), Equals("string")); - AssertThat(ts_node_type(null_node), Equals("null")); - - AssertThat(ts_node_start_byte(string_node), Equals(string_index)); - AssertThat(ts_node_end_byte(string_node), Equals(string_end_index)); - AssertThat(ts_node_start_point(string_node), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(string_node), Equals({ 6, 7 })); - - AssertThat(ts_node_start_byte(null_node), Equals(null_index)); - AssertThat(ts_node_end_byte(null_node), Equals(null_end_index)); - AssertThat(ts_node_start_point(null_node), Equals({ 6, 9 })); - AssertThat(ts_node_end_point(null_node), Equals({ 6, 13 })); - - AssertThat(ts_node_parent(string_node), Equals(pair_node)); - AssertThat(ts_node_parent(null_node), Equals(pair_node)); - AssertThat(ts_node_parent(pair_node), Equals(object_node)); - AssertThat(ts_node_parent(number_node), Equals(root_node)); - AssertThat(ts_node_parent(false_node), Equals(root_node)); - AssertThat(ts_node_parent(object_node), Equals(root_node)); - AssertThat(ts_node_parent(ts_tree_root_node(tree)), Equals(NULL_NODE)); - }); - - it("works correctly when the node contains aliased children and extras", [&]() { - ts_parser_set_language(parser, language_with_aliases_and_extras); - ts_tree_delete(tree); - tree = ts_parser_parse_string(parser, nullptr, "b ... b ... 
c", 13); - root_node = ts_tree_root_node(tree); - - char *node_string = ts_node_string(root_node); - AssertThat(node_string, Equals("(a (b) (comment) (B) (comment) (C))")); - ts_free(node_string); - - AssertThat(ts_node_named_child_count(root_node), Equals(5u)); - AssertThat(ts_node_type(ts_node_named_child(root_node, 0)), Equals("b")); - AssertThat(ts_node_type(ts_node_named_child(root_node, 1)), Equals("comment")); - AssertThat(ts_node_type(ts_node_named_child(root_node, 2)), Equals("B")); - AssertThat(ts_node_type(ts_node_named_child(root_node, 3)), Equals("comment")); - AssertThat(ts_node_type(ts_node_named_child(root_node, 4)), Equals("C")); - - AssertThat( - ts_node_symbol(ts_node_named_child(root_node, 0)), - !Equals(ts_node_symbol(ts_node_named_child(root_node, 2))) - ); - }); - }); - - describe("first_child_for_byte(byte_offset)", [&]() { - it("returns the first child that extends beyond the given byte offset", [&]() { - TSNode child; - - child = ts_node_first_child_for_byte(root_node, array_index); - AssertThat(ts_node_type(child), Equals("[")); - child = ts_node_first_child_for_byte(root_node, number_index); - AssertThat(ts_node_type(child), Equals("number")); - child = ts_node_first_child_for_byte(root_node, number_end_index); - AssertThat(ts_node_type(child), Equals(",")); - child = ts_node_first_child_for_byte(root_node, number_end_index + 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_child_for_byte(root_node, false_index - 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_child_for_byte(root_node, false_index); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_child_for_byte(root_node, false_index + 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_child_for_byte(root_node, false_end_index); - AssertThat(ts_node_type(child), Equals(",")); - child = ts_node_first_child_for_byte(root_node, false_end_index); - AssertThat(ts_node_type(child), Equals(",")); - child = ts_node_first_child_for_byte(root_node, object_index); - AssertThat(ts_node_type(child), Equals("object")); - child = ts_node_first_child_for_byte(root_node, object_index + 1); - AssertThat(ts_node_type(child), Equals("object")); - child = ts_node_first_child_for_byte(root_node, object_end_index); - AssertThat(ts_node_type(child), Equals("]")); - }); - }); - - describe("first_named_child_for_byte(byte_offset)", [&]() { - it("returns the first named child that extends beyond the given byte offset", [&]() { - TSNode child; - - child = ts_node_first_named_child_for_byte(root_node, array_index); - AssertThat(ts_node_type(child), Equals("number")); - child = ts_node_first_named_child_for_byte(root_node, number_index); - AssertThat(ts_node_type(child), Equals("number")); - child = ts_node_first_named_child_for_byte(root_node, number_end_index); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_named_child_for_byte(root_node, number_end_index + 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_named_child_for_byte(root_node, false_index - 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_named_child_for_byte(root_node, false_index); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_named_child_for_byte(root_node, false_index + 1); - AssertThat(ts_node_type(child), Equals("false")); - child = ts_node_first_named_child_for_byte(root_node, false_end_index); - AssertThat(ts_node_type(child), 
Equals("object")); - child = ts_node_first_named_child_for_byte(root_node, object_index); - AssertThat(ts_node_type(child), Equals("object")); - child = ts_node_first_named_child_for_byte(root_node, object_index + 1); - AssertThat(ts_node_type(child), Equals("object")); - child = ts_node_first_named_child_for_byte(root_node, object_end_index); - AssertThat(child, Equals(NULL_NODE)); - }); - }); - - describe("child_count(), child(i)", [&]() { - it("returns the child node at the given index, including anonymous nodes", [&]() { - AssertThat(ts_node_child_count(root_node), Equals(7)); - TSNode child1 = ts_node_child(root_node, 0); - TSNode child2 = ts_node_child(root_node, 1); - TSNode child3 = ts_node_child(root_node, 2); - TSNode child4 = ts_node_child(root_node, 3); - TSNode child5 = ts_node_child(root_node, 4); - TSNode child6 = ts_node_child(root_node, 5); - TSNode child7 = ts_node_child(root_node, 6); - - AssertThat(ts_node_type(root_node), Equals("array")); - AssertThat(ts_node_type(child1), Equals("[")); - AssertThat(ts_node_type(child2), Equals("number")); - AssertThat(ts_node_type(child3), Equals(",")); - AssertThat(ts_node_type(child4), Equals("false")); - AssertThat(ts_node_type(child5), Equals(",")); - AssertThat(ts_node_type(child6), Equals("object")); - AssertThat(ts_node_type(child7), Equals("]")); - - AssertThat(ts_node_is_named(root_node), IsTrue()); - AssertThat(ts_node_is_named(child1), IsFalse()); - AssertThat(ts_node_is_named(child2), IsTrue()); - AssertThat(ts_node_is_named(child3), IsFalse()); - AssertThat(ts_node_is_named(child4), IsTrue()); - AssertThat(ts_node_is_named(child5), IsFalse()); - AssertThat(ts_node_is_named(child6), IsTrue()); - AssertThat(ts_node_is_named(child7), IsFalse()); - - AssertThat(ts_node_start_byte(child1), Equals(array_index)); - AssertThat(ts_node_end_byte(child1), Equals(array_index + 1)); - AssertThat(ts_node_start_point(child1), Equals({ 2, 0 })); - AssertThat(ts_node_end_point(child1), Equals({ 2, 1 })); - - AssertThat(ts_node_start_byte(child3), Equals(number_end_index)); - AssertThat(ts_node_end_byte(child3), Equals(number_end_index + 1)); - AssertThat(ts_node_start_point(child3), Equals({ 3, 5 })); - AssertThat(ts_node_end_point(child3), Equals({ 3, 6 })); - - AssertThat(ts_node_start_byte(child5), Equals(false_end_index)); - AssertThat(ts_node_end_byte(child5), Equals(false_end_index + 1)); - AssertThat(ts_node_start_point(child5), Equals({ 4, 7 })); - AssertThat(ts_node_end_point(child5), Equals({ 4, 8 })); - - AssertThat(ts_node_start_byte(child7), Equals(array_end_index - 1)); - AssertThat(ts_node_end_byte(child7), Equals(array_end_index)); - AssertThat(ts_node_start_point(child7), Equals({ 8, 0 })); - AssertThat(ts_node_end_point(child7), Equals({ 8, 1 })); - - AssertThat(ts_node_child_count(child6), Equals(3)); - - TSNode left_brace = ts_node_child(child6, 0); - TSNode pair = ts_node_child(child6, 1); - TSNode right_brace = ts_node_child(child6, 2); - - TSNode grandchild2 = ts_node_child(pair, 0); - TSNode grandchild3 = ts_node_child(pair, 1); - TSNode grandchild4 = ts_node_child(pair, 2); - - AssertThat(ts_node_type(left_brace), Equals("{")); - AssertThat(ts_node_type(pair), Equals("pair")); - AssertThat(ts_node_type(right_brace), Equals("}")); - - AssertThat(ts_node_type(grandchild2), Equals("string")); - AssertThat(ts_node_type(grandchild3), Equals(":")); - AssertThat(ts_node_type(grandchild4), Equals("null")); - - AssertThat(ts_node_parent(grandchild2), Equals(pair)); - AssertThat(ts_node_parent(grandchild3), Equals(pair)); 
- AssertThat(ts_node_parent(grandchild4), Equals(pair)); - AssertThat(ts_node_parent(left_brace), Equals(child6)); - AssertThat(ts_node_parent(pair), Equals(child6)); - AssertThat(ts_node_parent(right_brace), Equals(child6)); - AssertThat(ts_node_parent(child1), Equals(root_node)); - AssertThat(ts_node_parent(child2), Equals(root_node)); - AssertThat(ts_node_parent(child3), Equals(root_node)); - AssertThat(ts_node_parent(child4), Equals(root_node)); - AssertThat(ts_node_parent(child5), Equals(root_node)); - AssertThat(ts_node_parent(child6), Equals(root_node)); - AssertThat(ts_node_parent(child7), Equals(root_node)); - AssertThat(ts_node_parent(ts_tree_root_node(tree)), Equals(NULL_NODE)); - }); - }); - - describe("next_sibling(), prev_sibling()", [&]() { - it("returns the node's next and previous sibling, including anonymous nodes", [&]() { - TSNode bracket_node1 = ts_node_child(root_node, 0); - TSNode number_node = ts_node_child(root_node, 1); - TSNode array_comma_node1 = ts_node_child(root_node, 2); - TSNode false_node = ts_node_child(root_node, 3); - TSNode array_comma_node2 = ts_node_child(root_node, 4); - TSNode object_node = ts_node_child(root_node, 5); - TSNode brace_node1 = ts_node_child(object_node, 0); - TSNode pair_node = ts_node_child(object_node, 1); - TSNode string_node = ts_node_child(pair_node, 0); - TSNode colon_node = ts_node_child(pair_node, 1); - TSNode null_node = ts_node_child(pair_node, 2); - TSNode brace_node2 = ts_node_child(object_node, 2); - TSNode bracket_node2 = ts_node_child(root_node, 6); - - AssertThat(ts_node_parent(bracket_node1), Equals(root_node)); - AssertThat(ts_node_next_sibling(bracket_node1), Equals(number_node)); - AssertThat(ts_node_next_sibling(number_node), Equals(array_comma_node1)); - AssertThat(ts_node_next_sibling(array_comma_node1), Equals(false_node)); - AssertThat(ts_node_next_sibling(false_node), Equals(array_comma_node2)); - AssertThat(ts_node_next_sibling(array_comma_node2), Equals(object_node)); - AssertThat(ts_node_next_sibling(object_node), Equals(bracket_node2)); - AssertThat(ts_node_next_sibling(bracket_node2), Equals(NULL_NODE)); - - AssertThat(ts_node_prev_sibling(bracket_node1), Equals(NULL_NODE)); - AssertThat(ts_node_prev_sibling(number_node), Equals(bracket_node1)); - AssertThat(ts_node_prev_sibling(array_comma_node1), Equals(number_node)); - AssertThat(ts_node_prev_sibling(false_node), Equals(array_comma_node1)); - AssertThat(ts_node_prev_sibling(array_comma_node2), Equals(false_node)); - AssertThat(ts_node_prev_sibling(object_node), Equals(array_comma_node2)); - AssertThat(ts_node_prev_sibling(bracket_node2), Equals(object_node)); - - AssertThat(ts_node_next_sibling(brace_node1), Equals(pair_node)); - AssertThat(ts_node_next_sibling(pair_node), Equals(brace_node2)); - AssertThat(ts_node_next_sibling(brace_node2), Equals(NULL_NODE)); - - AssertThat(ts_node_prev_sibling(brace_node1), Equals(NULL_NODE)); - AssertThat(ts_node_prev_sibling(pair_node), Equals(brace_node1)); - AssertThat(ts_node_prev_sibling(brace_node2), Equals(pair_node)); - - AssertThat(ts_node_next_sibling(string_node), Equals(colon_node)); - AssertThat(ts_node_next_sibling(colon_node), Equals(null_node)); - AssertThat(ts_node_next_sibling(null_node), Equals(NULL_NODE)); - - AssertThat(ts_node_prev_sibling(string_node), Equals(NULL_NODE)); - AssertThat(ts_node_prev_sibling(colon_node), Equals(string_node)); - AssertThat(ts_node_prev_sibling(null_node), Equals(colon_node)); - }); - - it("returns null when the node has no parent", [&]() { - 
AssertThat(ts_node_next_named_sibling(root_node), Equals(NULL_NODE)); - AssertThat(ts_node_prev_named_sibling(root_node), Equals(NULL_NODE)); - }); - - it("works for missing nodes", [&]() { - ts_tree_delete(tree); - - string input_string = "")); - AssertThat(ts_node_is_missing(missing_node), IsTrue()); - - TSNode tag_name_node = ts_node_prev_sibling(missing_node); - AssertThat(ts_node_type(tag_name_node), Equals("tag_name")); - AssertThat(ts_node_next_sibling(tag_name_node), Equals(missing_node)); - }); - }); - - describe("next_named_sibling(), prev_named_sibling()", [&]() { - it("returns the node's next and previous siblings", [&]() { - TSNode number_node = ts_node_named_child(root_node, 0); - TSNode false_node = ts_node_named_child(root_node, 1); - TSNode object_node = ts_node_named_child(root_node, 2); - TSNode pair_node = ts_node_named_child(object_node, 0); - TSNode string_node = ts_node_named_child(pair_node, 0); - TSNode null_node = ts_node_named_child(pair_node, 1); - - AssertThat(ts_node_next_named_sibling(number_node), Equals(false_node)); - AssertThat(ts_node_next_named_sibling(false_node), Equals(object_node)); - AssertThat(ts_node_next_named_sibling(string_node), Equals(null_node)); - AssertThat(ts_node_prev_named_sibling(object_node), Equals(false_node)); - AssertThat(ts_node_prev_named_sibling(false_node), Equals(number_node)); - AssertThat(ts_node_prev_named_sibling(null_node), Equals(string_node)); - }); - - it("returns null when the node has no parent", [&]() { - AssertThat(ts_node_next_named_sibling(root_node), Equals(NULL_NODE)); - AssertThat(ts_node_prev_named_sibling(root_node), Equals(NULL_NODE)); - }); - }); - - describe("named_descendant_for_byte_range(start, end)", [&]() { - describe("when there is a leaf node that spans the given range exactly", [&]() { - it("returns that leaf node", [&]() { - TSNode leaf = ts_node_named_descendant_for_byte_range(root_node, string_index, string_end_index - 1); - AssertThat(ts_node_type(leaf), Equals("string")); - AssertThat(ts_node_start_byte(leaf), Equals(string_index)); - AssertThat(ts_node_end_byte(leaf), Equals(string_end_index)); - AssertThat(ts_node_start_point(leaf), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(leaf), Equals({ 6, 7 })); - - leaf = ts_node_named_descendant_for_byte_range(root_node, number_index, number_end_index - 1); - AssertThat(ts_node_type(leaf), Equals("number")); - AssertThat(ts_node_start_byte(leaf), Equals(number_index)); - AssertThat(ts_node_end_byte(leaf), Equals(number_end_index)); - AssertThat(ts_node_start_point(leaf), Equals({ 3, 2 })); - AssertThat(ts_node_end_point(leaf), Equals({ 3, 5 })); - - TSNode parent = ts_node_parent(leaf); - AssertThat(ts_node_type(parent), Equals("array")); - AssertThat(ts_node_start_byte(parent), Equals(array_index)); - parent = ts_node_parent(parent); - AssertThat(ts_node_type(parent), Equals("value")); - AssertThat(ts_node_start_byte(parent), Equals(array_index)); - }); - }); - - describe("when there is a leaf node that extends beyond the given range", [&]() { - it("returns that leaf node", [&]() { - TSNode leaf = ts_node_named_descendant_for_byte_range(root_node, string_index, string_index + 1); - AssertThat(ts_node_type(leaf), Equals("string")); - AssertThat(ts_node_start_byte(leaf), Equals(string_index)); - AssertThat(ts_node_end_byte(leaf), Equals(string_end_index)); - AssertThat(ts_node_start_point(leaf), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(leaf), Equals({ 6, 7 })); - - leaf = ts_node_named_descendant_for_byte_range(root_node, 
string_index + 1, string_index + 2); - AssertThat(ts_node_type(leaf), Equals("string")); - AssertThat(ts_node_start_byte(leaf), Equals(string_index)); - AssertThat(ts_node_end_byte(leaf), Equals(string_end_index)); - AssertThat(ts_node_start_point(leaf), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(leaf), Equals({ 6, 7 })); - }); - }); - - describe("when there is no leaf node that spans the given range", [&]() { - it("returns the smallest node that does span the range", [&]() { - TSNode pair_node = ts_node_named_descendant_for_byte_range(root_node, string_index, string_index + 3); - AssertThat(ts_node_type(pair_node), Equals("pair")); - AssertThat(ts_node_start_byte(pair_node), Equals(string_index)); - AssertThat(ts_node_end_byte(pair_node), Equals(null_end_index)); - AssertThat(ts_node_start_point(pair_node), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(pair_node), Equals({ 6, 13 })); - }); - - it("does not return invisible nodes (repeats)", [&]() { - TSNode node = ts_node_named_descendant_for_byte_range(root_node, number_end_index, number_end_index + 1); - AssertThat(ts_node_type(node), Equals("array")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - AssertThat(ts_node_end_byte(node), Equals(array_end_index)); - AssertThat(ts_node_start_point(node), Equals({ 2, 0 })); - AssertThat(ts_node_end_point(node), Equals({ 8, 1 })); - }); - }); - }); - - describe("descendant_for_byte_range(start, end)", [&]() { - it("returns the smallest node that spans the given byte offsets", [&]() { - TSNode node1 = ts_node_descendant_for_byte_range(root_node, colon_index, colon_index); - AssertThat(ts_node_type(node1), Equals(":")); - AssertThat(ts_node_start_byte(node1), Equals(colon_index)); - AssertThat(ts_node_end_byte(node1), Equals(colon_index + 1)); - AssertThat(ts_node_start_point(node1), Equals({ 6, 7 })); - AssertThat(ts_node_end_point(node1), Equals({ 6, 8 })); - - TSNode node2 = ts_node_descendant_for_byte_range(root_node, string_index + 2, string_index + 4); - AssertThat(ts_node_type(node2), Equals("pair")); - AssertThat(ts_node_start_byte(node2), Equals(string_index)); - AssertThat(ts_node_end_byte(node2), Equals(null_end_index)); - AssertThat(ts_node_start_point(node2), Equals({ 6, 4 })); - AssertThat(ts_node_end_point(node2), Equals({ 6, 13 })); - - AssertThat(ts_node_parent(node1), Equals(node2)); - }); - - it("works in the presence of multi-byte characters", [&]() { - string input_string = "[\"αβγδ\", \"αβγδ\"]"; - - ts_tree_delete(tree); - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - TSNode root_node = ts_tree_root_node(tree); - - uint32_t comma_position = input_string.find(","); - TSNode node1 = ts_node_descendant_for_byte_range(root_node, comma_position, comma_position); - AssertThat(ts_node_type(node1), Equals(",")); - - TSNode node2 = ts_node_descendant_for_byte_range(root_node, 6, 10); - AssertThat(ts_node_type(node2), Equals("string")); - AssertThat(ts_node_start_byte(node2), Equals(1)); - AssertThat(ts_node_end_byte(node2), Equals(11)); - }); - }); - - describe("descendant_for_point_range(start, end)", [&]() { - it("returns the smallest concrete node that spans the given range", [&]() { - TSNode node1 = ts_node_descendant_for_point_range(root_node, {6, 7}, {6, 7}); - AssertThat(ts_node_type(node1), Equals(":")); - AssertThat(ts_node_start_byte(node1), Equals(colon_index)); - AssertThat(ts_node_end_byte(node1), Equals(colon_index + 1)); - AssertThat(ts_node_start_point(node1), Equals({ 6, 7 })); - 
AssertThat(ts_node_end_point(node1), Equals({ 6, 8 }));
-
-      TSNode node2 = ts_node_descendant_for_point_range(root_node, {6, 6}, {6, 8});
-      AssertThat(ts_node_type(node2), Equals("pair"));
-      AssertThat(ts_node_start_byte(node2), Equals(string_index));
-      AssertThat(ts_node_end_byte(node2), Equals(null_end_index));
-      AssertThat(ts_node_start_point(node2), Equals({ 6, 4 }));
-      AssertThat(ts_node_end_point(node2), Equals({ 6, 13 }));
-
-      AssertThat(ts_node_parent(node1), Equals(node2));
-    });
-  });
-
-  describe("parent()", [&]() {
-    it("works after the tree has been edited (regression)", [&]() {
-      TSNode false_node = ts_node_named_child(root_node, 1);
-
-      TSInputEdit edit = {0, 0, 5, {0, 0}, {0, 0}, {5, 0}};
-      ts_tree_edit(tree, &edit);
-
-      TSNode array_node = ts_node_parent(false_node);
-      AssertThat(ts_node_start_point(array_node), Equals({7, 0}));
-    });
-  });
-
-  describe("edit(edit)", [&]() {
-    vector<pair<string, TSInputEdit>> test_table = {
-      {
-        "insert 5 lines at the beginning",
-        {
-          0, 0, 5,
-          {0, 0}, {0, 0}, {5, 0}
-        },
-      },
-
-      {
-        "delete first 5 lines",
-        {
-          0, (uint32_t)object_index, 0,
-          {0, 0}, {5, 2}, {0, 0}
-        },
-      },
-
-      {
-        "replace entire text",
-        {
-          0, (uint32_t)json_string.size(), 5,
-          {0, 0}, {9, 0}, {0, 5}
-        }
-      }
-    };
-
-    auto get_all_nodes = [&]() {
-      vector<TSNode> result;
-      bool visited_children = false;
-      TSTreeCursor cursor = ts_tree_cursor_new(ts_tree_root_node(tree));
-      while (true) {
-        result.push_back(ts_tree_cursor_current_node(&cursor));
-        if (!visited_children && ts_tree_cursor_goto_first_child(&cursor)) continue;
-        if (ts_tree_cursor_goto_next_sibling(&cursor)) {
-          visited_children = false;
-        } else if (ts_tree_cursor_goto_parent(&cursor)) {
-          visited_children = true;
-        } else {
-          break;
-        }
-      }
-      ts_tree_cursor_delete(&cursor);
-      return result;
-    };
-
-    for (auto &entry : test_table) {
-      const string &description = entry.first;
-      const TSInputEdit &edit = entry.second;
-
-      it(("updates the node's start position according to edit - " + description).c_str(), [&]() {
-        auto nodes_before = get_all_nodes();
-
-        ts_tree_edit(tree, &edit);
-        for (TSNode &node : nodes_before) {
-          ts_node_edit(&node, &edit);
-        }
-
-        auto nodes_after = get_all_nodes();
-
-        for (unsigned i = 0; i < nodes_before.size(); i++) {
-          TSNode &node_before = nodes_before[i];
-          TSNode &node_after = nodes_after[i];
-          AssertThat(node_before, Equals(node_after));
-        }
-      });
-    }
-
-    it("updates the node's start position according to edit - random edits", [&]() {
-      SpyInput input(json_string, 3);
-      Generator random(TREE_SITTER_SEED);
-
-      for (unsigned i = 0; i < 10; i++) {
-        auto nodes_before = get_all_nodes();
-
-        size_t edit_start = random(input.content.size());
-        size_t deletion_size = random(2) ?
0 : random(input.content.size() - edit_start); - string inserted_text = random.words(random(4) + 1); - - TSInputEdit edit = input.replace(edit_start, deletion_size, inserted_text); - ts_tree_edit(tree, &edit); - for (TSNode &node : nodes_before) { - ts_node_edit(&node, &edit); - } - - auto nodes_after = get_all_nodes(); - - for (unsigned i = 0; i < nodes_before.size(); i++) { - TSNode &node_before = nodes_before[i]; - TSNode &node_after = nodes_after[i]; - AssertThat(node_before, Equals(node_after)); - } - } - }); - }); -}); - -describe("TreeCursor", [&]() { - TSParser *parser; - TSTree *tree; - TSTreeCursor cursor; - - before_each([&]() { - record_alloc::start(); - - parser = ts_parser_new(); - ts_parser_set_language(parser, load_real_language("json")); - tree = ts_parser_parse_string(parser, nullptr, json_string.c_str(), json_string.size()); - cursor = ts_tree_cursor_new(ts_tree_root_node(tree)); - }); - - after_each([&]() { - ts_tree_delete(tree); - ts_tree_cursor_delete(&cursor); - ts_parser_delete(parser); - - record_alloc::stop(); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - it("can walk the tree", [&]() { - TSNode node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("value")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("array")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("[")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - // Cannot descend into a node with no children - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsFalse()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("[")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("number")); - AssertThat(ts_node_start_byte(node), Equals(number_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals(",")); - AssertThat(ts_node_start_byte(node), Equals(number_end_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("false")); - AssertThat(ts_node_start_byte(node), Equals(false_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals(",")); - AssertThat(ts_node_start_byte(node), Equals(false_end_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("object")); - AssertThat(ts_node_start_byte(node), Equals(object_index)); - - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("{")); - AssertThat(ts_node_start_byte(node), Equals(object_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - 
AssertThat(ts_node_type(node), Equals("pair")); - AssertThat(ts_node_start_byte(node), Equals(string_index)); - - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("string")); - AssertThat(ts_node_start_byte(node), Equals(string_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals(":")); - AssertThat(ts_node_start_byte(node), Equals(string_end_index)); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("null")); - AssertThat(ts_node_start_byte(node), Equals(null_index)); - - // Cannot move beyond a node with no next sibling - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsFalse()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("null")); - AssertThat(ts_node_start_byte(node), Equals(null_index)); - - AssertThat(ts_tree_cursor_goto_parent(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("pair")); - AssertThat(ts_node_start_byte(node), Equals(string_index)); - - AssertThat(ts_tree_cursor_goto_parent(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("object")); - AssertThat(ts_node_start_byte(node), Equals(object_index)); - - AssertThat(ts_tree_cursor_goto_parent(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("array")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - AssertThat(ts_tree_cursor_goto_parent(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("value")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - - // The root node doesn't have a parent. 
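// Editor's sketch (not part of the deleted file): the step-by-step cursor walk above
// generalizes to a full pre-order traversal built from the same three moves, just as
// the get_all_nodes lambda in the deleted node_test.cc does. A self-contained version;
// the function name is illustrative. The deleted test continues below.

#include <tree_sitter/runtime.h>

void visit_all_nodes(TSNode root) {
  TSTreeCursor cursor = ts_tree_cursor_new(root);
  bool visited_children = false;
  while (true) {
    TSNode node = ts_tree_cursor_current_node(&cursor);
    if (!visited_children) {
      // visit `node` here (pre-order position)
    }
    if (!visited_children && ts_tree_cursor_goto_first_child(&cursor)) {
      continue;  // descend into the first child
    } else if (ts_tree_cursor_goto_next_sibling(&cursor)) {
      visited_children = false;  // move right, then explore downward again
    } else if (ts_tree_cursor_goto_parent(&cursor)) {
      visited_children = true;  // climb; this parent's children are done
    } else {
      break;  // back at the root with nothing left to do
    }
  }
  ts_tree_cursor_delete(&cursor);
}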
- AssertThat(ts_tree_cursor_goto_parent(&cursor), IsFalse()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("value")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - }); - - it("can find the first child of a given node which spans the given byte offset", [&]() { - int64_t child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, 1); - TSNode node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("array")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - AssertThat(child_index, Equals(0)); - - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, array_index); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("[")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - AssertThat(child_index, Equals(0)); - - ts_tree_cursor_goto_parent(&cursor); - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, array_index + 1); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("number")); - AssertThat(ts_node_start_byte(node), Equals(number_index)); - AssertThat(child_index, Equals(1)); - - ts_tree_cursor_goto_parent(&cursor); - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, number_index + 1); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("number")); - AssertThat(ts_node_start_byte(node), Equals(number_index)); - AssertThat(child_index, Equals(1)); - - ts_tree_cursor_goto_parent(&cursor); - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, false_index - 1); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("false")); - AssertThat(ts_node_start_byte(node), Equals(false_index)); - AssertThat(child_index, Equals(3)); - - ts_tree_cursor_goto_parent(&cursor); - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, object_end_index - 1); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("object")); - AssertThat(ts_node_start_byte(node), Equals(object_index)); - AssertThat(child_index, Equals(5)); - - // There is no child past the end of the array - ts_tree_cursor_goto_parent(&cursor); - child_index = ts_tree_cursor_goto_first_child_for_byte(&cursor, array_end_index); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("array")); - AssertThat(ts_node_start_byte(node), Equals(array_index)); - AssertThat(child_index, Equals(-1)); - }); - - it("walks the tree correctly when the node contains aliased children and extras", [&]() { - ts_parser_set_language(parser, language_with_aliases_and_extras); - ts_tree_cursor_delete(&cursor); - ts_tree_delete(tree); - - tree = ts_parser_parse_string(parser, nullptr, "b ... b ... 
c", 13); - cursor = ts_tree_cursor_new(ts_tree_root_node(tree)); - - TSNode node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("a")); - - AssertThat(ts_tree_cursor_goto_first_child(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("b")); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("comment")); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("B")); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("comment")); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsTrue()); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("C")); - - AssertThat(ts_tree_cursor_goto_next_sibling(&cursor), IsFalse()); - AssertThat(ts_tree_cursor_goto_parent(&cursor), IsTrue()); - AssertThat(ts_tree_cursor_goto_first_child_for_byte(&cursor, 0), Equals(0)); - }); - - it("walks the tree correctly when there are hidden leaf nodes", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - ts_tree_cursor_delete(&cursor); - ts_tree_delete(tree); - - tree = ts_parser_parse_string(parser, nullptr, "`abc${1}def${2}ghi`", 19); - cursor = ts_tree_cursor_new(ts_tree_root_node(tree)); - TSNode node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("program")); - - ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("expression_statement")); - - ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("template_string")); - - int index = ts_tree_cursor_goto_first_child_for_byte(&cursor, 9); - AssertThat(index, Equals(2)); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("template_substitution")); - AssertThat(ts_node_start_byte(node), Equals(11u)); - AssertThat(ts_node_end_byte(node), Equals(15u)); - - index = ts_tree_cursor_goto_first_child_for_byte(&cursor, 20); - AssertThat(index, Equals(-1)); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("template_substitution")); - - ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("${")); - - index = ts_tree_cursor_goto_first_child_for_byte(&cursor, 20); - AssertThat(index, Equals(-1)); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("${")); - }); - - it("handles parent nodes that are aliased", [&]() { - ts_parser_set_language(parser, load_real_language("html")); - ts_tree_cursor_delete(&cursor); - ts_tree_delete(tree); - - tree = ts_parser_parse_string(parser, nullptr, "", 18); - - cursor = ts_tree_cursor_new(ts_tree_root_node(tree)); - TSNode node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("fragment")); - - ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("raw_element")); - - ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("start_tag")); - - 
ts_tree_cursor_goto_first_child(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("<")); - - ts_tree_cursor_goto_parent(&cursor); - node = ts_tree_cursor_current_node(&cursor); - AssertThat(ts_node_type(node), Equals("start_tag")); - }); -}); - -END_TEST diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc deleted file mode 100644 index 8766ca0b..00000000 --- a/test/runtime/parser_test.cc +++ /dev/null @@ -1,1232 +0,0 @@ -#include "test_helper.h" -#include -#include "runtime/alloc.h" -#include "runtime/language.h" -#include "runtime/get_changed_ranges.h" -#include "helpers/record_alloc.h" -#include "helpers/spy_input.h" -#include "helpers/load_language.h" -#include "helpers/record_alloc.h" -#include "helpers/point_helpers.h" -#include "helpers/spy_logger.h" -#include "helpers/stderr_logger.h" -#include "helpers/dedent.h" - -START_TEST - -describe("Parser", [&]() { - TSParser *parser; - TSTree *tree; - SpyInput *input; - TSNode root; - size_t chunk_size; - - before_each([&]() { - record_alloc::start(); - - chunk_size = 3; - input = nullptr; - tree = nullptr; - parser = ts_parser_new(); - if (getenv("TREE_SITTER_ENABLE_DEBUG_GRAPHS")) { - ts_parser_print_dot_graphs(parser, stderr); - } - }); - - after_each([&]() { - if (parser) ts_parser_delete(parser); - if (tree) ts_tree_delete(tree); - if (input) delete input; - - record_alloc::stop(); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - auto set_text = [&](string text) { - input = new SpyInput(text, chunk_size); - tree = ts_parser_parse(parser, nullptr, input->input()); - root = ts_tree_root_node(tree); - AssertThat(ts_node_end_byte(root), Equals(text.size())); - input->clear(); - }; - - auto replace_text = [&](size_t position, size_t length, string new_text) { - size_t prev_size = ts_node_end_byte(root); - - TSInputEdit edit = input->replace(position, length, new_text); - ts_tree_edit(tree, &edit); - TSTree *new_tree = ts_parser_parse(parser, tree, input->input()); - ts_tree_delete(tree); - tree = new_tree; - - root = ts_tree_root_node(tree); - size_t new_size = ts_node_end_byte(root); - AssertThat(new_size, Equals(prev_size - length + new_text.size())); - }; - - auto insert_text = [&](size_t position, string text) { - replace_text(position, 0, text); - }; - - auto delete_text = [&](size_t position, size_t length) { - replace_text(position, length, ""); - }; - - auto undo = [&]() { - TSInputEdit edit = input->undo(); - ts_tree_edit(tree, &edit); - TSTree *new_tree = ts_parser_parse(parser, tree, input->input()); - ts_tree_delete(tree); - tree = new_tree; - }; - - auto assert_root_node = [&](const string &expected) { - TSNode node = ts_tree_root_node(tree); - char *node_string = ts_node_string(node); - string actual(node_string); - ts_free(node_string); - AssertThat(actual, Equals(expected)); - }; - - auto get_node_text = [&](TSNode node) { - size_t start = ts_node_start_byte(node); - size_t end = ts_node_end_byte(node); - return input->content.substr(start, end - start); - }; - - describe("handling errors", [&]() { - describe("when there is an invalid substring right before a valid token", [&]() { - it("computes the error node's size and position correctly", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - set_text(" [123, @@@@@, true]"); - assert_root_node("(value (array (number) (ERROR (UNEXPECTED '@')) (true)))"); - - TSNode error = ts_node_named_child(ts_node_child(root, 0), 1); - AssertThat(ts_node_type(error), 
Equals("ERROR")); - AssertThat(get_node_text(error), Equals("@@@@@,")); - AssertThat(ts_node_child_count(error), Equals(2)); - - TSNode garbage = ts_node_child(error, 0); - AssertThat(get_node_text(garbage), Equals("@@@@@")); - - TSNode comma = ts_node_child(error, 1); - AssertThat(get_node_text(comma), Equals(",")); - - TSNode node_after_error = ts_node_next_named_sibling(error); - AssertThat(ts_node_type(node_after_error), Equals("true")); - AssertThat(get_node_text(node_after_error), Equals("true")); - }); - }); - - describe("when there is an unexpected string in the middle of a token", [&]() { - it("computes the error node's size and position correctly", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - set_text(" [123, faaaaalse, true]"); - - assert_root_node( - "(value (array (number) (ERROR (UNEXPECTED 'a')) (true)))"); - - TSNode error = ts_node_named_child(ts_node_child(root, 0), 1); - AssertThat(ts_node_type(error), Equals("ERROR")); - AssertThat(get_node_text(error), Equals("faaaaalse,")); - AssertThat(ts_node_child_count(error), Equals(2)); - - TSNode garbage = ts_node_child(error, 0); - AssertThat(ts_node_type(garbage), Equals("ERROR")); - AssertThat(get_node_text(garbage), Equals("faaaaalse")); - - TSNode comma = ts_node_child(error, 1); - AssertThat(ts_node_type(comma), Equals(",")); - AssertThat(get_node_text(comma), Equals(",")); - - TSNode last = ts_node_next_named_sibling(error); - AssertThat(ts_node_type(last), Equals("true")); - AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, "))); - }); - }); - - describe("when there is one unexpected token between two valid tokens", [&]() { - it("computes the error node's size and position correctly", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - set_text(" [123, true false, true]"); - - assert_root_node("(value (array (number) (true) (ERROR (false)) (true)))"); - - TSNode error = ts_node_named_child(ts_node_child(root, 0), 2); - AssertThat(ts_node_type(error), Equals("ERROR")); - AssertThat(get_node_text(error), Equals("false")); - AssertThat(ts_node_child_count(error), Equals(1)); - - TSNode last = ts_node_next_named_sibling(error); - AssertThat(ts_node_type(last), Equals("true")); - AssertThat(get_node_text(last), Equals("true")); - }); - }); - - describe("when there is an unexpected string at the end of a token", [&]() { - it("computes the error's size and position correctly", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - set_text(" [123, truue\n, true]"); - assert_root_node("(value (array (number) (ERROR (UNEXPECTED 'u')) (true)))"); - }); - }); - - describe("when there is an unterminated error", [&]() { - it("maintains a consistent tree", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - set_text("nul"); - assert_root_node("(ERROR (UNEXPECTED EOF))"); - }); - }); - - describe("when there are extra tokens at the end of the viable prefix", [&]() { - it("does not include them in the error node", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text( - "var x;\n" - "\n" - "if\n" - "\n" - "var y;" - ); - - TSNode error = ts_node_named_child(root, 1); - AssertThat(ts_node_type(error), Equals("ERROR")); - AssertThat(ts_node_start_point(error), Equals({2, 0})); - AssertThat(ts_node_end_point(error), Equals({2, 2})); - }); - }); - - it("handles invalid UTF8 characters at EOF", [&]() { - char *string = (char *)malloc(1); - string[0] = '\xdf'; - - ts_parser_set_language(parser, 
load_real_language("json")); - tree = ts_parser_parse_string(parser, nullptr, string, 1); - - free(string); - assert_root_node("(ERROR (UNEXPECTED INVALID))"); - }); - - describe("when halt_on_error is set to true", [&]() { - it("halts as soon as an error is found if the halt_on_error flag is set", [&]() { - string input_string = "[1, null, error, 3]"; - ts_parser_set_language(parser, load_real_language("json")); - - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - root = ts_tree_root_node(tree); - assert_root_node("(value (array (number) (null) (ERROR (UNEXPECTED 'e')) (number)))"); - - ts_parser_halt_on_error(parser, true); - - ts_tree_delete(tree); - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - root = ts_tree_root_node(tree); - assert_root_node("(ERROR (number) (null))"); - AssertThat(ts_node_end_byte(root), Equals(input_string.size())); - }); - - it("does not insert missing tokens if the halt_on_error flag is set", [&]() { - string input_string = "[1, null, 3"; - ts_parser_set_language(parser, load_real_language("json")); - - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - root = ts_tree_root_node(tree); - assert_root_node("(value (array (number) (null) (number) (MISSING)))"); - - ts_parser_halt_on_error(parser, true); - - ts_tree_delete(tree); - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - root = ts_tree_root_node(tree); - assert_root_node("(ERROR (number) (null) (number))"); - AssertThat(ts_node_end_byte(root), Equals(input_string.size())); - }); - - it("can parse valid code with the halt_on_error flag set", [&]() { - string input_string = "[1, null, 3]"; - ts_parser_set_language(parser, load_real_language("json")); - - ts_parser_halt_on_error(parser, true); - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - root = ts_tree_root_node(tree); - assert_root_node("(value (array (number) (null) (number)))"); - }); - }); - }); - - describe("editing", [&]() { - describe("creating new tokens near the end of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("x * (100 + abc);"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(identifier) " - "(parenthesized_expression " - "(binary_expression (number) (identifier))))))"); - - insert_text(strlen("x * (100 + abc"), ".d"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(identifier) " - "(parenthesized_expression " - "(binary_expression (number) (member_expression (identifier) (property_identifier)))))))"); - - AssertThat(input->strings_read(), Equals(vector({ - // The '*' is not reused because the preceding `x` expression is reused, which - // puts the parser into a different state than when the `*` was initially tokenized. - // When the `*` was initially tokenized, `x` was just an identifier. In both of these - // states, external tokens are valid so we don't reuse tokens unless the lex states - // match. This could probably be improved somehow. 
- " * ", - " abc.d)" - }))); - }); - }); - - describe("creating new tokens near the beginning of the input", [&]() { - it("updates the parse tree and re-reads only the changed portion of the input", [&]() { - chunk_size = 2; - - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("123 + 456 * (10 + x);"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(number) " - "(binary_expression (number) (parenthesized_expression (binary_expression (number) (identifier)))))))"); - - insert_text(strlen("123"), " || 5"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(number) " - "(binary_expression " - "(number) " - "(binary_expression (number) (parenthesized_expression (binary_expression (number) (identifier))))))))"); - - AssertThat(input->strings_read(), Equals(vector({ - "123 || 5 ", - ";" - }))); - }); - }); - - describe("introducing an error", [&]() { - it("gives the error the right size", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("var x = y;"); - - assert_root_node( - "(program (variable_declaration (variable_declarator " - "(identifier) (identifier))))"); - - insert_text(strlen("var x = y"), " *"); - - assert_root_node( - "(program (variable_declaration (variable_declarator " - "(identifier) (identifier)) (ERROR)))"); - - insert_text(strlen("var x = y *"), " z"); - - assert_root_node( - "(program (variable_declaration (variable_declarator " - "(identifier) (binary_expression (identifier) (identifier)))))"); - }); - }); - - describe("into the middle of an existing token", [&]() { - it("updates the parse tree", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("abc * 123;"); - - assert_root_node( - "(program (expression_statement (binary_expression (identifier) (number))))"); - - insert_text(strlen("ab"), "XYZ"); - - assert_root_node( - "(program (expression_statement (binary_expression (identifier) (number))))"); - - TSNode node = ts_node_named_descendant_for_byte_range(root, 1, 1); - AssertThat(ts_node_type(node), Equals("identifier")); - AssertThat(ts_node_end_byte(node), Equals(strlen("abXYZc"))); - }); - }); - - describe("at the end of an existing token", [&]() { - it("updates the parse tree", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("abc * 123;"); - - assert_root_node( - "(program (expression_statement (binary_expression (identifier) (number))))"); - - insert_text(strlen("abc"), "XYZ"); - - assert_root_node( - "(program (expression_statement (binary_expression (identifier) (number))))"); - - TSNode node = ts_node_named_descendant_for_byte_range(root, 1, 1); - AssertThat(ts_node_type(node), Equals("identifier")); - AssertThat(ts_node_end_byte(node), Equals(strlen("abcXYZ"))); - }); - }); - - describe("inserting text into a node containing a extra token", [&]() { - it("updates the parse tree", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("123 *\n" - "// a-comment\n" - "abc;"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(number) " - "(comment) " - "(identifier))))"); - - insert_text( - strlen("123 *\n" - "// a-comment\n" - "abc"), - "XYZ"); - - assert_root_node( - "(program (expression_statement (binary_expression " - "(number) " - "(comment) " - "(identifier))))"); - }); - }); - - describe("when a critical token is removed", [&]() { - it("updates the parse tree, creating an error", [&]() { - 
ts_parser_set_language(parser, load_real_language("javascript")); - set_text("123 * 456; 789 * 123;"); - - assert_root_node( - "(program " - "(expression_statement (binary_expression (number) (number))) " - "(expression_statement (binary_expression (number) (number))))"); - - delete_text(strlen("123 "), 2); - - assert_root_node( - "(program " - "(ERROR (number)) " - "(expression_statement (number)) " - "(expression_statement (binary_expression (number) (number))))"); - }); - }); - - describe("with external tokens", [&]() { - it("maintains the external scanner's state during incremental parsing", [&]() { - ts_parser_set_language(parser, load_real_language("python")); - string text = dedent(R"PYTHON( - if a: - print b - return c - )PYTHON"); - - set_text(text); - assert_root_node("(module " - "(if_statement (identifier) " - "(print_statement (identifier))) " - "(return_statement (expression_list (identifier))))"); - - replace_text(text.find("return"), 0, " "); - assert_root_node("(module " - "(if_statement (identifier) " - "(print_statement (identifier)) " - "(return_statement (expression_list (identifier)))))"); - - undo(); - assert_root_node("(module " - "(if_statement (identifier) " - "(print_statement (identifier))) " - "(return_statement (expression_list (identifier))))"); - }); - }); - - describe("insertions at the end of the file", [&]() { - it("doesn't incorrectly reuse nodes at EOF", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - - set_text("ab"); - assert_root_node("(program (expression_statement (identifier)))"); - - insert_text(input->content.size(), " "); - insert_text(input->content.size(), "+="); - insert_text(input->content.size(), " "); - insert_text(input->content.size(), "12"); - assert_root_node("(program (expression_statement (augmented_assignment_expression (identifier) (number))))"); - }); - }); - - it("does not try to reuse nodes that are within the edited region", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("{ x: (b.c) };"); - - assert_root_node( - "(program (expression_statement (object (pair " - "(property_identifier) (parenthesized_expression (member_expression (identifier) (property_identifier)))))))"); - - replace_text(strlen("{ x: "), strlen("(b.c)"), "b.c"); - - assert_root_node( - "(program (expression_statement (object (pair " - "(property_identifier) (member_expression (identifier) (property_identifier))))))"); - }); - }); - - describe("lexing", [&]() { - describe("handling tokens containing wildcard patterns (e.g. 
comments)", [&]() { - it("terminates them at the end of the string", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("x; // this is a comment"); - - assert_root_node( - "(program (expression_statement (identifier)) (comment))"); - - TSNode comment = ts_node_named_child(root, 1); - - AssertThat(ts_node_start_byte(comment), Equals(strlen("x; "))); - AssertThat(ts_node_end_byte(comment), Equals(strlen("x; // this is a comment"))); - }); - }); - - it("recognizes UTF8 characters as single characters", [&]() { - // 'ΩΩΩ — ΔΔ'; - ts_parser_set_language(parser, load_real_language("javascript")); - set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); - - assert_root_node( - "(program (expression_statement (string)))"); - - AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"))); - }); - - it("handles non-UTF8 characters", [&]() { - const char *string = "cons\xeb\x00e=ls\x83l6hi');\x0a"; - - ts_parser_set_language(parser, load_real_language("javascript")); - tree = ts_parser_parse_string(parser, nullptr, string, strlen(string)); - TSNode root = ts_tree_root_node(tree); - AssertThat(ts_node_end_byte(root), Equals(strlen(string))); - }); - }); - - describe("handling TSInputs", [&]() { - SpyInput *spy_input; - - before_each([&]() { - spy_input = new SpyInput("{\"key\": [null, 2]}", 3); - ts_parser_set_language(parser, load_real_language("json")); - }); - - after_each([&]() { - delete spy_input; - }); - - it("handles UTF16 encodings", [&]() { - const char16_t content[] = u"[true, false]"; - spy_input->content = string((const char *)content, sizeof(content)); - spy_input->encoding = TSInputEncodingUTF16; - - tree = ts_parser_parse(parser, nullptr, spy_input->input()); - root = ts_tree_root_node(tree); - assert_root_node( - "(value (array (true) (false)))"); - }); - - it("handles truncated UTF16 data", [&]() { - const char content[1] = { '\0' }; - spy_input->content = string(content, sizeof(content)); - spy_input->encoding = TSInputEncodingUTF16; - - tree = ts_parser_parse(parser, nullptr, spy_input->input()); - }); - - it("measures columns in bytes", [&]() { - const char16_t content[] = u"[true, false]"; - spy_input->content = string((const char *)content, sizeof(content)); - spy_input->encoding = TSInputEncodingUTF16; - - tree = ts_parser_parse(parser, nullptr, spy_input->input()); - root = ts_tree_root_node(tree); - AssertThat(ts_node_end_point(root), Equals({0, 28})); - }); - - it("handles input chunks that end in the middle of multi-byte characters", [&]() { - ts_parser_set_language(parser, load_real_language("c")); - spy_input->content = "A b = {'👍','👍'};"; - spy_input->chars_per_chunk = 4; - - tree = ts_parser_parse(parser, nullptr, spy_input->input()); - root = ts_tree_root_node(tree); - assert_root_node( - "(translation_unit (declaration " - "(type_identifier) " - "(init_declarator " - "(identifier) " - "(initializer_list (char_literal) (char_literal)))))"); - }); - }); - - describe("set_language(language)", [&]() { - string input_string = "{\"key\": [1, 2]}\n"; - - it("uses the given language for future parses", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - tree = ts_parser_parse_string(parser, nullptr, input_string.c_str(), input_string.size()); - - root = ts_tree_root_node(tree); - assert_root_node( - "(value (object (pair (string) (array (number) (number)))))"); - }); - - it("does not allow setting a language with a different version number", [&]() { - TSLanguage language = 
*load_real_language("json"); - AssertThat(ts_language_version(&language), Equals(TREE_SITTER_LANGUAGE_VERSION)); - - language.version++; - AssertThat(ts_language_version(&language), !Equals(TREE_SITTER_LANGUAGE_VERSION)); - - AssertThat(ts_parser_set_language(parser, &language), IsFalse()); - AssertThat(ts_parser_language(parser), Equals(nullptr)); - }); - - it("does nothing when parse is called while the language is null", [&]() { - tree = ts_parser_parse_string(parser, nullptr, "{}", 2); - AssertThat(tree, Equals(nullptr)); - - ts_parser_set_language(parser, nullptr); - tree = ts_parser_parse_string(parser, nullptr, "{}", 2); - AssertThat(tree, Equals(nullptr)); - }); - }); - - describe("set_logger(TSLogger)", [&]() { - SpyLogger *logger; - - before_each([&]() { - logger = new SpyLogger(); - ts_parser_set_language(parser, load_real_language("json")); - }); - - after_each([&]() { - delete logger; - }); - - it("calls the debugger with a message for each parse action", [&]() { - ts_parser_set_logger(parser, logger->logger()); - tree = ts_parser_parse_string(parser, nullptr, "[ 1, 2, 3 ]", 11); - - AssertThat(logger->messages, Contains("new_parse")); - AssertThat(logger->messages, Contains("skip character:' '")); - AssertThat(logger->messages, Contains("consume character:'['")); - AssertThat(logger->messages, Contains("consume character:'1'")); - AssertThat(logger->messages, Contains("reduce sym:array, child_count:4")); - AssertThat(logger->messages, Contains("accept")); - }); - - it("allows the debugger to be retrieved later", [&]() { - ts_parser_set_logger(parser, logger->logger()); - AssertThat(ts_parser_logger(parser).payload, Equals(logger)); - }); - - describe("disabling debugging", [&]() { - before_each([&]() { - ts_parser_set_logger(parser, logger->logger()); - ts_parser_set_logger(parser, {NULL, NULL}); - }); - - it("does not call the debugger any more", [&]() { - tree = ts_parser_parse_string(parser, nullptr, "{}", 2); - AssertThat(logger->messages, IsEmpty()); - }); - }); - }); - - describe("set_enabled(enabled)", [&]() { - it("stops the in-progress parse if false is passed", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - AssertThat(ts_parser_enabled(parser), IsTrue()); - - auto tree_future = std::async([parser]() { - size_t read_count = 0; - TSInput infinite_input = { - &read_count, - [](void *payload, uint32_t byte, TSPoint position, uint32_t *bytes_read) { - size_t *read_count = static_cast(payload); - assert((*read_count)++ < 100000); - *bytes_read = 1; - return "["; - }, - TSInputEncodingUTF8 - }; - - return ts_parser_parse(parser, nullptr, infinite_input); - }); - - auto cancel_future = std::async([parser]() { - ts_parser_set_enabled(parser, false); - }); - - cancel_future.wait(); - tree_future.wait(); - AssertThat(ts_parser_enabled(parser), IsFalse()); - AssertThat(tree_future.get(), Equals(nullptr)); - - TSTree *tree = ts_parser_parse_string(parser, nullptr, "[]", 2); - AssertThat(ts_parser_enabled(parser), IsFalse()); - AssertThat(tree, Equals(nullptr)); - - ts_parser_set_enabled(parser, true); - AssertThat(ts_parser_enabled(parser), IsTrue()); - tree = ts_parser_parse_string(parser, nullptr, "[]", 2); - AssertThat(tree, !Equals(nullptr)); - ts_tree_delete(tree); - }); - }); - - describe("set_operation_limit(limit)", [&]() { - it("limits the amount of work the parser does on any given call to parse()", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - - struct InputState { - const char *string; - size_t read_count; - }; - - 
InputState state = {"[", 0}; - - // An input that repeats the given string forever, counting how many times - // it has been read. - TSInput infinite_input = { - &state, - [](void *payload, uint32_t byte, TSPoint position, uint32_t *bytes_read) { - InputState *state = static_cast(payload); - assert(state->read_count++ <= 11); - *bytes_read = strlen(state->string); - return state->string; - }, - TSInputEncodingUTF8 - }; - - ts_parser_set_operation_limit(parser, 10); - TSTree *tree = ts_parser_parse(parser, nullptr, infinite_input); - AssertThat(tree, Equals(nullptr)); - - state.read_count = 0; - state.string = ""; - - tree = ts_parser_parse(parser, nullptr, infinite_input); - AssertThat(tree, !Equals(nullptr)); - ts_tree_delete(tree); - }); - - it("retains the old tree even if the parser halts before finishing parsing", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - - SpyInput input("[1234, 5, 6, 4, 5]", 3); - tree = ts_parser_parse(parser, nullptr, input.input()); - assert_root_node("(value (array (number) (number) (number) (number) (number)))"); - - input.clear(); - TSInputEdit edit = input.replace(1, 4, "null"); - ts_tree_edit(tree, &edit); - - ts_parser_set_operation_limit(parser, 1); - TSTree *new_tree = ts_parser_parse(parser, tree, input.input()); - AssertThat(new_tree, Equals(nullptr)); - - ts_tree_delete(tree); - ts_parser_set_operation_limit(parser, SIZE_MAX); - tree = ts_parser_parse(parser, nullptr, input.input()); - assert_root_node("(value (array (null) (number) (number) (number) (number)))"); - - AssertThat(input.strings_read(), Equals(vector({ - "[null,", - }))); - }); - - it("does not leak the old tree if parsing halts and never finishes", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - - SpyInput input("[1234, 5, 6, 4, 5]", 3); - tree = ts_parser_parse(parser, nullptr, input.input()); - assert_root_node("(value (array (number) (number) (number) (number) (number)))"); - - input.clear(); - TSInputEdit edit = input.replace(1, 4, "null"); - ts_tree_edit(tree, &edit); - - ts_parser_set_operation_limit(parser, 1); - TSTree *new_tree = ts_parser_parse(parser, tree, input.input()); - AssertThat(new_tree, Equals(nullptr)); - }); - }); - - describe("reset()", [&]() { - it("causes the parser to parse from scratch on the next call to parse, instead of resuming", [&]() { - ts_parser_set_language(parser, load_real_language("json")); - - ts_parser_set_operation_limit(parser, 3); - tree = ts_parser_parse_string(parser, nullptr, "[1234, 5, 6, 4, 5]", 18); - AssertThat(tree, Equals(nullptr)); - - // Without calling reset, the parser continues from where it left off, so - // it does not see the changes to the beginning of the source code. - ts_parser_set_operation_limit(parser, SIZE_MAX); - tree = ts_parser_parse_string(parser, nullptr, "[null, 5, 6, 4, 5]", 18); - assert_root_node("(value (array (number) (number) (number) (number) (number)))"); - ts_tree_delete(tree); - - ts_parser_set_operation_limit(parser, 3); - tree = ts_parser_parse_string(parser, nullptr, "[1234, 5, 6, 4, 5]", 18); - AssertThat(tree, Equals(nullptr)); - - // By calling reset, we force the parser to start over from scratch so - // that it sees the changes to the beginning of the source code. 
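The operation-limit and `reset()` tests document a cooperative-cancellation scheme: `ts_parser_parse` returns `NULL` when the budget set by `ts_parser_set_operation_limit` runs out, and the *next* call resumes where it stopped unless `ts_parser_reset` is called first. In outline (a sketch against this era's API and public header; later releases replaced the operation count with `ts_parser_set_timeout_micros` and a cancellation flag, so this is not the current interface):

```c
#include <stdint.h>
#include <string.h>
#include <tree_sitter/runtime.h>  // assumed: the public header at this time

// Sketch of the resume-vs-reset contract exercised above.
TSTree *parse_with_budget(TSParser *parser, const char *source) {
  ts_parser_set_operation_limit(parser, 1000);  // arbitrary example budget
  TSTree *tree = ts_parser_parse_string(parser, NULL, source, strlen(source));

  if (!tree) {
    // The budget ran out. Either resume with a bigger budget...
    ts_parser_set_operation_limit(parser, SIZE_MAX);
    tree = ts_parser_parse_string(parser, NULL, source, strlen(source));
    // ...or call ts_parser_reset(parser) first if the source has changed,
    // so the parser starts from scratch instead of resuming mid-parse.
  }
  return tree;
}
```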
- ts_parser_set_operation_limit(parser, SIZE_MAX); - ts_parser_reset(parser); - tree = ts_parser_parse_string(parser, nullptr, "[null, 5, 6, 4, 5]", 18); - assert_root_node("(value (array (null) (number) (number) (number) (number)))"); - }); - }); - - describe("set_included_ranges()", [&]() { - it("can parse code within a single range of a document", [&]() { - string source_code = "hi"; - - ts_parser_set_language(parser, load_real_language("html")); - TSTree *html_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - TSNode script_content_node = ts_node_child( - ts_node_child(ts_tree_root_node(html_tree), 1), - 1 - ); - AssertThat(ts_node_type(script_content_node), Equals("raw_text")); - TSRange included_range = { - ts_node_start_point(script_content_node), - ts_node_end_point(script_content_node), - ts_node_start_byte(script_content_node), - ts_node_end_byte(script_content_node), - }; - ts_tree_delete(html_tree); - - ts_parser_set_included_ranges(parser, &included_range, 1); - ts_parser_set_language(parser, load_real_language("javascript")); - tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - - assert_root_node("(program (expression_statement (call_expression " - "(member_expression (identifier) (property_identifier)) " - "(arguments (string)))))"); - - AssertThat( - ts_node_start_point(ts_tree_root_node(tree)), - Equals({0, static_cast(source_code.find("console"))}) - ); - }); - - it("can parse code spread across multiple ranges in a document", [&]() { - string source_code = - "html `
<div>Hello, ${name.toUpperCase()}, it's <b>${now()}</b>.</div>
`"; - - ts_parser_set_language(parser, load_real_language("javascript")); - TSTree *js_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - TSNode root_node = ts_tree_root_node(js_tree); - TSNode string_node = ts_node_descendant_for_byte_range( - root_node, - source_code.find("<div>
"), - source_code.find("Hell") - ); - TSNode open_quote_node = ts_node_child(string_node, 0); - TSNode interpolation_node1 = ts_node_child(string_node, 1); - TSNode interpolation_node2 = ts_node_child(string_node, 2); - TSNode close_quote_node = ts_node_child(string_node, 3); - - AssertThat(ts_node_type(string_node), Equals("template_string")); - AssertThat(ts_node_type(open_quote_node), Equals("`")); - AssertThat(ts_node_type(interpolation_node1), Equals("template_substitution")); - AssertThat(ts_node_type(interpolation_node2), Equals("template_substitution")); - AssertThat(ts_node_type(close_quote_node), Equals("`")); - - TSRange included_ranges[] = { - { - ts_node_end_point(open_quote_node), - ts_node_start_point(interpolation_node1), - ts_node_end_byte(open_quote_node), - ts_node_start_byte(interpolation_node1), - }, - { - ts_node_end_point(interpolation_node1), - ts_node_start_point(interpolation_node2), - ts_node_end_byte(interpolation_node1), - ts_node_start_byte(interpolation_node2), - }, - { - ts_node_end_point(interpolation_node2), - ts_node_start_point(close_quote_node), - ts_node_end_byte(interpolation_node2), - ts_node_start_byte(close_quote_node), - } - }; - - ts_parser_set_included_ranges(parser, included_ranges, 3); - ts_tree_delete(js_tree); - ts_parser_set_language(parser, load_real_language("html")); - tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - - assert_root_node("(fragment " - "(element " - "(start_tag (tag_name)) " - "(text) " - "(element " - "(start_tag (tag_name)) " - "(end_tag (tag_name))) " - "(text) " - "(end_tag (tag_name))))"); - - root_node = ts_tree_root_node(tree); - TSNode div_element_node = ts_node_child(root_node, 0); - - TSNode hello_text_node = ts_node_child(div_element_node, 1); - AssertThat(ts_node_type(hello_text_node), Equals("text")); - AssertThat( - ts_node_start_point(hello_text_node), - Equals({0, static_cast(source_code.find("Hello"))}) - ); - AssertThat( - ts_node_end_point(hello_text_node), - Equals({0, static_cast(source_code.find(""))}) - ); - - TSNode b_start_tag_node = ts_node_child(ts_node_child(div_element_node, 2), 0); - AssertThat(ts_node_type(b_start_tag_node), Equals("start_tag")); - AssertThat( - ts_node_start_point(b_start_tag_node), - Equals({0, static_cast(source_code.find(""))}) - ); - AssertThat( - ts_node_end_point(b_start_tag_node), - Equals({0, static_cast(source_code.find("${now()}"))}) - ); - - TSNode b_end_tag_node = ts_node_child(ts_node_child(div_element_node, 2), 1); - AssertThat(ts_node_type(b_end_tag_node), Equals("end_tag")); - AssertThat( - ts_node_start_point(b_end_tag_node), - Equals({0, static_cast(source_code.find(""))}) - ); - AssertThat( - ts_node_end_point(b_end_tag_node), - Equals({0, static_cast(source_code.find(".
"))}) - ); - }); - - it("can handle errors at the ends of the nested UTF16 documents (regression)", [&]() { - u16string source_code = u""; - - TSRange included_range = { - {0, static_cast(2u * source_code.find(u"a."))}, - {0, static_cast(2u * source_code.find(u"(source_code.find(u"a.")), - 2u * static_cast(source_code.find(u"({0, 2})); - AssertThat(ts_node_start_point(ts_node_child(root, 3)), Equals({0, 4})); - }); - - it("allows external scanners to detect the boundaries of included ranges", [&]() { - string source_code = "a <%= b() %> c <% d() %>"; - - TSRange included_ranges[] = { - range_for_substring(source_code, "b()"), - range_for_substring(source_code, "d()"), - }; - - ts_parser_set_included_ranges(parser, included_ranges, 2); - ts_parser_set_language(parser, load_real_language("javascript")); - tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - - assert_root_node("(program " - "(expression_statement (call_expression (identifier) (arguments))) " - "(expression_statement (call_expression (identifier) (arguments))))"); - - TSNode statement_node1 = ts_node_child(ts_tree_root_node(tree), 0); - TSNode statement_node2 = ts_node_child(ts_tree_root_node(tree), 1); - - AssertThat(ts_node_end_point(statement_node1), Equals(extent_for_string("a <%= b()"))); - AssertThat(ts_node_end_point(statement_node2), Equals(extent_for_string("a <%= b() %> c <% d()"))); - }); - - it("handles syntax changes in ranges that were included but are now excluded", [&]() { - string source_code = "
<div><span><%= something %></span></div>
"; - - // Parse HTML including the template directive, which will cause an error - ts_parser_set_language(parser, load_real_language("html")); - TSTree *first_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - - // Insert code at the beginning of the document. - string prefix = "a very very long line of plain text. "; - unsigned prefix_length = prefix.size(); - TSInputEdit edit = { - 0, 0, prefix_length, - {0, 0}, {0, 0}, {0, prefix_length} - }; - ts_tree_edit(first_tree, &edit); - source_code = prefix + source_code; - - // Parse the HTML again, this time *excluding* the template directive - // (which has moved since the previous parse). - unsigned directive_start = source_code.find("<%="); - unsigned directive_end = source_code.find(""); - unsigned source_code_end = source_code.size(); - - TSRange included_ranges[] = { - { - {0, 0}, - {0, directive_start}, - 0, - directive_start - }, - { - {0, directive_end}, - {0, source_code_end}, - directive_end, - source_code_end - } - }; - - ts_parser_set_included_ranges(parser, included_ranges, 2); - tree = ts_parser_parse_string(parser, first_tree, source_code.c_str(), source_code.size()); - - // The element node (which contained an error) should not be reused, - // because it contains a range which is now excluded. - assert_root_node("(fragment " - "(text) " - "(element " - "(start_tag (tag_name)) " - "(element " - "(start_tag (tag_name)) " - "(end_tag (tag_name))) " - "(end_tag (tag_name))))"); - - unsigned range_count; - const TSRange *ranges = ts_tree_get_changed_ranges(first_tree, tree, &range_count); - - // The first range that's changed syntax is the range of the - // newly-inserted text. - AssertThat(range_count, Equals(2u)); - AssertThat(ranges[0], Equals({ - {0, 0}, {0, prefix_length}, - 0, prefix_length, - })); - - // Even though no edits were applied to the outer `div` element, - // its contents have changed syntax because a range of text that - // was previously included is now excluded. - AssertThat(ranges[1], Equals({ - {0, directive_start}, {0, directive_end}, - directive_start, directive_end, - })); - - ts_free((void *)ranges); - ts_tree_delete(first_tree); - }); - - it("handles syntax changes in ranges that were excluded but are now included", [&]() { - ts_parser_set_language(parser, load_real_language("javascript")); - - string source_code = "
<%= foo() %>
<%= bar() %>"; - - unsigned first_code_start_index = source_code.find(" foo"); - unsigned first_code_end_index = first_code_start_index + 7; - unsigned second_code_start_index = source_code.find(" bar"); - unsigned second_code_end_index = second_code_start_index + 7; - - TSRange included_ranges[] = { - { - {0, first_code_start_index}, - {0, first_code_end_index}, - first_code_start_index, - first_code_end_index - }, - { - {0, second_code_start_index}, - {0, second_code_end_index}, - second_code_start_index, - second_code_end_index - }, - }; - - // Parse only the first code directive as JavaScript - ts_parser_set_included_ranges(parser, included_ranges, 1); - TSTree *first_tree = ts_parser_parse_string(parser, nullptr, source_code.c_str(), source_code.size()); - - // Parse both the code directives as JavaScript, using the old tree as a reference. - ts_parser_set_included_ranges(parser, included_ranges, 2); - tree = ts_parser_parse_string(parser, first_tree, source_code.c_str(), source_code.size()); - - assert_root_node("(program " - "(expression_statement (call_expression (identifier) (arguments))) " - "(expression_statement (call_expression (identifier) (arguments))))"); - - unsigned range_count; - const TSRange *ranges = ts_tree_get_changed_ranges(first_tree, tree, &range_count); - AssertThat(range_count, Equals(1u)); - AssertThat(ranges[0], Equals({ - {0, first_code_end_index + 1}, {0, second_code_end_index + 1}, - first_code_end_index + 1, second_code_end_index + 1, - })); - - ts_free((void *)ranges); - ts_tree_delete(first_tree); - }); - }); - - describe("ts_range_array_get_changed_ranges()", [&]() { - auto get_changed_ranges = [&]( - const vector &old_ranges, - const vector &new_ranges - ) { - TSRangeArray result = array_new(); - ts_range_array_get_changed_ranges( - old_ranges.data(), old_ranges.size(), - new_ranges.data(), new_ranges.size(), - &result - ); - vector result_vector; - for (unsigned i = 0; i < result.size; i++) { - result_vector.push_back(result.contents[i]); - } - array_delete(&result); - return result_vector; - }; - - auto range = [&](unsigned start, unsigned end) { - TSRange result; - result.start_byte = start; - result.end_byte = end; - result.start_point = {0, start}; - if (end == UINT32_MAX) { - result.end_point = {UINT32_MAX, UINT32_MAX}; - } else { - result.end_point = {0, end}; - } - return result; - }; - - it("returns an array of ranges that are newly included excluded", [&]() { - AssertThat(get_changed_ranges( - { - range(0, UINT32_MAX), - }, - { - range(0, 5), - range(8, UINT32_MAX), - } - ), Equals>( - { - range(5, 8) - } - )); - - AssertThat(get_changed_ranges( - { - range(0, 3), - range(7, 10), - range(13, 30), - }, - { - range(0, 4), - range(8, 11), - range(14, 30), - } - ), Equals>( - { - range(3, 4), - range(7, 8), - range(10, 11), - range(13, 14), - } - )); - - AssertThat(get_changed_ranges( - { - range(0, UINT32_MAX), - }, - { - range(0, 4), - range(5, 64), - } - ), Equals>( - { - range(4, 5), - range(64, UINT32_MAX), - } - )); - }); - }); -}); - -END_TEST diff --git a/test/runtime/stack_test.cc b/test/runtime/stack_test.cc deleted file mode 100644 index aa710237..00000000 --- a/test/runtime/stack_test.cc +++ /dev/null @@ -1,612 +0,0 @@ -#include "test_helper.h" -#include "helpers/tree_helpers.h" -#include "helpers/point_helpers.h" -#include "helpers/record_alloc.h" -#include "helpers/stream_methods.h" -#include "runtime/stack.h" -#include "runtime/subtree.h" -#include "runtime/length.h" -#include "runtime/alloc.h" - -enum { - stateA = 2, - stateB, - 
stateC, stateD, stateE, stateF, stateG, stateH, stateI, stateJ -}; - -enum { - symbol0, symbol1, symbol2, symbol3, symbol4, symbol5, symbol6, symbol7, symbol8, - symbol9, symbol10 -}; - -void free_slice_array(SubtreePool *pool, StackSliceArray *slices) { - for (size_t i = 0; i < slices->size; i++) { - StackSlice slice = slices->contents[i]; - - bool matches_prior_trees = false; - for (size_t j = 0; j < i; j++) { - StackSlice prior_slice = slices->contents[j]; - if (slice.subtrees.contents == prior_slice.subtrees.contents) { - matches_prior_trees = true; - break; - } - } - - if (!matches_prior_trees) { - for (size_t j = 0; j < slice.subtrees.size; j++) - ts_subtree_release(pool, slice.subtrees.contents[j]); - array_delete(&slice.subtrees); - } - } -} - -SubtreeHeapData *mutate(Subtree subtree) { - return ts_subtree_to_mut_unsafe(subtree).ptr; -} - -struct StackEntry { - TSStateId state; - size_t depth; -}; - -vector get_stack_entries(Stack *stack, StackVersion version) { - vector result; - ts_stack_iterate( - stack, - version, - [](void *payload, TSStateId state, uint32_t subtree_count) { - auto entries = static_cast *>(payload); - StackEntry entry = {state, subtree_count}; - if (find(entries->begin(), entries->end(), entry) == entries->end()) { - entries->push_back(entry); - } - }, &result); - return result; -} - -START_TEST - -describe("Stack", [&]() { - Stack *stack; - const size_t subtree_count = 11; - Subtree subtrees[subtree_count]; - Length tree_len = {3, {0, 3}}; - SubtreePool pool; - - before_each([&]() { - record_alloc::start(); - - pool = ts_subtree_pool_new(10); - stack = ts_stack_new(&pool); - - TSLanguage dummy_language; - TSSymbolMetadata symbol_metadata[50] = {}; - dummy_language.symbol_metadata = symbol_metadata; - - for (size_t i = 0; i < subtree_count; i++) { - subtrees[i] = ts_subtree_new_leaf( - &pool, i + 1, length_zero(), tree_len, 0, - TS_TREE_STATE_NONE, true, false, &dummy_language - ); - ts_external_scanner_state_init(&mutate(subtrees[i])->external_scanner_state, nullptr, 0); - } - }); - - after_each([&]() { - ts_stack_delete(stack); - for (size_t i = 0; i < subtree_count; i++) { - ts_subtree_release(&pool, subtrees[i]); - } - ts_subtree_pool_delete(&pool); - - record_alloc::stop(); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - auto push = [&](StackVersion version, Subtree tree, TSStateId state) { - ts_subtree_retain(tree); - ts_stack_push(stack, version, tree, false, state); - }; - - describe("push(version, tree, is_pending, state)", [&]() { - it("adds entries to the given version of the stack", [&]() { - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(ts_stack_state(stack, 0), Equals(1)); - AssertThat(ts_stack_position(stack, 0), Equals(length_zero())); - - // . <──0── A* - push(0, subtrees[0], stateA); - AssertThat(ts_stack_state(stack, 0), Equals(stateA)); - AssertThat(ts_stack_position(stack, 0), Equals(tree_len)); - - // . <──0── A <──1── B* - push(0, subtrees[1], stateB); - AssertThat(ts_stack_state(stack, 0), Equals(stateB)); - AssertThat(ts_stack_position(stack, 0), Equals(tree_len * 2)); - - // . <──0── A <──1── B <──2── C* - push(0, subtrees[2], stateC); - AssertThat(ts_stack_state(stack, 0), Equals(stateC)); - AssertThat(ts_stack_position(stack, 0), Equals(tree_len * 3)); - - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateC, 0}, - {stateB, 1}, - {stateA, 2}, - {1, 3}, - }))); - }); - }); - - describe("merge()", [&]() { - before_each([&]() { - // . 
<──0── A <─* - // ↑ - // └───* - push(0, subtrees[0], stateA); - ts_stack_copy_version(stack, 0); - }); - - it("combines versions that have the same top states and positions", [&]() { - // . <──0── A <──1── B <──3── D* - // ↑ - // └───2─── C <──4── D* - push(0, subtrees[1], stateB); - push(1, subtrees[2], stateC); - push(0, subtrees[3], stateD); - push(1, subtrees[4], stateD); - - // . <──0── A <──1── B <──3── D* - // ↑ | - // └───2─── C <──4───┘ - AssertThat(ts_stack_merge(stack, 0, 1), IsTrue()); - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateD, 0}, - {stateB, 1}, - {stateC, 1}, - {stateA, 2}, - {1, 3}, - }))); - }); - - it("does not combine versions that have different states", [&]() { - // . <──0── A <──1── B* - // ↑ - // └───2─── C* - push(0, subtrees[1], stateB); - push(1, subtrees[2], stateC); - - AssertThat(ts_stack_merge(stack, 0, 1), IsFalse()); - AssertThat(ts_stack_version_count(stack), Equals(2)); - }); - - it("does not combine versions that have different positions", [&]() { - // . <──0── A <──1── B <────3──── D* - // ↑ - // └───2─── C <──4── D* - mutate(subtrees[3])->size = tree_len * 3; - push(0, subtrees[1], stateB); - push(1, subtrees[2], stateC); - push(0, subtrees[3], stateD); - push(1, subtrees[4], stateD); - - AssertThat(ts_stack_merge(stack, 0, 1), IsFalse()); - AssertThat(ts_stack_version_count(stack), Equals(2)); - }); - - describe("when the merged versions have more than one common entry", [&]() { - it("combines all of the top common entries", [&]() { - // . <──0── A <──1── B <──3── D <──5── E* - // ↑ - // └───2─── C <──4── D <──5── E* - push(0, subtrees[1], stateB); - push(1, subtrees[2], stateC); - push(0, subtrees[3], stateD); - push(1, subtrees[4], stateD); - push(0, subtrees[5], stateE); - push(1, subtrees[5], stateE); - - // . <──0── A <──1── B <──3── D <──5── E* - // ↑ | - // └───2─── C <──4───┘ - AssertThat(ts_stack_merge(stack, 0, 1), IsTrue()); - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateE, 0}, - {stateD, 1}, - {stateB, 2}, - {stateC, 2}, - {stateA, 3}, - {1, 4}, - }))); - }); - }); - - describe("when one of the versions contains an extra (e.g. ERROR) tree of size zero", [&]() { - it("does not create a loop in the stack", [&]() { - // . <──0── A <────1──── B* - // ↑ - // └2─ A <──1── B* - mutate(subtrees[2])->extra = true; - mutate(subtrees[2])->size = tree_len * 0; - - push(0, subtrees[1], stateB); - push(1, subtrees[2], stateA); - push(1, subtrees[1], stateB); - - // . <──0── A <──1── B* - AssertThat(ts_stack_merge(stack, 0, 1), IsTrue()); - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateB, 0}, - {stateA, 1}, - {1, 2}, - }))); - }); - }); - }); - - describe("pop_count(version, count)", [&]() { - before_each([&]() { - // . <──0── A <──1── B <──2── C* - push(0, subtrees[0], stateA); - push(0, subtrees[1], stateB); - push(0, subtrees[2], stateC); - }); - - it("creates a new version with the given number of entries removed", [&]() { - // . 
<──0── A <──1── B <──2── C* - // ↑ - // └─* - StackSliceArray pop = ts_stack_pop_count(stack, 0, 2); - AssertThat(pop.size, Equals(1)); - AssertThat(ts_stack_version_count(stack), Equals(2)); - - StackSlice slice = pop.contents[0]; - AssertThat(slice.version, Equals(1)); - AssertThat(slice.subtrees, Equals(vector({ subtrees[1], subtrees[2] }))); - AssertThat(ts_stack_state(stack, 1), Equals(stateA)); - - free_slice_array(&pool,&pop); - }); - - it("does not count 'extra' subtrees toward the given count", [&]() { - mutate(subtrees[1])->extra = true; - - // . <──0── A <──1── B <──2── C* - // ↑ - // └─* - StackSliceArray pop = ts_stack_pop_count(stack, 0, 2); - AssertThat(pop.size, Equals(1)); - - StackSlice slice = pop.contents[0]; - AssertThat(slice.subtrees, Equals(vector({ subtrees[0], subtrees[1], subtrees[2] }))); - AssertThat(ts_stack_state(stack, 1), Equals(1)); - - free_slice_array(&pool,&pop); - }); - - describe("when the version has been merged", [&]() { - before_each([&]() { - // . <──0── A <──1── B <──2── C <──3── D <──10── I* - // ↑ | - // └───4─── E <──5── F <──6───┘ - push(0, subtrees[3], stateD); - StackSliceArray pop = ts_stack_pop_count(stack, 0, 3); - free_slice_array(&pool,&pop); - push(1, subtrees[4], stateE); - push(1, subtrees[5], stateF); - push(1, subtrees[6], stateD); - ts_stack_merge(stack, 0, 1); - push(0, subtrees[10], stateI); - - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateI, 0}, - {stateD, 1}, - {stateC, 2}, - {stateF, 2}, - {stateB, 3}, - {stateE, 3}, - {stateA, 4}, - {1, 5}, - }))); - }); - - describe("when there are two paths that reveal different versions", [&]() { - it("returns an entry for each revealed version", [&]() { - // . <──0── A <──1── B <──2── C <──3── D <──10── I* - // ↑ ↑ - // | └* - // | - // └───4─── E* - StackSliceArray pop = ts_stack_pop_count(stack, 0, 3); - AssertThat(pop.size, Equals(2)); - - StackSlice slice1 = pop.contents[0]; - AssertThat(slice1.version, Equals(1)); - AssertThat(slice1.subtrees, Equals(vector({ subtrees[2], subtrees[3], subtrees[10] }))); - - StackSlice slice2 = pop.contents[1]; - AssertThat(slice2.version, Equals(2)); - AssertThat(slice2.subtrees, Equals(vector({ subtrees[5], subtrees[6], subtrees[10] }))); - - AssertThat(ts_stack_version_count(stack), Equals(3)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateI, 0}, - {stateD, 1}, - {stateC, 2}, - {stateF, 2}, - {stateB, 3}, - {stateE, 3}, - {stateA, 4}, - {1, 5}, - }))); - AssertThat(get_stack_entries(stack, 1), Equals(vector({ - {stateB, 0}, - {stateA, 1}, - {1, 2}, - }))); - AssertThat(get_stack_entries(stack, 2), Equals(vector({ - {stateE, 0}, - {stateA, 1}, - {1, 2}, - }))); - - free_slice_array(&pool,&pop); - }); - }); - - describe("when there is one path that ends at a merged version", [&]() { - it("returns a single entry", [&]() { - // . 
<──0── A <──1── B <──2── C <──3── D <──10── I* - // | | - // └───5─── F <──6── G <──7───┘ - // | - // └* - StackSliceArray pop = ts_stack_pop_count(stack, 0, 1); - AssertThat(pop.size, Equals(1)); - - StackSlice slice1 = pop.contents[0]; - AssertThat(slice1.version, Equals(1)); - AssertThat(slice1.subtrees, Equals(vector({ subtrees[10] }))); - - AssertThat(ts_stack_version_count(stack), Equals(2)); - AssertThat(ts_stack_state(stack, 0), Equals(stateI)); - AssertThat(ts_stack_state(stack, 1), Equals(stateD)); - - free_slice_array(&pool,&pop); - }); - }); - - describe("when there are two paths that converge on one version", [&]() { - it("returns two slices with the same version", [&]() { - // . <──0── A <──1── B <──2── C <──3── D <──10── I* - // ↑ | - // ├───4─── E <──5── F <──6───┘ - // | - // └* - StackSliceArray pop = ts_stack_pop_count(stack, 0, 4); - AssertThat(pop.size, Equals(2)); - - StackSlice slice1 = pop.contents[0]; - AssertThat(slice1.version, Equals(1)); - AssertThat(slice1.subtrees, Equals(vector({ subtrees[1], subtrees[2], subtrees[3], subtrees[10] }))); - - StackSlice slice2 = pop.contents[1]; - AssertThat(slice2.version, Equals(1)); - AssertThat(slice2.subtrees, Equals(vector({ subtrees[4], subtrees[5], subtrees[6], subtrees[10] }))); - - AssertThat(ts_stack_version_count(stack), Equals(2)); - AssertThat(ts_stack_state(stack, 0), Equals(stateI)); - AssertThat(ts_stack_state(stack, 1), Equals(stateA)); - - free_slice_array(&pool,&pop); - }); - }); - - describe("when there are three paths that lead to three different versions", [&]() { - it("returns three entries with different arrays of subtrees", [&]() { - // . <──0── A <──1── B <──2── C <──3── D <──10── I* - // ↑ | - // ├───4─── E <──5── F <──6───┘ - // | | - // └───7─── G <──8── H <──9───┘ - StackSliceArray pop = ts_stack_pop_count(stack, 0, 4); - free_slice_array(&pool,&pop); - push(1, subtrees[7], stateG); - push(1, subtrees[8], stateH); - push(1, subtrees[9], stateD); - push(1, subtrees[10], stateI); - ts_stack_merge(stack, 0, 1); - - AssertThat(ts_stack_version_count(stack), Equals(1)); - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateI, 0}, - {stateD, 1}, - {stateC, 2}, - {stateF, 2}, - {stateH, 2}, - {stateB, 3}, - {stateE, 3}, - {stateG, 3}, - {stateA, 4}, - {1, 5}, - }))); - - // . 
<──0── A <──1── B <──2── C <──3── D <──10── I* - // ↑ ↑ - // | └* - // | - // ├───4─── E <──5── F* - // | - // └───7─── G <──8── H* - pop = ts_stack_pop_count(stack, 0, 2); - AssertThat(pop.size, Equals(3)); - - StackSlice slice1 = pop.contents[0]; - AssertThat(slice1.version, Equals(1)); - AssertThat(slice1.subtrees, Equals(vector({ subtrees[3], subtrees[10] }))); - - StackSlice slice2 = pop.contents[1]; - AssertThat(slice2.version, Equals(2)); - AssertThat(slice2.subtrees, Equals(vector({ subtrees[6], subtrees[10] }))); - - StackSlice slice3 = pop.contents[2]; - AssertThat(slice3.version, Equals(3)); - AssertThat(slice3.subtrees, Equals(vector({ subtrees[9], subtrees[10] }))); - - AssertThat(ts_stack_version_count(stack), Equals(4)); - AssertThat(ts_stack_state(stack, 0), Equals(stateI)); - AssertThat(ts_stack_state(stack, 1), Equals(stateC)); - AssertThat(ts_stack_state(stack, 2), Equals(stateF)); - AssertThat(ts_stack_state(stack, 3), Equals(stateH)); - - free_slice_array(&pool,&pop); - }); - }); - }); - }); - - describe("pop_pending(version)", [&]() { - before_each([&]() { - push(0, subtrees[0], stateA); - }); - - it("removes the top node from the stack if it was pushed in pending mode", [&]() { - ts_stack_push(stack, 0, subtrees[1], true, stateB); - ts_subtree_retain(subtrees[1]); - - StackSliceArray pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.size, Equals(1)); - - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateA, 0}, - {1, 1}, - }))); - - free_slice_array(&pool,&pop); - }); - - it("skips entries whose subtrees are extra", [&]() { - ts_stack_push(stack, 0, subtrees[1], true, stateB); - ts_subtree_retain(subtrees[1]); - - mutate(subtrees[2])->extra = true; - mutate(subtrees[3])->extra = true; - - push(0, subtrees[2], stateB); - push(0, subtrees[3], stateB); - - StackSliceArray pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.size, Equals(1)); - - AssertThat(pop.contents[0].subtrees, Equals(vector({ subtrees[1], subtrees[2], subtrees[3] }))); - - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateA, 0}, - {1, 1}, - }))); - - free_slice_array(&pool,&pop); - }); - - it("does nothing if the top node was not pushed in pending mode", [&]() { - push(0, subtrees[1], stateB); - - StackSliceArray pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.size, Equals(0)); - - AssertThat(get_stack_entries(stack, 0), Equals(vector({ - {stateB, 0}, - {stateA, 1}, - {1, 2}, - }))); - - free_slice_array(&pool,&pop); - }); - }); - - describe("setting external token state", [&]() { - before_each([&]() { - mutate(subtrees[1])->has_external_tokens = true; - mutate(subtrees[2])->has_external_tokens = true; - ts_external_scanner_state_init(&mutate(subtrees[1])->external_scanner_state, NULL, 0); - ts_external_scanner_state_init(&mutate(subtrees[2])->external_scanner_state, NULL, 0); - }); - - it("allows the state to be retrieved", [&]() { - AssertThat(ts_stack_last_external_token(stack, 0).ptr, Equals(nullptr)); - - ts_stack_set_last_external_token(stack, 0, subtrees[1]); - AssertThat(ts_stack_last_external_token(stack, 0).ptr, Equals(subtrees[1].ptr)); - - ts_stack_copy_version(stack, 0); - AssertThat(ts_stack_last_external_token(stack, 1).ptr, Equals(subtrees[1].ptr)); - - ts_stack_set_last_external_token(stack, 0, subtrees[2]); - AssertThat(ts_stack_last_external_token(stack, 0).ptr, Equals(subtrees[2].ptr)); - }); - - it("does not merge stack versions with different external token states", [&]() { - 
ts_external_scanner_state_init(&mutate(subtrees[1])->external_scanner_state, "abcd", 2); - ts_external_scanner_state_init(&mutate(subtrees[2])->external_scanner_state, "ABCD", 2); - - ts_stack_copy_version(stack, 0); - push(0, subtrees[0], 5); - push(1, subtrees[0], 5); - - ts_stack_set_last_external_token(stack, 0, subtrees[1]); - ts_stack_set_last_external_token(stack, 1, subtrees[2]); - - AssertThat(ts_stack_merge(stack, 0, 1), IsFalse()); - }); - - it("merges stack versions with identical external token states", [&]() { - ts_external_scanner_state_init(&mutate(subtrees[1])->external_scanner_state, "abcd", 2); - ts_external_scanner_state_init(&mutate(subtrees[2])->external_scanner_state, "abcd", 2); - - ts_stack_copy_version(stack, 0); - push(0, subtrees[0], 5); - push(1, subtrees[0], 5); - - ts_stack_set_last_external_token(stack, 0, subtrees[1]); - ts_stack_set_last_external_token(stack, 1, subtrees[2]); - - AssertThat(ts_stack_merge(stack, 0, 1), IsTrue()); - }); - - it("does not distinguish between an *empty* external token state and *no* external token state", [&]() { - ts_stack_copy_version(stack, 0); - push(0, subtrees[0], 5); - push(1, subtrees[0], 5); - - ts_stack_set_last_external_token(stack, 0, subtrees[1]); - - AssertThat(ts_stack_merge(stack, 0, 1), IsTrue()); - }); - }); -}); - -END_TEST - -bool operator==(const StackEntry &left, const StackEntry &right) { - return left.state == right.state && left.depth == right.depth; -} - -std::ostream &operator<<(std::ostream &stream, const StackEntry &entry) { - return stream << "{" << entry.state << ", " << entry.depth << "}"; -} - -std::ostream &operator<<(std::ostream &stream, const SubtreeArray &array) { - stream << "["; - bool first = true; - for (size_t i = 0; i < array.size; i++) { - if (!first) - stream << ", "; - first = false; - stream << array.contents[i]; - } - return stream << "]"; -} diff --git a/test/runtime/subtree_test.cc b/test/runtime/subtree_test.cc deleted file mode 100644 index 3472aca3..00000000 --- a/test/runtime/subtree_test.cc +++ /dev/null @@ -1,501 +0,0 @@ -#include "test_helper.h" -#include "helpers/tree_helpers.h" -#include "helpers/point_helpers.h" -#include "runtime/subtree.h" -#include "runtime/length.h" - -void assert_consistent(Subtree tree) { - if (ts_subtree_child_count(tree) == 0) return; - AssertThat(tree.ptr->children[0].ptr->padding, Equals(tree.ptr->padding)); - - Length total_children_size = length_zero(); - for (size_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = tree.ptr->children[i]; - assert_consistent(child); - total_children_size = length_add(total_children_size, ts_subtree_total_size(child)); - } - - AssertThat(total_children_size, Equals(ts_subtree_total_size(tree))); -}; - -START_TEST; - -describe("Subtree", []() { - enum { - symbol1 = 1, - symbol2, - symbol3, - symbol4, - symbol5, - symbol6, - symbol7, - symbol8, - symbol9, - }; - - TSSymbolMetadata metadata_list[30] = {}; - - TSLanguage language; - language.symbol_metadata = metadata_list; - - SubtreePool pool; - - before_each([&]() { - pool = ts_subtree_pool_new(10); - }); - - after_each([&]() { - ts_subtree_pool_delete(&pool); - }); - - auto new_leaf = [&](TSSymbol symbol, Length padding, Length size, uint32_t lookahead_bytes) { - return ts_subtree_new_leaf( - &pool, symbol, padding, size, lookahead_bytes, 0, false, false, &language - ); - }; - - auto new_node = [&](TSSymbol symbol, vector children) { - return ts_subtree_from_mut(ts_subtree_new_node( - &pool, symbol, tree_array(children), 0, &language - )); - }; - 
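The subtree tests below manipulate the internal reference-counted `Subtree` representation directly (retain/release, padding and size `Length`s, unsafe mutable casts). From the public API the same trees are only visible through `TSNode` and `TSTreeCursor`, e.g. the depth-first walk that the tree-cursor test at the top of this diff performs, sketched here:

```c
#include <stdio.h>
#include <tree_sitter/api.h>

// Pre-order traversal over every node using the cursor API.
void walk(TSNode root) {
  TSTreeCursor cursor = ts_tree_cursor_new(root);
  bool done = false;
  while (!done) {
    TSNode node = ts_tree_cursor_current_node(&cursor);
    printf("%s [%u, %u)\n", ts_node_type(node),
           ts_node_start_byte(node), ts_node_end_byte(node));
    if (ts_tree_cursor_goto_first_child(&cursor)) continue;
    while (!ts_tree_cursor_goto_next_sibling(&cursor)) {
      if (!ts_tree_cursor_goto_parent(&cursor)) { done = true; break; }
    }
  }
  ts_tree_cursor_delete(&cursor);
}
```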
- describe("new_node", [&]() { - Subtree tree1, tree2; - - before_each([&]() { - tree1 = new_leaf(symbol1, {2, {0, 1}}, {5, {0, 4}}, 0); - tree2 = new_leaf(symbol2, {1, {0, 1}}, {3, {0, 3}}, 0); - }); - - after_each([&]() { - ts_subtree_release(&pool, tree1); - ts_subtree_release(&pool, tree2); - }); - - it("computes its size and padding based on its child nodes", [&]() { - ts_subtree_retain(tree1); - ts_subtree_retain(tree2); - Subtree parent = new_node(symbol3, {tree1, tree2}); - - AssertThat( - ts_subtree_size(parent), - Equals( - ts_subtree_size(tree1) + ts_subtree_padding(tree2) + ts_subtree_size(tree2) - )); - AssertThat(ts_subtree_padding(parent), Equals(ts_subtree_padding(tree1))); - ts_subtree_release(&pool, parent); - }); - - describe("when the first node is fragile on the left side", [&]() { - it("records that it is fragile on the left side", [&]() { - MutableSubtree mutable_tree1 = ts_subtree_to_mut_unsafe(tree1); - mutable_tree1.ptr->fragile_left = true; - mutable_tree1.ptr->extra = true; - - ts_subtree_retain(tree1); - ts_subtree_retain(tree2); - Subtree parent = new_node(symbol3, {tree1, tree2}); - - AssertThat(ts_subtree_fragile_left(parent), IsTrue()); - AssertThat(ts_subtree_fragile_right(parent), IsFalse()); - ts_subtree_release(&pool, parent); - }); - }); - - describe("when the last node is fragile on the right side", [&]() { - it("records that it is fragile on the right side", [&]() { - MutableSubtree mutable_tree2 = ts_subtree_to_mut_unsafe(tree2); - mutable_tree2.ptr->fragile_right = true; - mutable_tree2.ptr->extra = true; - - ts_subtree_retain(tree1); - ts_subtree_retain(tree2); - Subtree parent = new_node(symbol3, {tree1, tree2}); - - AssertThat(ts_subtree_fragile_left(parent), IsFalse()); - AssertThat(ts_subtree_fragile_right(parent), IsTrue()); - ts_subtree_release(&pool, parent); - }); - }); - - describe("when the outer nodes aren't fragile on their outer side", [&]() { - it("records that it is not fragile", [&]() { - MutableSubtree mutable_tree1 = ts_subtree_to_mut_unsafe(tree1); - MutableSubtree mutable_tree2 = ts_subtree_to_mut_unsafe(tree2); - mutable_tree1.ptr->fragile_right = true; - mutable_tree2.ptr->fragile_left = true; - - ts_subtree_retain(tree1); - ts_subtree_retain(tree2); - Subtree parent = new_node(symbol3, {tree1, tree2}); - - AssertThat(ts_subtree_fragile_left(parent), IsFalse()); - AssertThat(ts_subtree_fragile_right(parent), IsFalse()); - ts_subtree_release(&pool, parent); - }); - }); - }); - - describe("edit", [&]() { - Subtree tree; - - before_each([&]() { - tree = new_node(symbol1, { - new_leaf(symbol2, {2, {0, 2}}, {3, {0, 3}}, 0), - new_leaf(symbol3, {2, {0, 2}}, {3, {0, 3}}, 0), - new_leaf(symbol4, {2, {0, 2}}, {3, {0, 3}}, 0), - }); - - AssertThat(tree.ptr->padding, Equals({2, {0, 2}})); - AssertThat(tree.ptr->size, Equals({13, {0, 13}})); - }); - - after_each([&]() { - ts_subtree_release(&pool, tree); - }); - - it("does not mutate the argument", [&]() { - TSInputEdit edit; - edit.start_byte = 1; - edit.old_end_byte = 1; - edit.new_end_byte = 2; - edit.start_point = {0, 1}; - edit.old_end_point = {0, 1}; - edit.new_end_point = {0, 2}; - - ts_subtree_retain(tree); - Subtree new_tree = ts_subtree_edit(tree, &edit, &pool); - assert_consistent(tree); - assert_consistent(new_tree); - - AssertThat(ts_subtree_has_changes(tree), IsFalse()); - AssertThat(ts_subtree_padding(tree), Equals({2, {0, 2}})); - AssertThat(ts_subtree_size(tree), Equals({13, {0, 13}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsFalse()); - 
AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals({2, {0, 2}})); - AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals({3, {0, 3}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[1]), IsFalse()); - AssertThat(ts_subtree_padding(tree.ptr->children[1]), Equals({2, {0, 2}})); - AssertThat(ts_subtree_size(tree.ptr->children[1]), Equals({3, {0, 3}})); - - ts_subtree_release(&pool, new_tree); - }); - - describe("edits within a tree's padding", [&]() { - it("resizes the padding of the tree and its leftmost descendants", [&]() { - TSInputEdit edit; - edit.start_byte = 1; - edit.old_end_byte = 1; - edit.new_end_byte = 2; - edit.start_point = {0, 1}; - edit.old_end_point = {0, 1}; - edit.new_end_point = {0, 2}; - - tree = ts_subtree_edit(tree, &edit, &pool); - assert_consistent(tree); - - AssertThat(ts_subtree_has_changes(tree), IsTrue()); - AssertThat(ts_subtree_padding(tree), Equals({3, {0, 3}})); - AssertThat(ts_subtree_size(tree), Equals({13, {0, 13}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue()); - AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals({3, {0, 3}})); - AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals({3, {0, 3}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[1]), IsFalse()); - AssertThat(ts_subtree_padding(tree.ptr->children[1]), Equals({2, {0, 2}})); - AssertThat(ts_subtree_size(tree.ptr->children[1]), Equals({3, {0, 3}})); - }); - }); - - describe("edits that start in a tree's padding but extend into its content", [&]() { - it("shrinks the content to compensate for the expanded padding", [&]() { - TSInputEdit edit; - edit.start_byte = 1; - edit.old_end_byte = 4; - edit.new_end_byte = 5; - edit.start_point = {0, 1}; - edit.old_end_point = {0, 4}; - edit.new_end_point = {0, 5}; - - tree = ts_subtree_edit(tree, &edit, &pool); - assert_consistent(tree); - - AssertThat(ts_subtree_has_changes(tree), IsTrue()); - AssertThat(ts_subtree_padding(tree), Equals({5, {0, 5}})); - AssertThat(ts_subtree_size(tree), Equals({11, {0, 11}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue()); - AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals({5, {0, 5}})); - AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals({1, {0, 1}})); - }); - }); - - describe("insertions at the edge of a tree's padding", [&]() { - it("expands the tree's padding", [&]() { - TSInputEdit edit; - edit.start_byte = 2; - edit.old_end_byte = 2; - edit.new_end_byte = 4; - edit.start_point = {0, 2}; - edit.old_end_point = {0, 2}; - edit.new_end_point = {0, 4}; - - tree = ts_subtree_edit(tree, &edit, &pool); - assert_consistent(tree); - - AssertThat(ts_subtree_has_changes(tree), IsTrue()); - AssertThat(ts_subtree_padding(tree), Equals({4, {0, 4}})); - AssertThat(ts_subtree_size(tree), Equals({13, {0, 13}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue()); - AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals({4, {0, 4}})); - AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals({3, {0, 3}})); - - AssertThat(ts_subtree_has_changes(tree.ptr->children[1]), IsFalse()); - }); - }); - - describe("replacements starting at the edge of a tree's padding", [&]() { - it("resizes the content and not the padding", [&]() { - TSInputEdit edit; - edit.start_byte = 2; - edit.old_end_byte = 4; - edit.new_end_byte = 7; - edit.start_point = {0, 2}; - edit.old_end_point = {0, 4}; - edit.new_end_point = {0, 7}; - - tree = ts_subtree_edit(tree, &edit, &pool); - assert_consistent(tree); - - 
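Every padding/size case in this group is driven by the same six-field `TSInputEdit`. For edits confined to a single line the points simply mirror the byte offsets, which is what all of the literals above are doing by hand; a small helper makes the pattern explicit (a sketch valid for single-line, row-zero edits only, matching the `{0, column}` points used throughout these tests):

```c
#include <tree_sitter/api.h>

// Build the edit for replacing [start, start + removed) with `added` bytes,
// assuming the edit stays on row 0 and does not cross a newline.
TSInputEdit single_line_edit(uint32_t start, uint32_t removed, uint32_t added) {
  TSInputEdit edit;
  edit.start_byte = start;
  edit.old_end_byte = start + removed;
  edit.new_end_byte = start + added;
  edit.start_point = (TSPoint){0, start};
  edit.old_end_point = (TSPoint){0, start + removed};
  edit.new_end_point = (TSPoint){0, start + added};
  return edit;
}
```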
-        AssertThat(ts_subtree_has_changes(tree), IsTrue());
-        AssertThat(ts_subtree_padding(tree), Equals<Length>({2, {0, 2}}));
-        AssertThat(ts_subtree_size(tree), Equals<Length>({16, {0, 16}}));
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue());
-        AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals<Length>({2, {0, 2}}));
-        AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals<Length>({6, {0, 6}}));
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[1]), IsFalse());
-      });
-    });
-
-    describe("deletions that span more than one child node", [&]() {
-      it("shrinks subsequent child nodes", [&]() {
-        TSInputEdit edit;
-        edit.start_byte = 1;
-        edit.old_end_byte = 11;
-        edit.new_end_byte = 4;
-        edit.start_point = {0, 1};
-        edit.old_end_point = {0, 11};
-        edit.new_end_point = {0, 4};
-
-        tree = ts_subtree_edit(tree, &edit, &pool);
-        assert_consistent(tree);
-
-        AssertThat(ts_subtree_has_changes(tree), IsTrue());
-        AssertThat(ts_subtree_padding(tree), Equals<Length>({4, {0, 4}}));
-        AssertThat(ts_subtree_size(tree), Equals<Length>({4, {0, 4}}));
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue());
-        AssertThat(ts_subtree_padding(tree.ptr->children[0]), Equals<Length>({4, {0, 4}}));
-        AssertThat(ts_subtree_size(tree.ptr->children[0]), Equals<Length>({0, {0, 0}}));
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[1]), IsTrue());
-        AssertThat(ts_subtree_padding(tree.ptr->children[1]), Equals<Length>({0, {0, 0}}));
-        AssertThat(ts_subtree_size(tree.ptr->children[1]), Equals<Length>({0, {0, 0}}));
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[2]), IsTrue());
-        AssertThat(ts_subtree_padding(tree.ptr->children[2]), Equals<Length>({1, {0, 1}}));
-        AssertThat(ts_subtree_size(tree.ptr->children[2]), Equals<Length>({3, {0, 3}}));
-      });
-    });
-
-    describe("edits within a tree's range of scanned bytes", [&]() {
-      it("marks preceding trees as changed", [&]() {
-        MutableSubtree mutable_child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]);
-        mutable_child.ptr->lookahead_bytes = 2;
-
-        TSInputEdit edit;
-        edit.start_byte = 6;
-        edit.old_end_byte = 7;
-        edit.new_end_byte = 7;
-        edit.start_point = {0, 6};
-        edit.old_end_point = {0, 7};
-        edit.new_end_point = {0, 7};
-
-        tree = ts_subtree_edit(tree, &edit, &pool);
-        assert_consistent(tree);
-
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[0]), IsTrue());
-      });
-    });
-
-    describe("insertions at the end of the tree", [&]() {
-      it("extends the tree's content", [&]() {
-        TSInputEdit edit;
-        edit.start_byte = 15;
-        edit.old_end_byte = 15;
-        edit.new_end_byte = 16;
-        edit.start_point = {0, 15};
-        edit.old_end_point = {0, 15};
-        edit.new_end_point = {0, 16};
-
-        tree = ts_subtree_edit(tree, &edit, &pool);
-        assert_consistent(tree);
-
-        AssertThat(ts_subtree_size(tree).bytes, Equals(14u));
-        AssertThat(ts_subtree_has_changes(tree.ptr->children[2]), IsTrue());
-        AssertThat(ts_subtree_size(tree.ptr->children[2]).bytes, Equals(4u));
-      });
-    });
-
-    describe("edits beyond the end of the tree", [&]() {
-      it("does not change the tree", [&]() {
-        TSInputEdit edit;
-        edit.start_byte = 15;
-        edit.old_end_byte = 16;
-        edit.new_end_byte = 17;
-        edit.start_point = {0, 15};
-        edit.old_end_point = {0, 16};
-        edit.new_end_point = {0, 17};
-
-        tree = ts_subtree_edit(tree, &edit, &pool);
-        assert_consistent(tree);
-
-        AssertThat(ts_subtree_size(tree).bytes, Equals(13u));
-        AssertThat(ts_subtree_size(tree.ptr->children[2]).bytes, Equals(3u));
-      });
-    });
-  });
-
-  describe("eq", [&]() {
-    Subtree leaf;
-
-    before_each([&]() {
-      leaf = new_leaf(symbol1, {2, {1, 1}}, {5, {1, 4}}, 0);
-    });
-
-    after_each([&]() {
-      ts_subtree_release(&pool, leaf);
-    });
-
-    it("returns true for identical trees", [&]() {
-      Subtree leaf_copy = new_leaf(symbol1, {2, {1, 1}}, {5, {1, 4}}, 0);
-      AssertThat(ts_subtree_eq(leaf, leaf_copy), IsTrue());
-
-      Subtree parent = new_node(symbol2, {leaf, leaf_copy});
-      ts_subtree_retain(leaf);
-      ts_subtree_retain(leaf_copy);
-
-      Subtree parent_copy = new_node(symbol2, {leaf, leaf_copy});
-      ts_subtree_retain(leaf);
-      ts_subtree_retain(leaf_copy);
-
-      AssertThat(ts_subtree_eq(parent, parent_copy), IsTrue());
-
-      ts_subtree_release(&pool, leaf_copy);
-      ts_subtree_release(&pool, parent);
-      ts_subtree_release(&pool, parent_copy);
-    });
-
-    it("returns false for trees with different symbols", [&]() {
-      Subtree different_leaf = new_leaf(
-        ts_subtree_symbol(leaf) + 1,
-        ts_subtree_padding(leaf),
-        ts_subtree_size(leaf),
-        ts_subtree_lookahead_bytes(leaf)
-      );
-
-      AssertThat(ts_subtree_eq(leaf, different_leaf), IsFalse());
-      ts_subtree_release(&pool, different_leaf);
-    });
-
-    it("returns false for trees with different options", [&]() {
-      Subtree different_leaf = new_leaf(
-        ts_subtree_symbol(leaf),
-        ts_subtree_padding(leaf),
-        ts_subtree_size(leaf),
-        ts_subtree_lookahead_bytes(leaf)
-      );
-      ts_subtree_to_mut_unsafe(different_leaf).ptr->visible = !ts_subtree_visible(leaf);
-      AssertThat(ts_subtree_eq(leaf, different_leaf), IsFalse());
-      ts_subtree_release(&pool, different_leaf);
-    });
-
-    it("returns false for trees with different paddings or sizes", [&]() {
-      Subtree different_leaf = new_leaf(
-        ts_subtree_symbol(leaf),
-        {},
-        ts_subtree_size(leaf),
-        ts_subtree_lookahead_bytes(leaf)
-      );
-      AssertThat(ts_subtree_eq(leaf, different_leaf), IsFalse());
-      ts_subtree_release(&pool, different_leaf);
-
-      different_leaf = new_leaf(symbol1, ts_subtree_padding(leaf), {}, ts_subtree_lookahead_bytes(leaf));
-      AssertThat(ts_subtree_eq(leaf, different_leaf), IsFalse());
-      ts_subtree_release(&pool, different_leaf);
-    });
-
-    it("returns false for trees with different children", [&]() {
-      Subtree leaf2 = new_leaf(symbol2, {1, {0, 1}}, {3, {0, 3}}, 0);
-      Subtree parent = new_node(symbol2, {leaf, leaf2});
-      ts_subtree_retain(leaf);
-      ts_subtree_retain(leaf2);
-
-      Subtree different_parent = new_node(symbol2, {leaf2, leaf});
-      ts_subtree_retain(leaf2);
-      ts_subtree_retain(leaf);
-
-      AssertThat(ts_subtree_eq(different_parent, parent), IsFalse());
-      AssertThat(ts_subtree_eq(parent, different_parent), IsFalse());
-
-      ts_subtree_release(&pool, leaf2);
-      ts_subtree_release(&pool, parent);
-      ts_subtree_release(&pool, different_parent);
-    });
-  });
-
-  describe("last_external_token", [&]() {
-    Length padding = {1, {0, 1}};
-    Length size = {2, {0, 2}};
-
-    auto make_external = [](Subtree tree) {
-      ts_external_scanner_state_init(
-        &ts_subtree_to_mut_unsafe(tree).ptr->external_scanner_state,
-        NULL, 0
-      );
-      return tree;
-    };
-
-    it("returns the last serialized external token state in the given tree", [&]() {
-      Subtree tree1, tree2, tree3, tree4, tree5, tree6, tree7, tree8, tree9;
-
-      tree1 = new_node(symbol1, {
-        (tree2 = new_node(symbol2, {
-          (tree3 = make_external(ts_subtree_new_leaf(&pool, symbol3, padding, size, 0, 0, true, false, &language))),
-          (tree4 = new_leaf(symbol4, padding, size, 0)),
-          (tree5 = new_leaf(symbol5, padding, size, 0)),
-        })),
-        (tree6 = new_node(symbol6, {
-          (tree7 = new_node(symbol7, {
-            (tree8 = new_leaf(symbol8, padding, size, 0)),
-          })),
-          (tree9 = new_leaf(symbol9, padding, size, 0)),
-        })),
-      });
-
-      auto token = ts_subtree_last_external_token(tree1);
-      AssertThat(token.ptr, Equals(tree3.ptr));
-
-      ts_subtree_release(&pool, tree1);
-    });
-  });
-});
-
-END_TEST
diff --git a/test/runtime/tree_test.cc b/test/runtime/tree_test.cc
deleted file mode 100644
index 37e3b305..00000000
--- a/test/runtime/tree_test.cc
+++ /dev/null
@@ -1,244 +0,0 @@
-#include "test_helper.h"
-#include <future>
-#include "runtime/alloc.h"
-#include "helpers/record_alloc.h"
-#include "helpers/stream_methods.h"
-#include "helpers/tree_helpers.h"
-#include "helpers/point_helpers.h"
-#include "helpers/spy_logger.h"
-#include "helpers/stderr_logger.h"
-#include "helpers/spy_input.h"
-#include "helpers/load_language.h"
-#include "helpers/random_helpers.h"
-#include "helpers/read_test_entries.h"
-#include "helpers/tree_helpers.h"
-
-TSPoint point(uint32_t row, uint32_t column) {
-  TSPoint result = {row, column};
-  return result;
-}
-
-START_TEST
-
-describe("Tree", [&]() {
-  TSParser *parser;
-  SpyInput *input;
-  TSTree *tree;
-
-  before_each([&]() {
-    record_alloc::start(true);
-    parser = ts_parser_new();
-    tree = nullptr;
-    input = nullptr;
-  });
-
-  after_each([&]() {
-    if (tree) ts_tree_delete(tree);
-    if (input) delete input;
-    ts_parser_delete(parser);
-    AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
-  });
-
-  auto assert_root_node = [&](const string &expected) {
-    TSNode node = ts_tree_root_node(tree);
-    char *node_string = ts_node_string(node);
-    string actual(node_string);
-    ts_free(node_string);
-    AssertThat(actual, Equals(expected));
-  };
-
-  describe("copy()", [&]() {
-    it("returns a tree that can be safely used while the current tree is edited", [&]() {
-      const TSLanguage *language = load_real_language("javascript");
-      ts_parser_set_language(parser, language);
-      string source_code = examples_for_language("javascript")[0].input;
-
-      input = new SpyInput(source_code, 32);
-      TSTree *original_tree = ts_parser_parse(parser, nullptr, input->input());
-
-      vector<future<pair<TSTree *, SpyInput *>>> new_trees;
-      for (unsigned i = 0; i < 8; i++) {
-        TSTree *tree_copy = ts_tree_copy(original_tree);
-        new_trees.push_back(std::async([i, tree_copy, &source_code, language]() -> pair<TSTree *, SpyInput *> {
-          Generator random(TREE_SITTER_SEED + i);
-
-          TSTree *tree = tree_copy;
-          TSParser *parser = ts_parser_new();
-          ts_parser_set_language(parser, language);
-          SpyInput *input = new SpyInput(source_code, 1024);
-
-          for (unsigned j = 0; j < 10; j++) {
-            random.sleep_some();
-
-            size_t edit_position = random(input->content.size());
-            size_t deletion_size = random(input->content.size() - edit_position);
-            string inserted_text = random.words(random(4) + 1);
-
-            TSInputEdit edit = input->replace(edit_position, deletion_size, inserted_text);
-            ts_tree_edit(tree, &edit);
-
-            TSTree *new_tree = ts_parser_parse(parser, tree, input->input());
-            ts_tree_delete(tree);
-            tree = new_tree;
-          }
-
-          ts_parser_delete(parser);
-          return {tree, input};
-        }));
-      }
-
-      ts_tree_delete(original_tree);
-
-      for (auto &future : new_trees) {
-        future.wait();
-        auto result = future.get();
-        TSTree *new_tree = result.first;
-        SpyInput *new_input = result.second;
-        assert_consistent_tree_sizes(new_tree, new_input->content);
-        ts_tree_delete(new_tree);
-        delete new_input;
-      }
-    });
-  });
-
-  describe("get_changed_ranges()", [&]() {
-    before_each([&]() {
-      ts_parser_set_language(parser, load_real_language("javascript"));
-      input = new SpyInput("{a: null};\n", 3);
-      tree = ts_parser_parse(parser, nullptr, input->input());
-
-      assert_root_node(
-        "(program (expression_statement (object (pair (property_identifier) (null)))))"
-      );
-    });
-
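    // Aside (an illustration added in editing, not part of the original
    // file): the helper defined just below drives the public incremental
    // flow: apply a TSInputEdit to the old tree, re-parse, then diff the
    // two trees. A caller would typically walk the returned ranges to
    // decide what to re-process; a minimal sketch, assuming `ranges` and
    // `range_count` as produced by ts_tree_get_changed_ranges():
    //
    //   for (uint32_t i = 0; i < range_count; i++) {
    //     printf("changed: bytes %u..%u\n",
    //            ranges[i].start_byte, ranges[i].end_byte);
    //   }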
-    auto get_changed_ranges_for_edit = [&](function<TSInputEdit()> fn) -> vector<TSRange> {
-      TSInputEdit edit = fn();
-      ts_tree_edit(tree, &edit);
-
-      uint32_t range_count = 0;
-      TSTree *new_tree = ts_parser_parse(parser, tree, input->input());
-      TSRange *ranges = ts_tree_get_changed_ranges(tree, new_tree, &range_count);
-      ts_tree_delete(tree);
-      tree = new_tree;
-
-      vector<TSRange> result;
-      for (size_t i = 0; i < range_count; i++) {
-        result.push_back(ranges[i]);
-      }
-
-      ts_free(ranges);
-      return result;
-    };
-
-    it("reports changes when one token has been updated", [&]() {
-      // Replace `null` with `nothing`
-      auto ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(input->content.find("ull"), 3, "othing");
-      });
-      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_substring(input->content, "nothing"),
-      })));
-
-      // Replace `nothing` with `null` again
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->undo();
-      });
-      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_substring(input->content, "null"),
-      })));
-    });
-
-    it("reports no changes when leading whitespace has changed (regression)", [&]() {
-      input->chars_per_chunk = 80;
-
-      // Insert leading whitespace
-      auto ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(0, 0, "\n");
-      });
-      assert_root_node(
-        "(program (expression_statement (object (pair (property_identifier) (null)))))"
-      );
-      AssertThat(ranges, IsEmpty());
-
-      // Remove leading whitespace
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->undo();
-      });
-      assert_root_node(
-        "(program (expression_statement (object (pair (property_identifier) (null)))))"
-      );
-      AssertThat(ranges, IsEmpty());
-
-      // Insert leading whitespace again
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(0, 0, "\n");
-      });
-      assert_root_node(
-        "(program (expression_statement (object (pair (property_identifier) (null)))))"
-      );
-      AssertThat(ranges, IsEmpty());
-    });
-
-    it("reports changes when tokens have been appended", [&]() {
-      // Add a second key-value pair
-      auto ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(input->content.find("}"), 0, ", b: false");
-      });
-      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_substring(input->content, ", b: false"),
-      })));
-
-      // Add a third key-value pair in between the first two
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(input->content.find(", b"), 0, ", c: 1");
-      });
-      assert_root_node(
-        "(program (expression_statement (object "
-        "(pair (property_identifier) (null)) "
-        "(pair (property_identifier) (number)) "
-        "(pair (property_identifier) (false)))))"
-      );
-      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_substring(input->content, ", c: 1"),
-      })));
-
-      // Delete the middle pair.
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->undo();
-      });
-      assert_root_node(
-        "(program (expression_statement (object "
-        "(pair (property_identifier) (null)) "
-        "(pair (property_identifier) (false)))))"
-      );
-      AssertThat(ranges, IsEmpty());
-
-      // Delete the second pair.
-      ranges = get_changed_ranges_for_edit([&]() {
-        return input->undo();
-      });
-      assert_root_node(
-        "(program (expression_statement (object "
-        "(pair (property_identifier) (null)))))"
-      );
-      AssertThat(ranges, IsEmpty());
-    });
-
-    it("reports changes when trees have been wrapped", [&]() {
-      // Wrap the object in an assignment expression.
-      auto ranges = get_changed_ranges_for_edit([&]() {
-        return input->replace(input->content.find("null"), 0, "b === ");
-      });
-      assert_root_node(
-        "(program (expression_statement (object "
-        "(pair (property_identifier) (binary_expression (identifier) (null))))))"
-      );
-      AssertThat(ranges, Equals(vector<TSRange>({
-        range_for_substring(input->content, "b === null"),
-      })));
-    });
-  });
-});
-
-END_TEST
diff --git a/test/test_helper.h b/test/test_helper.h
deleted file mode 100644
index 04fe1ffb..00000000
--- a/test/test_helper.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef TEST_HELPER_
-#define TEST_HELPER_
-
-#include "bandit/bandit.h"
-#include "tree_sitter/compiler.h"
-#include "tree_sitter/runtime.h"
-
-extern int TREE_SITTER_SEED;
-
-namespace tree_sitter {}
-
-using namespace std;
-using namespace bandit;
-using namespace snowhouse;
-using namespace tree_sitter;
-
-#define START_TEST go_bandit([]() {
-#define END_TEST });
-
-#define TREE_SITTER_TEST
-
-#endif // TEST_HELPER_
diff --git a/test/tests.cc b/test/tests.cc
deleted file mode 100644
index bf2dba40..00000000
--- a/test/tests.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "test_helper.h"
-#include "helpers/random_helpers.h"
-
-int TREE_SITTER_SEED = 0;
-
-int main(int argc, char *argv[]) {
-  const char *seed_env = getenv("TREE_SITTER_SEED");
-  if (seed_env) {
-    TREE_SITTER_SEED = atoi(seed_env);
-  } else {
-    TREE_SITTER_SEED = get_time_as_seed();
-  }
-
-  printf("Random seed: %d\n", TREE_SITTER_SEED);
-
-  return bandit::run(argc, argv);
-}
diff --git a/tests.gyp b/tests.gyp
deleted file mode 100644
index 25932c7a..00000000
--- a/tests.gyp
+++ /dev/null
@@ -1,123 +0,0 @@
-{
-  'targets': [
-    {
-      'target_name': 'benchmarks',
-      'default_configuration': 'Release',
-      'type': 'executable',
-      'dependencies': [
-        'project.gyp:runtime',
-        'project.gyp:compiler'
-      ],
-      'include_dirs': [
-        'src',
-        'test',
-        'externals/utf8proc',
-      ],
-      'sources': [
-        'test/benchmarks.cc',
-        'test/helpers/file_helpers.cc',
-        'test/helpers/load_language.cc',
-        'test/helpers/read_test_entries.cc',
-        'test/helpers/stderr_logger.cc',
-      ],
-    },
-
-    {
-      'target_name': 'tests',
-      'default_configuration': 'Test',
-      'type': 'executable',
-      'dependencies': [
-        'project.gyp:runtime',
-        'project.gyp:compiler'
-      ],
-      'include_dirs': [
-        'src',
-        'test',
-        'externals/bandit',
-        'externals/utf8proc',
-        'externals/crypto-algorithms',
-      ],
-      'sources': [
-        'test/compiler/build_tables/lex_item_test.cc',
-        'test/compiler/build_tables/parse_item_set_builder_test.cc',
-        'test/compiler/build_tables/rule_can_be_blank_test.cc',
-        'test/compiler/prepare_grammar/expand_repeats_test.cc',
-        'test/compiler/prepare_grammar/expand_tokens_test.cc',
-        'test/compiler/prepare_grammar/extract_choices_test.cc',
-        'test/compiler/prepare_grammar/extract_tokens_test.cc',
-        'test/compiler/prepare_grammar/flatten_grammar_test.cc',
-        'test/compiler/prepare_grammar/intern_symbols_test.cc',
-        'test/compiler/prepare_grammar/parse_regex_test.cc',
-        'test/compiler/rules/character_set_test.cc',
-        'test/compiler/rules/rule_test.cc',
-        'test/compiler/util/string_helpers_test.cc',
-        'test/helpers/file_helpers.cc',
-        'test/helpers/load_language.cc',
-        'test/helpers/point_helpers.cc',
-        'test/helpers/random_helpers.cc',
-        'test/helpers/read_test_entries.cc',
-        'test/helpers/record_alloc.cc',
-        'test/helpers/scope_sequence.cc',
-        'test/helpers/spy_input.cc',
-        'test/helpers/spy_logger.cc',
-        'test/helpers/stderr_logger.cc',
-        'test/helpers/stream_methods.cc',
-        'test/helpers/tree_helpers.cc',
-        'test/integration/fuzzing-examples.cc',
-        'test/integration/real_grammars.cc',
-        'test/integration/test_grammars.cc',
-        'test/runtime/language_test.cc',
-        'test/runtime/node_test.cc',
-        'test/runtime/parser_test.cc',
-        'test/runtime/stack_test.cc',
-        'test/runtime/subtree_test.cc',
-        'test/runtime/tree_test.cc',
-        'test/tests.cc',
-      ],
-      'cflags': [
-        '-g',
-        '-O0',
-        '-Wall',
-        '-Wextra',
-        '-Wno-unused-parameter',
-        '-Wno-unknown-pragmas',
-      ],
-      'ldflags': ['-g'],
-      'xcode_settings': {
-        'ARCHS': ['x86_64'],
-        'OTHER_LDFLAGS': ['-g'],
-        'OTHER_CPLUSPLUSFLAGS': ['-fsanitize=address'],
-        'GCC_OPTIMIZATION_LEVEL': '0',
-        'WARNING_CFLAGS': [
-          '-Wall',
-          '-Wextra',
-          '-Wno-unused-parameter'
-        ],
-      },
-    }
-  ],
-
-  'target_defaults': {
-    'configurations': {'Test': {}, 'Release': {}},
-
-    'cflags_cc': ['-std=c++14'],
-
-    'conditions': [
-      ['OS=="linux"', {
-        'libraries': ['-ldl', '-lpthread'],
-      }],
-
-      # For 64-bit builds on appveyor, we need to explicitly tell gyp
-      # to generate an x64 target in the MSVS project file.
-      ['"