Reorganize repo, add rust CLI and binding code

This commit is contained in:
Max Brunsfeld 2019-01-04 17:31:49 -08:00
commit 47607cecf4
221 changed files with 11359 additions and 18038 deletions

View file

@@ -1,26 +1,27 @@
image: Visual Studio 2017
environment:
TREE_SITTER_TEST: true
build: false
install:
- git submodule update --init --recursive
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain stable
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- rustc -vV
- cargo -vV
- script\fetch-test-fixtures.cmd
test_script:
- cargo build
- cargo test
branches:
only:
- master
platform:
- x86
- x64
init:
- git config --global core.autocrlf false
install:
- IF "%PLATFORM%" == "x86" (call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars32.bat")
- IF "%PLATFORM%" == "x64" (call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat")
- script\configure.cmd
- script\fetch-fixtures.cmd
test_script:
- script\test.cmd
build: off
cache:
- test\fixtures\grammars
- test\fixtures
- C:\Users\appveyor\.cargo

View file

@@ -1,65 +0,0 @@
---
Language: Cpp
AccessModifierOffset: -1
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 2
Cpp11BracedListStyle: false
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 60
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 20
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...

View file

@@ -1,8 +0,0 @@
-std=c++14
-Isrc
-Itest
-Iinclude
-Iexternals/utf8proc
-Iexternals/json-parser
-Iexternals/bandit
-Iexternals/crypto-algorithms

23
.gitignore vendored
View file

@@ -1,22 +1,11 @@
# Compiled binaries
out
*.a
*.o
fuzz-results
log.html
# Generated build config files
gyp-mac-tool
Makefile
*.Makefile
*.target.mk
# IDE files
.idea
*.xcodeproj
# Dev dependencies
*.a
*.o
fuzz-results
test/fixtures/grammars/*
!test/fixtures/grammars/.gitkeep
externals/cpplint.py
/target
**/*.rs.bk

14
.gitmodules vendored
View file

@@ -1,15 +1,3 @@
[submodule "externals/bandit"]
path = externals/bandit
url = https://github.com/joakimkarlsson/bandit.git
[submodule "externals/gyp"]
path = externals/gyp
url = https://github.com/svn2github/gyp.git
[submodule "externals/utf8proc"]
path = externals/utf8proc
path = lib/utf8proc
url = https://github.com/julialang/utf8proc
[submodule "externals/json-parser"]
path = externals/json-parser
url = https://github.com/udp/json-parser.git
[submodule "externals/crypto-algorithms"]
path = externals/crypto-algorithms
url = https://github.com/maxbrunsfeld/crypto-algorithms.git

View file

@@ -1,27 +1,13 @@
sudo: false
dist: trusty
language: cpp
compiler:
- gcc
language: rust
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- clang
rust:
- stable
install:
- export CXX="g++-5"
- scan-build script/configure
env:
- TREE_SITTER_TEST=1
script:
- script/ci
cache:
directories:
- test/fixtures/grammars
before_install:
- ./script/fetch-test-fixtures.sh
branches:
only:

837
Cargo.lock generated Normal file
View file

@@ -0,0 +1,837 @@
[[package]]
name = "aho-corasick"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "argon2rs"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)",
"scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "arrayvec"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "atty"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "backtrace"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "backtrace-sys"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bitflags"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "blake2-rfc"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "byteorder"
version = "1.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "cc"
version = "1.0.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "cfg-if"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "clap"
version = "2.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "constant_time_eq"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "crossbeam-channel"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-epoch"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-utils"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "crossbeam-utils"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "dirs"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "failure"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
"failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "failure_derive"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)",
"synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fnv"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "fuchsia-zircon"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fuchsia-zircon-sys"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "globset"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "hashbrown"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ignore"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
"globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "indexmap"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "itoa"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lazy_static"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.2.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libloading"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "libsqlite3-sys"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
"vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "linked-hash-map"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lock_api"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "log"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "lru-cache"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memchr"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memoffset"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "nodrop"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "owning_ref"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "parking_lot"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "parking_lot_core"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "pkg-config"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "proc-macro2"
version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "quote"
version = "0.6.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_core"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_core"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "redox_syscall"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "redox_termios"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "redox_users"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)",
"failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rusqlite"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)",
"lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rustc-demangle"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ryu"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "same-file"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "scoped_threadpool"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "scopeguard"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "semver"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "semver-parser"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde"
version = "1.0.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde_derive"
version = "1.0.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde_json"
version = "1.0.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
"ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "smallbitvec"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "smallvec"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "stable_deref_trait"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "strsim"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "syn"
version = "0.15.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "synstructure"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "termion"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "textwrap"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread_local"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tree-sitter"
version = "0.3.5"
dependencies = [
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tree-sitter-cli"
version = "0.1.0"
dependencies = [
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
"dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)",
"smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tree-sitter 0.3.5",
]
[[package]]
name = "ucd-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicode-width"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicode-xid"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unreachable"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "utf8-ranges"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "vcpkg"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "vec_map"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "version_check"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "void"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "walkdir"
version = "2.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-util"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e"
"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
"checksum argon2rs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3f67b0b6a86dae6e67ff4ca2b6201396074996379fba2b92ff649126f37cb392"
"checksum arrayvec 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "f405cc4c21cd8b784f6c8fc2adf9bc00f59558f0049b5ec21517f875963040cc"
"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652"
"checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a"
"checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0"
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
"checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400"
"checksum byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "94f88df23a25417badc922ab0f5716cc1330e87f71ddd9203b3a3ccd9cedf75d"
"checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16"
"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4"
"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e"
"checksum crossbeam-channel 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b85741761b7f160bc5e7e0c14986ef685b7f8bf9b7ad081c60c604bb4649827"
"checksum crossbeam-epoch 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2449aaa4ec7ef96e5fb24db16024b935df718e9ae1cec0a1e68feeca2efca7b8"
"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015"
"checksum crossbeam-utils 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c55913cc2799171a550e307918c0a360e8c16004820291bf3b638969b4a01816"
"checksum dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88972de891f6118092b643d85a0b28e0678e0f948d7f879aa32f2d5aafe97d2a"
"checksum failure 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6dd377bcc1b1b7ce911967e3ec24fa19c3224394ec05b54aa7b083d498341ac7"
"checksum failure_derive 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "64c2d913fe8ed3b6c6518eedf4538255b989945c14c2a7d5cbff62a5e2120596"
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
"checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865"
"checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e"
"checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01"
"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d"
"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b"
"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
"checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311"
"checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2"
"checksum libsqlite3-sys 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d3711dfd91a1081d2458ad2d06ea30a8755256e74038be2ad927d94e1c955ca8"
"checksum linked-hash-map 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7860ec297f7008ff7a1e3382d7f7e1dcd69efc94751a2284bafc3d013c2aa939"
"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c"
"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
"checksum lru-cache 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4d06ff7ff06f729ce5f4e227876cb88d10bc59cd4ae1e09fbb2bde15c850dc21"
"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16"
"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3"
"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945"
"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13"
"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5"
"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c"
"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c"
"checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09"
"checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c"
"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd"
"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c"
"checksum rand_core 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1961a422c4d189dfb50ffa9320bf1f2a9bd54ecb92792fb9477f99a1045f3372"
"checksum rand_core 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0905b6b7079ec73b314d4c748701f6931eb79fd97c668caa3f1899b22b32c6db"
"checksum redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)" = "679da7508e9a6390aeaf7fbd02a800fdc64b73fe2204dd2c8ae66d22d9d5ad5d"
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
"checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26"
"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f"
"checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1"
"checksum rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c9d9118f1ce84d8d0b67f9779936432fb42bb620cef2122409d786892cce9a3c"
"checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395"
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7"
"checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267"
"checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8"
"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef"
"checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c"
"checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811"
"checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e"
"checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db"
"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8"
"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"
"checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7"
"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015"
"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6"
"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
"checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b"
"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86"
"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"
"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737"
"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d"
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1"
"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "afc5508759c5bf4285e61feb862b6083c8480aec864fa17a81fdec6f69b461ab"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

6
Cargo.toml Normal file
View file

@ -0,0 +1,6 @@
[workspace]
members = [
"cli",
"lib",
]

22
LICENSE
View file

@ -1,7 +1,21 @@
Copyright 2014 Max Brunsfeld
The MIT License (MIT)
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
Copyright (c) 2018 Max Brunsfeld
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

33
cli/Cargo.toml Normal file
View file

@ -0,0 +1,33 @@
[package]
name = "tree-sitter-cli"
version = "0.1.0"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2018"
[[bin]]
name = "tree-sitter"
path = "src/main.rs"
[dependencies]
lazy_static = "1.2.0"
smallbitvec = "2.3.0"
clap = "2.32"
dirs = "1.0.2"
hashbrown = "0.1"
ignore = "0.4.4"
libloading = "0.5"
rusqlite = "0.14.0"
serde = "1.0"
serde_derive = "1.0"
regex-syntax = "0.6.4"
[dependencies.tree-sitter]
path = "../lib"
[dependencies.serde_json]
version = "1.0"
features = ["preserve_order"]
[dependencies.log]
version = "0.4.6"
features = ["std"]

View file

@ -0,0 +1,278 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use crate::rules::Symbol;
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap, VecDeque};
/// Construct the main lex table and the keyword lex table for a grammar.
///
/// For every parse state, gathers the set of tokens that are valid in that
/// state (mapping keyword tokens to the grammar's word token, when one
/// exists), creates a lex entry-point state for that token set, and records
/// the resulting lex state id on the parse state. Optionally minimizes the
/// main table afterward.
pub(crate) fn build_lex_table(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    keywords: &LookaheadSet,
    minimize: bool,
) -> (LexTable, LexTable) {
    // When the grammar has a word token, keywords get their own dedicated
    // lex table; otherwise the keyword table is left empty.
    let keyword_lex_table = if syntax_grammar.word_token.is_some() {
        let mut keyword_builder = LexTableBuilder::new(lexical_grammar);
        keyword_builder.add_state_for_tokens(keywords);
        keyword_builder.table
    } else {
        LexTable::default()
    };

    let mut main_builder = LexTableBuilder::new(lexical_grammar);
    for state in parse_table.states.iter_mut() {
        let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| {
            if token.is_terminal() {
                // In the main table, keyword tokens are reached via the
                // word token rather than lexed directly.
                if keywords.contains(&token) {
                    syntax_grammar.word_token
                } else {
                    Some(*token)
                }
            } else if token.is_eof() {
                Some(*token)
            } else {
                None
            }
        }));
        state.lex_state_id = main_builder.add_state_for_tokens(&tokens);
    }

    let mut main_table = main_builder.table;
    if minimize {
        minimize_lex_table(&mut main_table, parse_table);
    }
    (main_table, keyword_lex_table)
}
/// A lex state that has been allocated but whose advance/accept actions have
/// not yet been populated.
struct QueueEntry {
    // Index of the state within the lex table being built.
    state_id: usize,
    // The set of NFA states that this lex state represents.
    nfa_states: Vec<u32>,
    // Whether end-of-file is a valid lookahead in this state.
    eof_valid: bool,
}
/// Builds a deterministic lex table from a lexical grammar's NFA by exploring
/// sets of NFA states (a subset construction).
struct LexTableBuilder<'a> {
    lexical_grammar: &'a LexicalGrammar,
    // Cursor used to walk the lexical grammar's NFA.
    cursor: NfaCursor<'a>,
    // The lex table under construction.
    table: LexTable,
    // States that have been allocated but not yet populated.
    state_queue: VecDeque<QueueEntry>,
    // Deduplicates lex states by their (NFA state set, eof-valid) signature.
    state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
}
impl<'a> LexTableBuilder<'a> {
    fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
        Self {
            lexical_grammar,
            cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
            table: LexTable::default(),
            state_queue: VecDeque::new(),
            state_ids_by_nfa_state_set: HashMap::new(),
        }
    }

    /// Create (or reuse) an entry-point lex state for the given token set,
    /// then drain the work queue, populating every state reachable from it.
    /// Returns the id of the entry-point state.
    fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize {
        let mut eof_valid = false;
        // Collect the NFA start states of all terminal tokens. Any
        // non-terminal entry in the set (e.g. EOF) marks EOF as valid.
        let nfa_states = tokens
            .iter()
            .filter_map(|token| {
                if token.is_terminal() {
                    Some(self.lexical_grammar.variables[token.index].start_state)
                } else {
                    eof_valid = true;
                    None
                }
            })
            .collect();
        let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
        if is_new {
            info!(
                "entry point state: {}, tokens: {:?}",
                state_id,
                tokens
                    .iter()
                    .map(|t| &self.lexical_grammar.variables[t.index].name)
                    .collect::<Vec<_>>()
            );
        }
        // Process newly-created states until none remain; `populate_state`
        // may enqueue further states via `add_state`.
        while let Some(QueueEntry {
            state_id,
            nfa_states,
            eof_valid,
        }) = self.state_queue.pop_front()
        {
            self.populate_state(state_id, nfa_states, eof_valid);
        }
        state_id
    }

    /// Look up or create the lex state for a set of NFA states. Returns the
    /// state id and whether it was newly created; new states are enqueued so
    /// their actions get populated later.
    fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
        self.cursor.reset(nfa_states);
        match self
            .state_ids_by_nfa_state_set
            .entry((self.cursor.state_ids.clone(), eof_valid))
        {
            Entry::Occupied(o) => (*o.get(), false),
            Entry::Vacant(v) => {
                let state_id = self.table.states.len();
                self.table.states.push(LexState::default());
                self.state_queue.push_back(QueueEntry {
                    state_id,
                    nfa_states: v.key().0.clone(),
                    eof_valid,
                });
                v.insert(state_id);
                (state_id, true)
            }
        }
    }

    /// Compute the accept action and advance actions for a single lex state.
    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
        self.cursor.force_reset(nfa_states);

        // Choose a single winning completion among all completed tokens,
        // letting the token-conflict rules break ties.
        let mut completion = None;
        for (id, prec) in self.cursor.completions() {
            if let Some((prev_id, prev_precedence)) = completion {
                if TokenConflictMap::prefer_token(
                    self.lexical_grammar,
                    (prev_precedence, prev_id),
                    (prec, id),
                ) {
                    continue;
                }
            }
            completion = Some((id, prec));
        }
        info!(
            "lex state: {}, completion: {:?}",
            state_id,
            completion.map(|(id, prec)| (&self.lexical_grammar.variables[id].name, prec))
        );
        let transitions = self.cursor.transitions();
        info!("lex state: {}, transitions: {:?}", state_id, transitions);

        // If EOF is a valid lookahead token, add a transition predicated on the
        // null character that leads to the empty set of NFA states. The EOF
        // state is represented as that empty list of NFA states.
        if eof_valid {
            let (next_state_id, _) = self.add_state(Vec::new(), false);
            info!("lex state: {}, successor: EOF", state_id);
            self.table.states[state_id].advance_actions.push((
                CharacterSet::empty().add_char('\0'),
                AdvanceAction {
                    state: Some(next_state_id),
                    in_main_token: true,
                },
            ));
        }

        for NfaTransition {
            characters,
            precedence,
            states,
            is_separator,
        } in transitions
        {
            // Skip transitions that cannot beat the precedence of the
            // already-chosen completion.
            if let Some((_, completed_precedence)) = completion {
                if precedence < completed_precedence
                    || (precedence == completed_precedence && is_separator)
                {
                    continue;
                }
            }
            // EOF stays valid only across separator transitions.
            let (next_state_id, _) = self.add_state(states, eof_valid && is_separator);
            // A transition back to the same state is encoded as `None`.
            let next_state = if next_state_id == state_id {
                None
            } else {
                Some(next_state_id)
            };
            self.table.states[state_id].advance_actions.push((
                characters,
                AdvanceAction {
                    state: next_state,
                    in_main_token: !is_separator,
                },
            ));
        }

        // Record the accept action: the winning completion if there is one,
        // or end-of-file when this is the empty (EOF) state.
        if let Some((complete_id, _)) = completion {
            self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
        } else if self.cursor.state_ids.is_empty() {
            self.table.states[state_id].accept_action = Some(Symbol::end());
        }
    }
}
/// Minimize the lex table by repeatedly merging pairs of identical states,
/// then compact the table, rewriting all state references — both within the
/// lex table and in the parse table — to use the new, contiguous state ids.
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
    // Maps each removed state id to the (lower) state id that replaces it.
    let mut state_replacements = BTreeMap::new();
    let mut done = false;
    while !done {
        done = true;
        for (i, state_i) in table.states.iter().enumerate() {
            if state_replacements.contains_key(&i) {
                continue;
            }
            // Only compare against earlier states (j < i), so every state is
            // replaced by the lowest equivalent id.
            for (j, state_j) in table.states.iter().enumerate() {
                if j == i {
                    break;
                }
                if state_replacements.contains_key(&j) {
                    continue;
                }
                if state_i == state_j {
                    info!("replace state {} with state {}", i, j);
                    state_replacements.insert(i, j);
                    done = false;
                    break;
                }
            }
        }
        // Redirect transitions through the replacements found in this pass.
        // This can make further states identical, so iterate to a fixpoint.
        for state in table.states.iter_mut() {
            for (_, advance_action) in state.advance_actions.iter_mut() {
                advance_action.state = advance_action
                    .state
                    .map(|s| state_replacements.get(&s).cloned().unwrap_or(s))
            }
        }
    }

    // Compute the final id of each state after the replaced states are
    // removed: resolve the replacement, then subtract the number of removed
    // states that precede it. (Ranges are iterators already; no `into_iter`
    // call is needed.)
    let final_state_replacements = (0..table.states.len())
        .map(|state_id| {
            let replacement = state_replacements
                .get(&state_id)
                .cloned()
                .unwrap_or(state_id);
            let prior_removed = state_replacements
                .iter()
                .take_while(|i| *i.0 < replacement)
                .count();
            replacement - prior_removed
        })
        .collect::<Vec<_>>();

    for state in parse_table.states.iter_mut() {
        state.lex_state_id = final_state_replacements[state.lex_state_id];
    }
    for state in table.states.iter_mut() {
        for (_, advance_action) in state.advance_actions.iter_mut() {
            advance_action.state = advance_action.state.map(|s| final_state_replacements[s]);
        }
    }

    // Drop the replaced states, keeping the remaining ones in order.
    // `retain` gives no index, so track it manually.
    let mut index = 0;
    table.states.retain(|_| {
        let keep = !state_replacements.contains_key(&index);
        index += 1;
        keep
    });
}

View file

@ -0,0 +1,735 @@
use super::item::{LookaheadSet, ParseItem, ParseItemSet};
use super::item_set_builder::ParseItemSetBuilder;
use crate::error::{Error, Result};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::{Alias, Associativity, Symbol, SymbolType};
use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use hashbrown::hash_map::Entry;
use hashbrown::{HashMap, HashSet};
use std::collections::hash_map::DefaultHasher;
use std::collections::VecDeque;
use std::fmt::Write;
use std::hash::Hasher;
/// Records where an auxiliary (hidden repeat) non-terminal is used, so that
/// conflict resolution can report the visible symbols it belongs to.
#[derive(Clone)]
struct AuxiliarySymbolInfo {
    // The auxiliary (repeat) symbol itself.
    auxiliary_symbol: Symbol,
    // The visible parent symbols within which the auxiliary symbol appears.
    parent_symbols: Vec<Symbol>,
}

// The sequence of symbols consumed on the path to a parse state.
type SymbolSequence = Vec<Symbol>;
// The auxiliary-symbol usages observed along such a path.
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;

/// A parse state that has been allocated but whose actions have not yet been
/// computed.
struct ParseStateQueueEntry {
    // Symbols consumed on the path from the start state to this state.
    preceding_symbols: SymbolSequence,
    // Auxiliary-symbol usages seen along that path.
    preceding_auxiliary_symbols: AuxiliarySymbolSequence,
    state_id: ParseStateId,
}
/// Builds an LR-style parse table from the syntax and lexical grammars by
/// exploring the closure of parse item sets.
struct ParseTableBuilder<'a> {
    // Computes transitive closures and first/last sets for item sets.
    item_set_builder: ParseItemSetBuilder<'a>,
    syntax_grammar: &'a SyntaxGrammar,
    lexical_grammar: &'a LexicalGrammar,
    // Deduplicates parse states by their (unclosed) item set.
    state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
    // The unclosed item set for each allocated state id.
    item_sets_by_state_id: Vec<ParseItemSet<'a>>,
    // States whose actions still need to be computed.
    parse_state_queue: VecDeque<ParseStateQueueEntry>,
    // The parse table under construction.
    parse_table: ParseTable,
    // For each terminal, the set of tokens that can follow it.
    following_tokens: Vec<LookaheadSet>,
    // State ids for which verbose item-set output is printed while building.
    state_ids_to_log: Vec<ParseStateId>,
}
impl<'a> ParseTableBuilder<'a> {
/// Drive the parse-table construction: seed the error and start states,
/// then process queued states until the table is complete. Returns the
/// finished table along with the per-terminal following-token sets.
fn build(mut self) -> Result<(ParseTable, Vec<LookaheadSet>)> {
    // Ensure that the empty alias sequence has index 0.
    self.parse_table.alias_sequences.push(Vec::new());

    // Add the error state at index 0.
    self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());

    // Add the starting state at index 1, seeded with the augmented start
    // item and EOF as its only lookahead.
    self.add_parse_state(
        &Vec::new(),
        &Vec::new(),
        ParseItemSet::with(
            [(
                ParseItem::start(),
                LookaheadSet::with([Symbol::end()].iter().cloned()),
            )]
            .iter()
            .cloned(),
        ),
    );

    while let Some(entry) = self.parse_state_queue.pop_front() {
        // Expand the state's item set to its transitive closure before
        // computing actions for it.
        let item_set = self
            .item_set_builder
            .transitive_closure(&self.item_sets_by_state_id[entry.state_id]);

        // Dump the initial and closed item sets for states that were
        // requested via `state_ids_to_log`.
        if self.state_ids_to_log.contains(&entry.state_id) {
            eprintln!(
                "state: {}\n\ninitial item set:\n\n{}closed item set:\n\n{}",
                entry.state_id,
                super::item::ParseItemSetDisplay(
                    &self.item_sets_by_state_id[entry.state_id],
                    self.syntax_grammar,
                    self.lexical_grammar,
                ),
                super::item::ParseItemSetDisplay(
                    &item_set,
                    self.syntax_grammar,
                    self.lexical_grammar,
                )
            );
        }

        self.add_actions(
            entry.preceding_symbols,
            entry.preceding_auxiliary_symbols,
            entry.state_id,
            item_set,
        )?;
    }

    // Final fix-up passes (defined elsewhere in this impl).
    self.populate_used_symbols();
    self.remove_precedences();

    Ok((self.parse_table, self.following_tokens))
}
/// Look up or create the parse state for the given item set, enqueueing new
/// states so their actions get computed later. Returns the state's id.
fn add_parse_state(
    &mut self,
    preceding_symbols: &SymbolSequence,
    preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
    item_set: ParseItemSet<'a>,
) -> ParseStateId {
    // Update the following-tokens relation using the last two preceding
    // symbols: every terminal that can end the first may be followed by
    // every token that can begin the second.
    if preceding_symbols.len() > 1 {
        let left_tokens = self
            .item_set_builder
            .last_set(&preceding_symbols[preceding_symbols.len() - 2]);
        let right_tokens = self
            .item_set_builder
            .first_set(&preceding_symbols[preceding_symbols.len() - 1]);
        for left_token in left_tokens.iter() {
            if left_token.is_terminal() {
                self.following_tokens[left_token.index].insert_all(right_tokens);
            }
        }
    }

    // Fingerprint the item set's unfinished items; the hash is stored on
    // the new ParseState (its use is outside this function's view).
    let mut hasher = DefaultHasher::new();
    item_set.hash_unfinished_items(&mut hasher);
    let unfinished_item_signature = hasher.finish();

    match self.state_ids_by_item_set.entry(item_set) {
        Entry::Occupied(o) => *o.get(),
        Entry::Vacant(v) => {
            let state_id = self.parse_table.states.len();
            self.item_sets_by_state_id.push(v.key().clone());
            self.parse_table.states.push(ParseState {
                lex_state_id: 0,
                terminal_entries: HashMap::new(),
                nonterminal_entries: HashMap::new(),
                unfinished_item_signature,
            });
            self.parse_state_queue.push_back(ParseStateQueueEntry {
                state_id,
                preceding_symbols: preceding_symbols.clone(),
                preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
            });
            v.insert(state_id);
            state_id
        }
    }
}
/// Compute all actions for one parse state: Shift entries for terminal
/// successors, goto entries for non-terminal successors, Reduce/Accept
/// entries for finished items, and ShiftExtra entries for the grammar's
/// extra tokens. Lookaheads with multiple actions of equal precedence are
/// handed off to `handle_conflict`.
fn add_actions(
    &mut self,
    mut preceding_symbols: SymbolSequence,
    mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
    state_id: ParseStateId,
    item_set: ParseItemSet<'a>,
) -> Result<()> {
    let mut terminal_successors = HashMap::new();
    let mut non_terminal_successors = HashMap::new();
    let mut lookaheads_with_conflicts = HashSet::new();

    for (item, lookaheads) in &item_set.entries {
        if let Some(next_symbol) = item.symbol() {
            // The item expects another symbol: group its successor item
            // into the item set for that symbol.
            let successor = item.successor();
            if next_symbol.is_non_terminal() {
                // Keep track of where auxiliary non-terminals (repeat symbols) are
                // used within visible symbols. This information may be needed later
                // for conflict resolution.
                if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
                    preceding_auxiliary_symbols
                        .push(self.get_auxiliary_node_info(&item_set, next_symbol));
                }
                non_terminal_successors
                    .entry(next_symbol)
                    .or_insert_with(|| ParseItemSet::default())
                    .entries
                    .entry(successor)
                    .or_insert_with(|| LookaheadSet::new())
                    .insert_all(lookaheads);
            } else {
                terminal_successors
                    .entry(next_symbol)
                    .or_insert_with(|| ParseItemSet::default())
                    .entries
                    .entry(successor)
                    .or_insert_with(|| LookaheadSet::new())
                    .insert_all(lookaheads);
            }
        } else {
            // The item is finished: emit an Accept action for the augmented
            // start item, otherwise a Reduce action.
            let action = if item.is_augmented() {
                ParseAction::Accept
            } else {
                ParseAction::Reduce {
                    symbol: Symbol::non_terminal(item.variable_index as usize),
                    child_count: item.step_index as usize,
                    precedence: item.precedence(),
                    associativity: item.associativity(),
                    dynamic_precedence: item.production.dynamic_precedence,
                    alias_sequence_id: self.get_alias_sequence_id(item),
                }
            };

            for lookahead in lookaheads.iter() {
                let entry = self.parse_table.states[state_id]
                    .terminal_entries
                    .entry(lookahead);
                let entry = entry.or_insert_with(|| ParseTableEntry::new());

                // A higher-precedence action replaces all existing ones;
                // an equal-precedence action is added alongside them and
                // marks this lookahead as conflicting.
                if entry.actions.is_empty() {
                    entry.actions.push(action);
                } else if action.precedence() > entry.actions[0].precedence() {
                    entry.actions.clear();
                    entry.actions.push(action);
                    lookaheads_with_conflicts.remove(&lookahead);
                } else if action.precedence() == entry.actions[0].precedence() {
                    entry.actions.push(action);
                    lookaheads_with_conflicts.insert(lookahead);
                }
            }
        }
    }

    // Add a Shift action (and its successor state) for each terminal
    // successor.
    for (symbol, next_item_set) in terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();
        let entry = self.parse_table.states[state_id]
            .terminal_entries
            .entry(symbol);
        // If this lookahead already has actions, adding a Shift creates a
        // shift/reduce conflict.
        if let Entry::Occupied(e) = &entry {
            if !e.get().actions.is_empty() {
                lookaheads_with_conflicts.insert(symbol);
            }
        }
        entry
            .or_insert_with(|| ParseTableEntry::new())
            .actions
            .push(ParseAction::Shift {
                state: next_state_id,
                is_repetition: false,
            });
    }

    // Add a goto (nonterminal) entry for each non-terminal successor.
    for (symbol, next_item_set) in non_terminal_successors {
        preceding_symbols.push(symbol);
        let next_state_id = self.add_parse_state(
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            next_item_set,
        );
        preceding_symbols.pop();
        self.parse_table.states[state_id]
            .nonterminal_entries
            .insert(symbol, next_state_id);
    }

    // Attempt to resolve each conflicting lookahead.
    for symbol in lookaheads_with_conflicts {
        self.handle_conflict(
            &item_set,
            state_id,
            &preceding_symbols,
            &preceding_auxiliary_symbols,
            symbol,
        )?;
    }

    // Every extra token gets a reusable ShiftExtra entry, unless the
    // lookahead already has an entry.
    let state = &mut self.parse_table.states[state_id];
    for extra_token in &self.syntax_grammar.extra_tokens {
        state
            .terminal_entries
            .entry(*extra_token)
            .or_insert(ParseTableEntry {
                reusable: true,
                actions: vec![ParseAction::ShiftExtra],
            });
    }

    Ok(())
}
fn handle_conflict(
&mut self,
item_set: &ParseItemSet,
state_id: ParseStateId,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
conflicting_lookahead: Symbol,
) -> Result<()> {
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
// Determine which items in the set conflict with each other, and the
// precedences associated with SHIFT vs REDUCE actions. There won't
// be multiple REDUCE actions with different precedences; that is
// sorted out ahead of time in `add_actions`. But there can still be
// REDUCE-REDUCE conflicts where all actions have the *same*
// precedence, and there can still be SHIFT/REDUCE conflicts.
let reduce_precedence = entry.actions[0].precedence();
let mut considered_associativity = false;
let mut shift_precedence: Option<Range<i32>> = None;
let mut conflicting_items = HashSet::new();
for (item, lookaheads) in &item_set.entries {
if let Some(step) = item.step() {
if item.step_index > 0 {
if self
.item_set_builder
.first_set(&step.symbol)
.contains(&conflicting_lookahead)
{
conflicting_items.insert(item);
let precedence = item.precedence();
if let Some(range) = &mut shift_precedence {
if precedence < range.start {
range.start = precedence;
} else if precedence > range.end {
range.end = precedence;
}
} else {
shift_precedence = Some(precedence..precedence);
}
}
}
} else if lookaheads.contains(&conflicting_lookahead) {
conflicting_items.insert(item);
}
}
if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
let shift_precedence = shift_precedence.unwrap_or(0..0);
// If all of the items in the conflict have the same parent symbol,
// and that parent symbols is auxiliary, then this is just the intentional
// ambiguity associated with a repeat rule. Resolve that class of ambiguity
// by leaving it in the parse table, but marking the SHIFT action with
// an `is_repetition` flag.
let conflicting_variable_index =
conflicting_items.iter().next().unwrap().variable_index;
if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
if conflicting_items
.iter()
.all(|item| item.variable_index == conflicting_variable_index)
{
*is_repetition = true;
return Ok(());
}
}
// If the SHIFT action has higher precedence, remove all the REDUCE actions.
if shift_precedence.start > reduce_precedence
|| (shift_precedence.start == reduce_precedence
&& shift_precedence.end > reduce_precedence)
{
entry.actions.drain(0..entry.actions.len() - 1);
}
// If the REDUCE actions have higher precedence, remove the SHIFT action.
else if shift_precedence.end < reduce_precedence
|| (shift_precedence.end == reduce_precedence
&& shift_precedence.start < reduce_precedence)
{
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
// If the SHIFT and REDUCE actions have the same precedence, consider
// the REDUCE actions' associativity.
else if shift_precedence == (reduce_precedence..reduce_precedence) {
considered_associativity = true;
let mut has_left = false;
let mut has_right = false;
let mut has_non = false;
for action in &entry.actions {
if let ParseAction::Reduce { associativity, .. } = action {
match associativity {
Some(Associativity::Left) => has_left = true,
Some(Associativity::Right) => has_right = true,
None => has_non = true,
}
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
(false, false, true) => {
entry.actions.drain(0..entry.actions.len() - 1);
}
_ => {}
}
}
}
// If all of the actions but one have been eliminated, then there's no problem.
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
if entry.actions.len() == 1 {
return Ok(());
}
// Determine the set of parent symbols involved in this conflict.
let mut actual_conflict = Vec::new();
for item in &conflicting_items {
let symbol = Symbol::non_terminal(item.variable_index as usize);
if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
actual_conflict.extend(
preceding_auxiliary_symbols
.iter()
.rev()
.find_map(|info| {
if info.auxiliary_symbol == symbol {
Some(&info.parent_symbols)
} else {
None
}
})
.unwrap()
.iter(),
);
} else {
actual_conflict.push(symbol);
}
}
actual_conflict.sort_unstable();
actual_conflict.dedup();
// If this set of symbols has been whitelisted, then there's no error.
if self
.syntax_grammar
.expected_conflicts
.contains(&actual_conflict)
{
return Ok(());
}
let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols {
write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
}
write!(
&mut msg,
" • {} …\n\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
write!(&mut msg, "Possible interpretations:\n\n").unwrap();
for (i, item) in conflicting_items.iter().enumerate() {
write!(&mut msg, " {}:", i + 1).unwrap();
for preceding_symbol in preceding_symbols
.iter()
.take(preceding_symbols.len() - item.step_index as usize)
{
write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
write!(
&mut msg,
" ({}",
&self.syntax_grammar.variables[item.variable_index as usize].name
)
.unwrap();
for (j, step) in item.production.steps.iter().enumerate() {
if j as u32 == item.step_index {
write!(&mut msg, "").unwrap();
}
write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
}
write!(&mut msg, ")").unwrap();
if item.is_done() {
write!(
&mut msg,
" • {}",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence();
let associativity = item.associativity();
if precedence != 0 || associativity.is_some() {
write!(
&mut msg,
"(precedence: {}, associativity: {:?})",
precedence, associativity
)
.unwrap();
}
write!(&mut msg, "\n").unwrap();
}
let mut resolution_count = 0;
write!(&mut msg, "\nPossible resolutions:\n\n").unwrap();
let shift_items = conflicting_items
.iter()
.filter(|i| !i.is_done())
.cloned()
.collect::<Vec<_>>();
if shift_items.len() > 0 {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in",
resolution_count
)
.unwrap();
for (i, item) in shift_items.iter().enumerate() {
if i > 0 {
write!(&mut msg, " and").unwrap();
}
write!(
&mut msg,
" `{}`",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
write!(&mut msg, " than in the other rules.\n").unwrap();
}
if considered_associativity {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a left or right associativity in ",
resolution_count
)
.unwrap();
for (i, item) in conflicting_items.iter().filter(|i| i.is_done()).enumerate() {
if i > 0 {
write!(&mut msg, " and ").unwrap();
}
write!(
&mut msg,
"{}",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
write!(&mut msg, "\n").unwrap();
}
for item in &conflicting_items {
if item.is_done() {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in `{}` than in the other rules.\n",
resolution_count,
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
}
resolution_count += 1;
write!(
&mut msg,
" {}: Add a conflict for these rules: ",
resolution_count
)
.unwrap();
for (i, symbol) in actual_conflict.iter().enumerate() {
if i > 0 {
write!(&mut msg, ", ").unwrap();
}
write!(&mut msg, "{}", self.symbol_name(symbol)).unwrap();
}
write!(&mut msg, "\n").unwrap();
Err(Error(msg))
}
/// Collect information about an auxiliary symbol: the set of non-auxiliary
/// parent variables whose items in `item_set` are currently positioned at
/// `symbol`.
fn get_auxiliary_node_info(
    &self,
    item_set: &ParseItemSet,
    symbol: Symbol,
) -> AuxiliarySymbolInfo {
    // Find every non-auxiliary variable that has an item whose next symbol
    // is the given auxiliary symbol; those are the "real" parents.
    let parent_symbols = item_set
        .entries
        .keys()
        .filter_map(|item| {
            let variable_index = item.variable_index as usize;
            if item.symbol() == Some(symbol)
                && !self.syntax_grammar.variables[variable_index].is_auxiliary()
            {
                Some(Symbol::non_terminal(variable_index))
            } else {
                None
            }
        })
        .collect();
    AuxiliarySymbolInfo {
        auxiliary_symbol: symbol,
        parent_symbols,
    }
}
/// Record, in the parse table's symbol list, every symbol that is actually
/// referenced by at least one parse state, in a canonical order: external
/// tokens, then end-of-input, then terminals, then non-terminals.
fn populate_used_symbols(&mut self) {
    let mut used_terminals = vec![false; self.lexical_grammar.variables.len()];
    let mut used_non_terminals = vec![false; self.syntax_grammar.variables.len()];
    let mut used_externals = vec![false; self.syntax_grammar.external_tokens.len()];

    // Mark every symbol that appears in some state's entries.
    for state in self.parse_table.states.iter() {
        for symbol in state.terminal_entries.keys() {
            match symbol.kind {
                SymbolType::Terminal => used_terminals[symbol.index] = true,
                SymbolType::External => used_externals[symbol.index] = true,
                _ => {}
            }
        }
        for symbol in state.nonterminal_entries.keys() {
            used_non_terminals[symbol.index] = true;
        }
    }

    // Emit the used symbols grouped by kind, each group in index order.
    let symbols = &mut self.parse_table.symbols;
    symbols.extend(
        used_externals
            .into_iter()
            .enumerate()
            .filter(|(_, used)| *used)
            .map(|(i, _)| Symbol::external(i)),
    );
    symbols.push(Symbol::end());
    symbols.extend(
        used_terminals
            .into_iter()
            .enumerate()
            .filter(|(_, used)| *used)
            .map(|(i, _)| Symbol::terminal(i)),
    );
    symbols.extend(
        used_non_terminals
            .into_iter()
            .enumerate()
            .filter(|(_, used)| *used)
            .map(|(i, _)| Symbol::non_terminal(i)),
    );
}
/// Clear the precedence and associativity fields of every Reduce action.
/// Once conflicts have been resolved, this information is no longer needed,
/// and clearing it lets otherwise-identical entries compare equal.
fn remove_precedences(&mut self) {
    for state in self.parse_table.states.iter_mut() {
        for entry in state.terminal_entries.values_mut() {
            for action in entry.actions.iter_mut() {
                if let ParseAction::Reduce {
                    precedence,
                    associativity,
                    ..
                } = action
                {
                    *precedence = 0;
                    *associativity = None;
                }
            }
        }
    }
}
/// Intern the item's production alias sequence (one optional alias per
/// step, with trailing `None`s trimmed) in the parse table, returning its
/// index. Also tracks the longest aliased production length seen so far.
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
    let mut alias_sequence: Vec<Option<Alias>> = item
        .production
        .steps
        .iter()
        .map(|s| s.alias.clone())
        .collect();
    // Trailing `None` aliases carry no information; trim them so that
    // productions differing only in trailing unaliased steps can share
    // a sequence.
    while alias_sequence.last() == Some(&None) {
        alias_sequence.pop();
    }
    if item.production.steps.len() > self.parse_table.max_aliased_production_length {
        self.parse_table.max_aliased_production_length = item.production.steps.len()
    }
    // Reuse an existing identical sequence if one has already been stored.
    if let Some(index) = self
        .parse_table
        .alias_sequences
        .iter()
        .position(|seq| *seq == alias_sequence)
    {
        index
    } else {
        self.parse_table.alias_sequences.push(alias_sequence);
        self.parse_table.alias_sequences.len() - 1
    }
}
/// Produce a human-readable name for `symbol`, used in error messages.
/// Anonymous terminals are rendered surrounded by double quotes.
fn symbol_name(&self, symbol: &Symbol) -> String {
    match symbol.kind {
        SymbolType::End => String::from("EOF"),
        SymbolType::External => {
            let token = &self.syntax_grammar.external_tokens[symbol.index];
            token.name.clone()
        }
        SymbolType::NonTerminal => {
            self.syntax_grammar.variables[symbol.index].name.clone()
        }
        SymbolType::Terminal => {
            let variable = &self.lexical_grammar.variables[symbol.index];
            if variable.kind == VariableType::Named {
                variable.name.clone()
            } else {
                format!("\"{}\"", &variable.name)
            }
        }
    }
}
}
/// Build the parse table for the given grammars. Returns the table together
/// with one `LookaheadSet` per lexical token (the `following_tokens` data
/// accumulated during construction). `state_ids_to_log` selects parse
/// states for verbose logging while the table is built.
pub(crate) fn build_parse_table(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    inlines: &InlinedProductionMap,
    state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, Vec<LookaheadSet>)> {
    ParseTableBuilder {
        syntax_grammar,
        lexical_grammar,
        state_ids_to_log,
        item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
        state_ids_by_item_set: HashMap::new(),
        item_sets_by_state_id: Vec::new(),
        parse_state_queue: VecDeque::new(),
        parse_table: ParseTable {
            states: Vec::new(),
            symbols: Vec::new(),
            alias_sequences: Vec::new(),
            max_aliased_production_length: 0,
        },
        following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()],
    }
    .build()
}

View file

@ -0,0 +1,71 @@
use crate::grammars::LexicalGrammar;
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::fmt;
/// An index recording, for every pair of tokens, the parse states in which
/// both tokens are valid lookaheads.
pub(crate) struct CoincidentTokenIndex<'a> {
    // Flattened symmetric n×n matrix: the slot for the unordered pair
    // (a, b) lists the ids of the states where both tokens appear.
    entries: Vec<Vec<ParseStateId>>,
    grammar: &'a LexicalGrammar,
    // Number of tokens in the lexical grammar (matrix dimension).
    n: usize,
}
impl<'a> CoincidentTokenIndex<'a> {
    /// Build the index by scanning every parse state: two tokens coincide
    /// in a state when both appear among its terminal lookahead entries.
    pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
        let n = lexical_grammar.variables.len();
        let mut result = Self {
            n,
            grammar: lexical_grammar,
            entries: vec![Vec::new(); n * n],
        };
        for (i, state) in table.states.iter().enumerate() {
            for symbol in state.terminal_entries.keys() {
                for other_symbol in state.terminal_entries.keys() {
                    let index = result.index(symbol.index, other_symbol.index);
                    // States are visited in increasing order, so checking
                    // the last element suffices to avoid duplicate ids.
                    if result.entries[index].last().cloned() != Some(i) {
                        result.entries[index].push(i);
                    }
                }
            }
        }
        result
    }

    /// The ids of all states in which both `a` and `b` are valid lookaheads.
    pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
        &self.entries[self.index(a.index, b.index)]
    }

    /// Whether `a` and `b` are ever valid lookaheads in the same state.
    pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
        !self.entries[self.index(a.index, b.index)].is_empty()
    }

    // Map an unordered pair of token indices to a single slot in the
    // flattened matrix, so (a, b) and (b, a) share the same entry.
    fn index(&self, a: usize, b: usize) -> usize {
        if a < b {
            a * self.n + b
        } else {
            b * self.n + a
        }
    }
}
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
    /// Render the index as a nested map from token names to the *count* of
    /// states in which each pair of tokens co-occurs.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "CoincidentTokenIndex {{\n")?;
        write!(f, " entries: {{\n")?;
        for i in 0..self.n {
            write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
            for j in 0..self.n {
                write!(
                    f,
                    " {}: {:?},\n",
                    self.grammar.variables[j].name,
                    self.entries[self.index(i, j)].len()
                )?;
            }
            write!(f, " }},\n")?;
        }
        write!(f, " }},")?;
        write!(f, "}}")?;
        Ok(())
    }
}

View file

@ -0,0 +1,446 @@
use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
use crate::rules::Associativity;
use crate::rules::{Symbol, SymbolType};
use smallbitvec::SmallBitVec;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::u32;
lazy_static! {
    // The synthetic production used by the augmented start item: a single
    // step referring to the grammar's root non-terminal (index 0), with no
    // precedence, associativity, or alias.
    static ref START_PRODUCTION: Production = Production {
        dynamic_precedence: 0,
        steps: vec![ProductionStep {
            symbol: Symbol {
                index: 0,
                kind: SymbolType::NonTerminal,
            },
            precedence: 0,
            associativity: None,
            alias: None,
        }],
    };
}
/// A set of lookahead symbols: terminal tokens, external tokens, and
/// optionally the end-of-input marker. Backed by bit vectors for
/// compactness; non-terminals cannot be stored.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct LookaheadSet {
    terminal_bits: SmallBitVec,
    external_bits: SmallBitVec,
    eof: bool,
}
/// An LR item: a position (`step_index`) within one production of the
/// variable `variable_index`. The augmented start item is identified by
/// `variable_index == u32::MAX`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ParseItem<'a> {
    pub variable_index: u32,
    pub step_index: u32,
    pub production: &'a Production,
}
/// A set of parse items, each mapped to its lookahead symbols. A BTreeMap
/// keeps iteration order deterministic, which matters for hashing and for
/// reproducible table generation.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet<'a> {
    pub entries: BTreeMap<ParseItem<'a>, LookaheadSet>,
}
/// Wrapper for displaying a parse item with human-readable symbol names
/// taken from the two grammars.
pub(crate) struct ParseItemDisplay<'a>(
    pub &'a ParseItem<'a>,
    pub &'a SyntaxGrammar,
    pub &'a LexicalGrammar,
);

/// Wrapper for displaying a lookahead set with human-readable token names.
pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar);

/// Wrapper for displaying an entire item set, one item per line.
#[allow(dead_code)]
pub(crate) struct ParseItemSetDisplay<'a>(
    pub &'a ParseItemSet<'a>,
    pub &'a SyntaxGrammar,
    pub &'a LexicalGrammar,
);
impl LookaheadSet {
    /// An empty set: no terminals, no external tokens, no EOF.
    pub fn new() -> Self {
        Self {
            terminal_bits: SmallBitVec::new(),
            external_bits: SmallBitVec::new(),
            eof: false,
        }
    }

    /// Iterate the set's symbols: terminals first (by index), then external
    /// tokens (by index), then the end-of-input symbol if present.
    pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
        self.terminal_bits
            .iter()
            .enumerate()
            .filter_map(|(i, value)| {
                if value {
                    Some(Symbol::terminal(i))
                } else {
                    None
                }
            })
            .chain(
                self.external_bits
                    .iter()
                    .enumerate()
                    .filter_map(|(i, value)| {
                        if value {
                            Some(Symbol::external(i))
                        } else {
                            None
                        }
                    }),
            )
            .chain(if self.eof { Some(Symbol::end()) } else { None })
    }

    /// Build a set containing the given symbols.
    pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
        let mut result = Self::new();
        for symbol in symbols {
            result.insert(symbol);
        }
        result
    }

    /// Whether the set contains `symbol`. Panics if `symbol` is a
    /// non-terminal, which can never be a lookahead.
    pub fn contains(&self, symbol: &Symbol) -> bool {
        match symbol.kind {
            SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
            SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
            SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
            SymbolType::End => self.eof,
        }
    }

    /// Insert `other` into the set, growing the relevant bit vector if
    /// needed. Panics if `other` is a non-terminal.
    pub fn insert(&mut self, other: Symbol) {
        let vec = match other.kind {
            SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
            SymbolType::Terminal => &mut self.terminal_bits,
            SymbolType::External => &mut self.external_bits,
            SymbolType::End => {
                self.eof = true;
                return;
            }
        };
        if other.index >= vec.len() {
            vec.resize(other.index + 1, false);
        }
        vec.set(other.index, true);
    }

    /// Union `other` into `self`. Returns true if any symbol that was not
    /// already present was added (used to detect fixed points).
    pub fn insert_all(&mut self, other: &LookaheadSet) -> bool {
        let mut result = false;
        if other.terminal_bits.len() > self.terminal_bits.len() {
            self.terminal_bits.resize(other.terminal_bits.len(), false);
        }
        if other.external_bits.len() > self.external_bits.len() {
            self.external_bits.resize(other.external_bits.len(), false);
        }
        for (i, element) in other.terminal_bits.iter().enumerate() {
            if element {
                // `result` becomes true only when the bit was previously unset.
                result |= !self.terminal_bits[i];
                self.terminal_bits.set(i, element);
            }
        }
        for (i, element) in other.external_bits.iter().enumerate() {
            if element {                result |= !self.external_bits[i];
                self.external_bits.set(i, element);
            }
        }
        if other.eof {
            result |= !self.eof;
            self.eof = true;
        }
        result
    }
}
impl<'a> ParseItem<'a> {
    /// The augmented start item: a synthetic item whose single step is the
    /// grammar's root symbol (see `START_PRODUCTION`).
    pub fn start() -> Self {
        ParseItem {
            variable_index: u32::MAX,
            production: &START_PRODUCTION,
            step_index: 0,
        }
    }

    /// The step at the current position, or `None` if the item is done.
    pub fn step(&self) -> Option<&'a ProductionStep> {
        self.production.steps.get(self.step_index as usize)
    }

    /// The symbol at the current position, if any.
    pub fn symbol(&self) -> Option<Symbol> {
        self.step().map(|step| step.symbol)
    }

    /// Associativity of the most recently consumed step, if any.
    pub fn associativity(&self) -> Option<Associativity> {
        self.prev_step().and_then(|step| step.associativity)
    }

    /// Precedence of the most recently consumed step, or 0 if none has
    /// been consumed yet.
    pub fn precedence(&self) -> i32 {
        self.prev_step().map_or(0, |step| step.precedence)
    }

    /// The step immediately before the current position, if any.
    pub fn prev_step(&self) -> Option<&'a ProductionStep> {
        if self.step_index > 0 {
            Some(&self.production.steps[self.step_index as usize - 1])
        } else {
            None
        }
    }

    /// Whether every step of the production has been consumed.
    pub fn is_done(&self) -> bool {
        self.step_index as usize == self.production.steps.len()
    }

    /// Whether this is the synthetic start item.
    pub fn is_augmented(&self) -> bool {
        self.variable_index == u32::MAX
    }

    /// A copy of this item advanced past the current step.
    pub fn successor(&self) -> ParseItem<'a> {
        ParseItem {
            variable_index: self.variable_index,
            production: self.production,
            step_index: self.step_index + 1,
        }
    }
}
impl<'a> ParseItemSet<'a> {
    /// Build an item set from (item, lookaheads) pairs.
    pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, LookaheadSet)>) -> Self {
        let mut result = Self::default();
        for (item, lookaheads) in elements {
            result.entries.insert(item, lookaheads);
        }
        result
    }

    /// Feed a signature of this set's item positions into `h`, writing each
    /// (variable_index, step_index) pair at most once per consecutive run.
    ///
    /// NOTE(review): due to operator precedence the condition below parses
    /// as `(step().is_none() && variable differs) || step differs` — confirm
    /// this grouping (rather than `is_none() && (either differs)`) is the
    /// intended semantics.
    pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
        let mut previous_variable_index = u32::MAX;
        let mut previous_step_index = u32::MAX;
        for item in self.entries.keys() {
            if item.step().is_none() && item.variable_index != previous_variable_index
                || item.step_index != previous_step_index
            {
                h.write_u32(item.variable_index);
                h.write_u32(item.step_index);
                previous_variable_index = item.variable_index;
                previous_step_index = item.step_index;
            }
        }
    }
}
impl<'a> Default for ParseItemSet<'a> {
fn default() -> Self {
Self {
entries: BTreeMap::new(),
}
}
}
#[allow(dead_code)]
impl<'a> fmt::Display for ParseItemDisplay<'a> {
    /// Format an item as `name → step step …`, annotating the current
    /// position and the precedence/associativity attached to steps.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        if self.0.is_augmented() {
            write!(f, "START →")?;
        } else {
            write!(
                f,
                "{} →",
                &self.1.variables[self.0.variable_index as usize].name
            )?;
        }

        for (i, step) in self.0.production.steps.iter().enumerate() {
            if i == self.0.step_index as usize {
                // NOTE(review): this write emits nothing — the dot marker for
                // the current position appears to have been lost from the
                // format string; confirm against the original source.
                write!(f, "")?;
                if step.precedence != 0 || step.associativity.is_some() {
                    write!(
                        f,
                        " (prec {:?} assoc {:?})",
                        step.precedence, step.associativity
                    )?;
                }
            }
            write!(f, " ")?;
            // Terminal indices without a corresponding lexical variable are
            // rendered with a numeric fallback name.
            if step.symbol.is_terminal() {
                if let Some(variable) = self.2.variables.get(step.symbol.index) {
                    write!(f, "{}", &variable.name)?;
                } else {
                    write!(f, "{}-{}", "terminal", step.symbol.index)?;
                }
            } else if step.symbol.is_external() {
                write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
            } else {
                write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
            }
            if let Some(alias) = &step.alias {
                write!(f, " (alias {})", alias.value)?;
            }
        }

        if self.0.is_done() {
            // NOTE(review): empty write — see the note above about the
            // apparently-missing position marker.
            write!(f, "")?;
            if let Some(step) = self.0.production.steps.last() {
                if step.precedence != 0 || step.associativity.is_some() {
                    write!(
                        f,
                        " (prec {:?} assoc {:?})",
                        step.precedence, step.associativity
                    )?;
                }
            }
        }

        Ok(())
    }
}
impl<'a> fmt::Display for LookaheadSetDisplay<'a> {
    /// Format as a bracketed, comma-separated list of token names.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(f, "[")?;
        for (i, symbol) in self.0.iter().enumerate() {
            if i > 0 {
                write!(f, ", ")?;
            }
            // Terminal indices without a corresponding lexical variable are
            // rendered with a numeric fallback name.
            if symbol.is_terminal() {
                if let Some(variable) = self.2.variables.get(symbol.index) {
                    write!(f, "{}", &variable.name)?;
                } else {
                    write!(f, "{}-{}", "terminal", symbol.index)?;
                }
            } else if symbol.is_external() {
                write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
            } else {
                write!(f, "{}", &self.1.variables[symbol.index].name)?;
            }
        }
        write!(f, "]")?;
        Ok(())
    }
}
impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
    /// Write one line per item: the item's rendering, a tab, then its
    /// lookahead set.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        for (item, lookaheads) in self.0.entries.iter() {
            writeln!(
                f,
                "{}\t{}",
                ParseItemDisplay(item, self.1, self.2),
                LookaheadSetDisplay(lookaheads, self.1, self.2)
            )?;
        }
        Ok(())
    }
}
impl<'a> Hash for ParseItem<'a> {
    /// Must stay consistent with the manual `PartialEq` impl below: hash
    /// the position, dynamic precedence, step count, preceding
    /// precedence/associativity, the aliases of consumed steps, and the
    /// remaining steps in full.
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        hasher.write_u32(self.variable_index);
        hasher.write_u32(self.step_index);
        hasher.write_i32(self.production.dynamic_precedence);
        hasher.write_usize(self.production.steps.len());
        hasher.write_i32(self.precedence());
        self.associativity().hash(hasher);
        // For steps already consumed, only their aliases are hashed.
        for step in &self.production.steps[0..self.step_index as usize] {
            step.alias.hash(hasher);
        }
        // Steps at or after the current position are hashed in full.
        for step in &self.production.steps[self.step_index as usize..] {
            step.hash(hasher);
        }
    }
}
impl<'a> PartialEq for ParseItem<'a> {
    /// Items are equal when they agree on position, dynamic precedence,
    /// step count, preceding precedence/associativity, the aliases of the
    /// consumed steps, and the full contents of the remaining steps.
    fn eq(&self, other: &Self) -> bool {
        // Cheap scalar comparisons first.
        if self.variable_index != other.variable_index
            || self.step_index != other.step_index
            || self.production.dynamic_precedence != other.production.dynamic_precedence
            || self.production.steps.len() != other.production.steps.len()
            || self.precedence() != other.precedence()
            || self.associativity() != other.associativity()
        {
            return false;
        }

        // The step counts are equal at this point, so a zipped walk covers
        // every step. Consumed steps only need matching aliases; steps at
        // or after the current position must match exactly.
        let consumed = self.step_index as usize;
        self.production
            .steps
            .iter()
            .zip(other.production.steps.iter())
            .enumerate()
            .all(|(i, (ours, theirs))| {
                if i < consumed {
                    ours.alias == theirs.alias
                } else {
                    ours == theirs
                }
            })
    }
}
impl<'a> Ord for ParseItem<'a> {
    /// Total order consistent with the manual `PartialEq`: compare scalar
    /// fields first, then the steps (aliases only for consumed steps).
    fn cmp(&self, other: &Self) -> Ordering {
        self.variable_index
            .cmp(&other.variable_index)
            .then_with(|| self.step_index.cmp(&other.step_index))
            .then_with(|| {
                self.production
                    .dynamic_precedence
                    .cmp(&other.production.dynamic_precedence)
            })
            .then_with(|| {
                self.production
                    .steps
                    .len()
                    .cmp(&other.production.steps.len())
            })
            .then_with(|| self.precedence().cmp(&other.precedence()))
            .then_with(|| self.associativity().cmp(&other.associativity()))
            .then_with(|| {
                // Step counts are equal once we get here, so zipping covers
                // every step of both productions.
                let consumed = self.step_index as usize;
                for (i, (ours, theirs)) in self
                    .production
                    .steps
                    .iter()
                    .zip(other.production.steps.iter())
                    .enumerate()
                {
                    let ordering = if i < consumed {
                        ours.alias.cmp(&theirs.alias)
                    } else {
                        ours.cmp(theirs)
                    };
                    if ordering != Ordering::Equal {
                        return ordering;
                    }
                }
                Ordering::Equal
            })
    }
}
impl<'a> PartialOrd for ParseItem<'a> {
    /// Delegates to the total ordering defined above.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl<'a> Eq for ParseItem<'a> {}
impl<'a> Hash for ParseItemSet<'a> {
    /// Hash the full contents: the entry count followed by each
    /// (item, lookaheads) pair, in the BTreeMap's sorted order so the hash
    /// is deterministic.
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        hasher.write_usize(self.entries.len());
        for (item, lookaheads) in self.entries.iter() {
            item.hash(hasher);
            lookaheads.hash(hasher);
        }
    }
}

View file

@ -0,0 +1,330 @@
use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet};
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::Symbol;
use hashbrown::{HashMap, HashSet};
use std::fmt;
/// An item that must be added to a set during transitive closure, together
/// with its associated lookahead information.
#[derive(Clone, Debug, PartialEq, Eq)]
struct TransitiveClosureAddition<'a> {
    item: ParseItem<'a>,
    info: FollowSetInfo,
}

/// Lookahead tokens that can follow an item, plus whether the item can also
/// inherit the lookaheads of the non-terminal being expanded.
#[derive(Clone, Debug, PartialEq, Eq)]
struct FollowSetInfo {
    lookaheads: LookaheadSet,
    propagates_lookaheads: bool,
}
/// Precomputed data for expanding parse item sets into their transitive
/// closures without recursion.
pub(crate) struct ParseItemSetBuilder<'a> {
    syntax_grammar: &'a SyntaxGrammar,
    lexical_grammar: &'a LexicalGrammar,
    // FIRST / LAST sets: the terminals that can begin / end each symbol.
    first_sets: HashMap<Symbol, LookaheadSet>,
    last_sets: HashMap<Symbol, LookaheadSet>,
    inlines: &'a InlinedProductionMap,
    // For each non-terminal, the items (with lookahead info) to add to an
    // item set whenever that non-terminal appears after an item's position.
    transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
}
// Append `value` to `vector` unless an equal element is already present,
// preserving insertion order (a poor man's ordered set).
fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
    let already_present = vector.contains(&value);
    if !already_present {
        vector.push(value);
    }
}
impl<'a> ParseItemSetBuilder<'a> {
    /// Precompute the FIRST and LAST sets for every grammar symbol and, for
    /// each non-terminal, the flattened list of item-set *additions* needed
    /// to expand that non-terminal during transitive closure.
    pub fn new(
        syntax_grammar: &'a SyntaxGrammar,
        lexical_grammar: &'a LexicalGrammar,
        inlines: &'a InlinedProductionMap,
    ) -> Self {
        let mut result = Self {
            syntax_grammar,
            lexical_grammar,
            first_sets: HashMap::new(),
            last_sets: HashMap::new(),
            inlines,
            transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
        };

        // For each grammar symbol, populate the FIRST and LAST sets: the set
        // of terminals that appear at the beginning and end of that symbol's
        // productions, respectively.
        //
        // For a terminal symbol, the FIRST and LAST set just consists of the
        // terminal itself.
        for i in 0..lexical_grammar.variables.len() {
            let symbol = Symbol::terminal(i);
            let mut set = LookaheadSet::new();
            set.insert(symbol);
            result.first_sets.insert(symbol, set.clone());
            result.last_sets.insert(symbol, set);
        }
        for i in 0..syntax_grammar.external_tokens.len() {
            let symbol = Symbol::external(i);
            let mut set = LookaheadSet::new();
            set.insert(symbol);
            result.first_sets.insert(symbol, set.clone());
            result.last_sets.insert(symbol, set);
        }

        // The FIRST set of a non-terminal `i` is the union of the following sets:
        // * the set of all terminals that appear at the beginnings of i's productions
        // * the FIRST sets of all the non-terminals that appear at the beginnings
        //   of i's productions
        //
        // Rather than computing these sets using recursion, we use an explicit
        // stack called `symbols_to_process`.
        let mut symbols_to_process = Vec::new();
        let mut processed_non_terminals = HashSet::new();
        for i in 0..syntax_grammar.variables.len() {
            let symbol = Symbol::non_terminal(i);

            let first_set = &mut result
                .first_sets
                .entry(symbol)
                .or_insert(LookaheadSet::new());
            processed_non_terminals.clear();
            symbols_to_process.clear();
            symbols_to_process.push(symbol);
            while let Some(current_symbol) = symbols_to_process.pop() {
                if current_symbol.is_terminal() || current_symbol.is_external() {
                    first_set.insert(current_symbol);
                } else if processed_non_terminals.insert(current_symbol) {
                    for production in syntax_grammar.variables[current_symbol.index]
                        .productions
                        .iter()
                    {
                        if let Some(step) = production.steps.first() {
                            symbols_to_process.push(step.symbol);
                        }
                    }
                }
            }

            // The LAST set is defined in a similar way to the FIRST set,
            // using the *last* step of each production.
            let last_set = &mut result
                .last_sets
                .entry(symbol)
                .or_insert(LookaheadSet::new());
            processed_non_terminals.clear();
            symbols_to_process.clear();
            symbols_to_process.push(symbol);
            while let Some(current_symbol) = symbols_to_process.pop() {
                if current_symbol.is_terminal() || current_symbol.is_external() {
                    last_set.insert(current_symbol);
                } else if processed_non_terminals.insert(current_symbol) {
                    for production in syntax_grammar.variables[current_symbol.index]
                        .productions
                        .iter()
                    {
                        if let Some(step) = production.steps.last() {
                            symbols_to_process.push(step.symbol);
                        }
                    }
                }
            }
        }

        // To compute an item set's transitive closure, we find each item in the set
        // whose next symbol is a non-terminal, and we add new items to the set for
        // each of that symbols' productions. These productions might themselves begin
        // with non-terminals, so the process continues recursively. In this process,
        // the total set of entries that get added depends only on two things:
        // * the set of non-terminal symbols that occur at each item's current position
        // * the set of terminals that occurs after each of these non-terminal symbols
        //
        // So we can avoid a lot of duplicated recursive work by precomputing, for each
        // non-terminal symbol `i`, a final list of *additions* that must be made to an
        // item set when `i` occurs as the next symbol in one if its core items. The
        // structure of an *addition* is as follows:
        // * `item` - the new item that must be added as part of the expansion of `i`
        // * `lookaheads` - lookahead tokens that can always come after that item in
        //   the expansion of `i`
        // * `propagates_lookaheads` - a boolean indicating whether or not `item` can
        //   occur at the *end* of the expansion of `i`, so that i's own current
        //   lookahead tokens can occur after `item`.
        //
        // Again, rather than computing these additions recursively, we use an explicit
        // stack called `entries_to_process`.
        for i in 0..syntax_grammar.variables.len() {
            let empty_lookaheads = LookaheadSet::new();
            let mut entries_to_process = vec![(i, &empty_lookaheads, true)];

            // First, build up a map whose keys are all of the non-terminals that can
            // appear at the beginning of non-terminal `i`, and whose values store
            // information about the tokens that can follow each non-terminal.
            let mut follow_set_info_by_non_terminal = HashMap::new();
            while let Some(entry) = entries_to_process.pop() {
                let (variable_index, lookaheads, propagates_lookaheads) = entry;
                let existing_info = follow_set_info_by_non_terminal
                    .entry(variable_index)
                    .or_insert_with(|| FollowSetInfo {
                        lookaheads: LookaheadSet::new(),
                        propagates_lookaheads: false,
                    });

                // Only recurse into a non-terminal's productions when this
                // visit contributed new follow-set information.
                let did_add_follow_set_info;
                if propagates_lookaheads {
                    did_add_follow_set_info = !existing_info.propagates_lookaheads;
                    existing_info.propagates_lookaheads = true;
                } else {
                    did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
                }
                if did_add_follow_set_info {
                    for production in &syntax_grammar.variables[variable_index].productions {
                        if let Some(symbol) = production.first_symbol() {
                            if symbol.is_non_terminal() {
                                if production.steps.len() == 1 {
                                    entries_to_process.push((
                                        symbol.index,
                                        lookaheads,
                                        propagates_lookaheads,
                                    ));
                                } else {
                                    entries_to_process.push((
                                        symbol.index,
                                        &result.first_sets[&production.steps[1].symbol],
                                        false,
                                    ));
                                }
                            }
                        }
                    }
                }
            }

            // Store all of those non-terminals' productions, along with their
            // associated lookahead info, as *additions* for non-terminal `i`.
            // Inlined variables are skipped; their productions are substituted
            // via `inlined_productions` instead.
            let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
            for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
                let variable = &syntax_grammar.variables[variable_index];
                let non_terminal = Symbol::non_terminal(variable_index);
                let variable_index = variable_index as u32;
                if syntax_grammar.variables_to_inline.contains(&non_terminal) {
                    continue;
                }
                for production in &variable.productions {
                    let item = ParseItem {
                        variable_index,
                        production,
                        step_index: 0,
                    };
                    if let Some(inlined_productions) =
                        inlines.inlined_productions(item.production, item.step_index)
                    {
                        for production in inlined_productions {
                            find_or_push(
                                additions_for_non_terminal,
                                TransitiveClosureAddition {
                                    item: ParseItem {
                                        variable_index,
                                        production,
                                        step_index: item.step_index,
                                    },
                                    info: follow_set_info.clone(),
                                },
                            );
                        }
                    } else {
                        find_or_push(
                            additions_for_non_terminal,
                            TransitiveClosureAddition {
                                item,
                                info: follow_set_info.clone(),
                            },
                        );
                    }
                }
            }
        }

        result
    }

    /// Expand `item_set` into its transitive closure, substituting inlined
    /// productions where applicable.
    pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
        let mut result = ParseItemSet::default();
        for (item, lookaheads) in &item_set.entries {
            if let Some(productions) = self
                .inlines
                .inlined_productions(item.production, item.step_index)
            {
                for production in productions {
                    self.add_item(
                        &mut result,
                        ParseItem {
                            variable_index: item.variable_index,
                            production,
                            step_index: item.step_index,
                        },
                        lookaheads,
                    );
                }
            } else {
                self.add_item(&mut result, *item, lookaheads);
            }
        }
        result
    }

    /// The terminals that can begin the given symbol.
    pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet {
        &self.first_sets[symbol]
    }

    /// The terminals that can end the given symbol.
    pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet {
        // Bug fix: this previously returned `&self.first_sets[symbol]`,
        // making `last_set` an accidental duplicate of `first_set` even
        // though `last_sets` is populated in `new`.
        &self.last_sets[symbol]
    }

    /// Insert `item` (with `lookaheads`) into `set`; if the item's next
    /// symbol is a non-terminal, also apply the precomputed additions that
    /// expand it.
    fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
        if let Some(step) = item.step() {
            if step.symbol.is_non_terminal() {
                let next_step = item.successor().step();

                // Determine which tokens can follow this non-terminal: the
                // FIRST set of the next step, or this item's own lookaheads
                // when the non-terminal is the production's last step.
                let following_tokens = if let Some(next_step) = next_step {
                    self.first_sets.get(&next_step.symbol).unwrap()
                } else {
                    &lookaheads
                };

                // Use the pre-computed *additions* to expand the non-terminal.
                for addition in &self.transitive_closure_additions[step.symbol.index] {
                    let lookaheads = set
                        .entries
                        .entry(addition.item)
                        .or_insert_with(|| LookaheadSet::new());
                    lookaheads.insert_all(&addition.info.lookaheads);
                    if addition.info.propagates_lookaheads {
                        lookaheads.insert_all(following_tokens);
                    }
                }
            }
        }

        set.entries.insert(item, lookaheads.clone());
    }
}
impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
    /// Render the precomputed transitive-closure additions, grouped by the
    /// non-terminal variable they expand.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "ParseItemSetBuilder {{\n")?;
        write!(f, " additions: {{\n")?;
        for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
            write!(f, " {}: {{\n", variable.name)?;
            for addition in &self.transitive_closure_additions[i] {
                write!(
                    f,
                    " {}\n",
                    ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar)
                )?;
            }
            write!(f, " }},\n")?;
        }
        write!(f, " }},")?;
        write!(f, "}}")?;
        Ok(())
    }
}

View file

@ -0,0 +1,281 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
use hashbrown::{HashMap, HashSet};
/// Shrink the parse table in place by removing unit-reduction states,
/// merging compatible states, and dropping states that end up unreferenced.
pub(crate) fn minimize_parse_table(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    simple_aliases: &AliasMap,
    token_conflict_map: &TokenConflictMap,
    keywords: &LookaheadSet,
) {
    let mut minimizer = Minimizer {
        parse_table,
        syntax_grammar,
        token_conflict_map,
        keywords,
        simple_aliases,
    };
    minimizer.remove_unit_reductions();
    minimizer.merge_compatible_states();
    minimizer.remove_unused_states();
}
/// Shared context for the table-minimization passes.
struct Minimizer<'a> {
    parse_table: &'a mut ParseTable,
    syntax_grammar: &'a SyntaxGrammar,
    token_conflict_map: &'a TokenConflictMap<'a>,
    // NOTE(review): presumably the grammar's keyword tokens, used to relax
    // word-token conflict checks in `can_add_entry_to_state` — confirm with
    // the caller that constructs this set.
    keywords: &'a LookaheadSet,
    simple_aliases: &'a AliasMap,
}
impl<'a> Minimizer<'a> {
/// Find states whose only actions are reductions of a single child into a
/// non-named, unaliased variable ("unit reductions"), and bypass those
/// states by redirecting their predecessors straight to their successors.
fn remove_unit_reductions(&mut self) {
    // Symbols that are ever aliased at some production step cannot be
    // bypassed, since the alias would otherwise be lost.
    let mut aliased_symbols = HashSet::new();
    for variable in &self.syntax_grammar.variables {
        for production in &variable.productions {
            for step in &production.steps {
                if step.alias.is_some() {
                    aliased_symbols.insert(step.symbol);
                }
            }
        }
    }

    // Identify states in which every action (apart from ShiftExtra) is a
    // unit reduction of one common symbol.
    let mut unit_reduction_symbols_by_state = HashMap::new();
    for (i, state) in self.parse_table.states.iter().enumerate() {
        let mut only_unit_reductions = true;
        let mut unit_reduction_symbol = None;
        for (_, entry) in &state.terminal_entries {
            for action in &entry.actions {
                match action {
                    ParseAction::ShiftExtra => continue,
                    ParseAction::Reduce {
                        child_count: 1,
                        alias_sequence_id: 0,
                        symbol,
                        ..
                    } => {
                        if !self.simple_aliases.contains_key(&symbol)
                            && !aliased_symbols.contains(&symbol)
                            && self.syntax_grammar.variables[symbol.index].kind
                                != VariableType::Named
                            && (unit_reduction_symbol.is_none()
                                || unit_reduction_symbol == Some(symbol))
                        {
                            unit_reduction_symbol = Some(symbol);
                            continue;
                        }
                    }
                    _ => {}
                }
                only_unit_reductions = false;
                break;
            }
            if !only_unit_reductions {
                break;
            }
        }
        if let Some(symbol) = unit_reduction_symbol {
            if only_unit_reductions {
                unit_reduction_symbols_by_state.insert(i, *symbol);
            }
        }
    }

    // Redirect references to unit-reduction states to the state reached
    // after the reduction, repeating until a fixed point so that chains of
    // unit reductions collapse completely.
    for state in self.parse_table.states.iter_mut() {
        let mut done = false;
        while !done {
            done = true;
            state.update_referenced_states(|other_state_id, state| {
                if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
                    done = false;
                    state.nonterminal_entries[symbol]
                } else {
                    other_state_id
                }
            })
        }
    }
}
/// Repeatedly merge pairs of states with the same unfinished-item
/// signature and compatible entries, rewriting references to the
/// merged-away states, until no further merges are possible.
fn merge_compatible_states(&mut self) {
    // Only states with identical signatures are candidates for merging.
    let mut state_ids_by_signature = HashMap::new();
    for (i, state) in self.parse_table.states.iter().enumerate() {
        state_ids_by_signature
            .entry(state.unfinished_item_signature)
            .or_insert(Vec::new())
            .push(i);
    }

    let mut deleted_states = HashSet::new();
    loop {
        let mut state_replacements = HashMap::new();
        for (_, state_ids) in &state_ids_by_signature {
            for i in state_ids {
                // The inner loop only visits ids *before* `i`, so each pair
                // is considered once and the earlier state survives.
                for j in state_ids {
                    if j == i {
                        break;
                    }
                    if deleted_states.contains(j) || deleted_states.contains(i) {
                        continue;
                    }
                    if self.merge_parse_state(*j, *i) {
                        deleted_states.insert(*i);
                        state_replacements.insert(*i, *j);
                    }
                }
            }
        }
        if state_replacements.is_empty() {
            break;
        }
        // Point every reference at the surviving state; this can make more
        // states identical and so enable merges on the next iteration.
        for state in self.parse_table.states.iter_mut() {
            state.update_referenced_states(|other_state_id, _| {
                *state_replacements
                    .get(&other_state_id)
                    .unwrap_or(&other_state_id)
            });
        }
    }
}
/// Try to merge state `right` into state `left`. Succeeds (returning true
/// and copying over `right`'s extra lookahead entries) only if both states
/// have identical non-terminal entries, identical actions for every shared
/// token, and each state's unique tokens can safely be added to the other.
fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
    let left_state = &self.parse_table.states[left];
    let right_state = &self.parse_table.states[right];
    if left_state.nonterminal_entries != right_state.nonterminal_entries {
        return false;
    }
    for (symbol, left_entry) in &left_state.terminal_entries {
        if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
            if right_entry.actions != left_entry.actions {
                return false;
            }
        } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
            return false;
        }
    }
    // Verify right's unique tokens before mutating anything, so a failed
    // merge leaves both states untouched.
    let mut symbols_to_add = Vec::new();
    for (symbol, right_entry) in &right_state.terminal_entries {
        if !left_state.terminal_entries.contains_key(&symbol) {
            if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
                return false;
            }
            symbols_to_add.push(*symbol);
        }
    }
    for symbol in symbols_to_add {
        let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
        self.parse_table.states[left]
            .terminal_entries
            .insert(symbol, entry);
    }
    true
}
/// Whether the given (token, entry) pair can be added to `state` without
/// changing observable parse behavior or creating lexical ambiguity with
/// the state's existing lookahead tokens.
fn can_add_entry_to_state(
    &self,
    state: &ParseState,
    token: Symbol,
    entry: &ParseTableEntry,
) -> bool {
    // Do not add external tokens; they could conflict lexically with any of the state's
    // existing lookahead tokens.
    if token.is_external() {
        return false;
    }

    // Only merge_compatible_states parse states by allowing existing reductions to happen
    // with additional lookahead tokens. Do not alter parse states in ways
    // that allow entirely new types of actions to happen.
    if state.terminal_entries.iter().all(|(_, e)| e != entry) {
        return false;
    }
    match entry.actions.last() {
        Some(ParseAction::Reduce { .. }) => {}
        _ => return false,
    }

    // Do not add tokens which are both internal and external. Their validity could
    // influence the behavior of the external scanner.
    if self
        .syntax_grammar
        .external_tokens
        .iter()
        .any(|t| t.corresponding_internal_token == Some(token))
    {
        return false;
    }

    let is_word_token = self.syntax_grammar.word_token == Some(token);
    let is_keyword = self.keywords.contains(&token);

    // Do not add a token if it conflicts with an existing token.
    // Word-token/keyword pairs are exempted from the conflict check.
    if token.is_terminal() {
        for existing_token in state.terminal_entries.keys() {
            if (is_word_token && self.keywords.contains(existing_token))
                || is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
            {
                continue;
            }
            if self
                .token_conflict_map
                .does_conflict(token.index, existing_token.index)
                || self
                    .token_conflict_map
                    .does_match_same_string(token.index, existing_token.index)
            {
                return false;
            }
        }
    }
    true
}
    /// Removes parse states that are no longer referenced by any other state,
    /// compacting the table and renumbering all state references.
    fn remove_unused_states(&mut self) {
        let mut state_usage_map = vec![false; self.parse_table.states.len()];
        // State 0 (used for error recovery — see `populate_error_state`) and
        // state 1 (presumably the start state — confirm) are always kept,
        // even if nothing references them explicitly.
        state_usage_map[0] = true;
        state_usage_map[1] = true;
        for state in &self.parse_table.states {
            for referenced_state in state.referenced_states() {
                state_usage_map[referenced_state] = true;
            }
        }
        // Map each old state id to its new id after removal: each id shifts
        // down by the number of removed states that precede it.
        let mut removed_predecessor_count = 0;
        let mut state_replacement_map = vec![0; self.parse_table.states.len()];
        for state_id in 0..self.parse_table.states.len() {
            state_replacement_map[state_id] = state_id - removed_predecessor_count;
            if !state_usage_map[state_id] {
                removed_predecessor_count += 1;
            }
        }
        // Walk the table, dropping unused states and rewriting the references
        // of the states that remain. `original_state_id` tracks the position
        // in the pre-removal numbering; `state_id` in the compacted table.
        let mut state_id = 0;
        let mut original_state_id = 0;
        while state_id < self.parse_table.states.len() {
            if state_usage_map[original_state_id] {
                self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
                    state_replacement_map[other_state_id]
                });
                state_id += 1;
            } else {
                self.parse_table.states.remove(state_id);
            }
            original_state_id += 1;
        }
    }
}

285
cli/src/build_tables/mod.rs Normal file
View file

@ -0,0 +1,285 @@
mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod minimize_parse_table;
mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
/// Builds all of the tables needed to generate a parser from the prepared
/// grammars: the parse table, the main lex table, the keyword lex table, and
/// the keyword-capture ("word") token, if any.
///
/// * `minimize` - whether to run parse-table minimization.
/// * `state_ids_to_log` - parse state ids to log diagnostics for during
///   parse-table construction.
pub(crate) fn build_tables(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    simple_aliases: &AliasMap,
    inlines: &InlinedProductionMap,
    minimize: bool,
    state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
    // Building the parse table also yields, for each token, the set of
    // tokens that can follow it; this drives the conflict analysis below.
    let (mut parse_table, following_tokens) =
        build_parse_table(syntax_grammar, lexical_grammar, inlines, state_ids_to_log)?;
    let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
    let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
    let keywords = identify_keywords(
        lexical_grammar,
        &parse_table,
        syntax_grammar.word_token,
        &token_conflict_map,
        &coincident_token_index,
    );
    populate_error_state(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &coincident_token_index,
        &token_conflict_map,
    );
    mark_fragile_tokens(
        &mut parse_table,
        lexical_grammar,
        &token_conflict_map,
    );
    if minimize {
        minimize_parse_table(
            &mut parse_table,
            syntax_grammar,
            simple_aliases,
            &token_conflict_map,
            &keywords,
        );
    }
    let (main_lex_table, keyword_lex_table) = build_lex_table(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &keywords,
        minimize,
    );
    Ok((
        parse_table,
        main_lex_table,
        keyword_lex_table,
        syntax_grammar.word_token,
    ))
}
/// Fills in parse state 0, the error-recovery state, with `Recover` entries
/// for every token that can safely be recognized during error recovery.
fn populate_error_state(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    coincident_token_index: &CoincidentTokenIndex,
    token_conflict_map: &TokenConflictMap,
) {
    let state = &mut parse_table.states[0];
    let n = lexical_grammar.variables.len();
    // First identify the *conflict-free tokens*: tokens that do not overlap with
    // any other token in any way.
    let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
        let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
            j != i
                && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
                && token_conflict_map.does_conflict(i, j)
        });
        if conflicts_with_other_tokens {
            None
        } else {
            info!(
                "error recovery - token {} has no conflicts",
                lexical_grammar.variables[i].name
            );
            Some(Symbol::terminal(i))
        }
    }));
    let recover_entry = ParseTableEntry {
        reusable: false,
        actions: vec![ParseAction::Recover],
    };
    // Exclude from the error-recovery state any token that conflicts with one of
    // the *conflict-free tokens* identified above.
    for i in 0..n {
        let symbol = Symbol::terminal(i);
        if !conflict_free_tokens.contains(&symbol) {
            // The word token is always eligible for recovery, even if it
            // conflicts with a conflict-free token.
            if syntax_grammar.word_token != Some(symbol) {
                if let Some(t) = conflict_free_tokens.iter().find(|t| {
                    !coincident_token_index.contains(symbol, *t)
                        && token_conflict_map.does_conflict(symbol.index, t.index)
                }) {
                    info!(
                        "error recovery - exclude token {} because of conflict with {}",
                        lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
                    );
                    continue;
                }
            }
        }
        info!(
            "error recovery - include token {}",
            lexical_grammar.variables[i].name
        );
        state
            .terminal_entries
            .entry(symbol)
            .or_insert_with(|| recover_entry.clone());
    }
    // External tokens with no corresponding internal token are always safe to
    // recognize during recovery.
    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
        if external_token.corresponding_internal_token.is_none() {
            state
                .terminal_entries
                .entry(Symbol::external(i))
                .or_insert_with(|| recover_entry.clone());
        }
    }
    state.terminal_entries.insert(Symbol::end(), recover_entry);
}
/// Determines which tokens can be treated as *keywords*: tokens that look
/// like words and can be matched by the grammar's word token, and which can
/// therefore be lexed by a separate keyword lex table. Returns the empty set
/// when the grammar has no word token.
fn identify_keywords(
    lexical_grammar: &LexicalGrammar,
    parse_table: &ParseTable,
    word_token: Option<Symbol>,
    token_conflict_map: &TokenConflictMap,
    coincident_token_index: &CoincidentTokenIndex,
) -> LookaheadSet {
    if word_token.is_none() {
        return LookaheadSet::new();
    }
    let word_token = word_token.unwrap();
    let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
    // First find all of the candidate keyword tokens: tokens that start with
    // letters or underscore and can match the same string as a word token.
    let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
        |(i, variable)| {
            cursor.reset(vec![variable.start_state]);
            if all_chars_are_alphabetical(&cursor)
                && token_conflict_map.does_match_same_string(i, word_token.index)
            {
                info!(
                    "Keywords - add candidate {}",
                    lexical_grammar.variables[i].name
                );
                Some(Symbol::terminal(i))
            } else {
                None
            }
        },
    ));
    // Exclude keyword candidates that shadow another keyword candidate.
    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
        for other_token in keywords.iter() {
            if other_token != *token
                && token_conflict_map.does_match_same_string(token.index, other_token.index)
            {
                info!(
                    "Keywords - exclude {} because it matches the same string as {}",
                    lexical_grammar.variables[token.index].name,
                    lexical_grammar.variables[other_token.index].name
                );
                return false;
            }
        }
        true
    }));
    // Exclude keyword candidates for which substituting the keyword capture
    // token would introduce new lexical conflicts with other tokens.
    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
        for other_index in 0..lexical_grammar.variables.len() {
            if keywords.contains(&Symbol::terminal(other_index)) {
                continue;
            }
            // If the word token was already valid in every state containing
            // this keyword candidate, then substituting the word token won't
            // introduce any new lexical conflicts.
            if coincident_token_index
                .states_with(*token, Symbol::terminal(other_index))
                .iter()
                .all(|state_id| {
                    parse_table.states[*state_id]
                        .terminal_entries
                        .contains_key(&word_token)
                })
            {
                continue;
            }
            // Otherwise, the candidate must conflict with `other_index` in
            // exactly the same way the word token does.
            if !token_conflict_map.has_same_conflict_status(
                token.index,
                word_token.index,
                other_index,
            ) {
                info!(
                    "Keywords - exclude {} because of conflict with {}",
                    lexical_grammar.variables[token.index].name,
                    lexical_grammar.variables[other_index].name
                );
                return false;
            }
        }
        info!(
            "Keywords - include {}",
            lexical_grammar.variables[token.index].name,
        );
        true
    }));
    keywords
}
/// Marks parse table entries as non-reusable ("fragile") when their token
/// overlaps with another token that is valid in the same state, since such a
/// token's match can depend on which tokens were valid when it was scanned.
fn mark_fragile_tokens(
    parse_table: &mut ParseTable,
    lexical_grammar: &LexicalGrammar,
    token_conflict_map: &TokenConflictMap,
) {
    let n = lexical_grammar.variables.len();
    // Reused across states to avoid re-allocating the mask for every state.
    let mut valid_tokens_mask = Vec::with_capacity(n);
    for state in parse_table.states.iter_mut() {
        valid_tokens_mask.clear();
        valid_tokens_mask.resize(n, false);
        for token in state.terminal_entries.keys() {
            if token.is_terminal() {
                valid_tokens_mask[token.index] = true;
            }
        }
        for (token, entry) in state.terminal_entries.iter_mut() {
            for i in 0..n {
                if token_conflict_map.does_overlap(i, token.index) {
                    if valid_tokens_mask[i] {
                        entry.reusable = false;
                        break;
                    }
                }
            }
        }
    }
}
/// Reports whether every non-separator transition out of the cursor's current
/// states consumes only alphabetic characters or underscores. Separator
/// transitions are ignored.
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
    cursor.transition_chars().all(|(chars, is_sep)| {
        if is_sep {
            return true;
        }
        match chars {
            CharacterSet::Include(chars) => chars.iter().all(|c| c.is_alphabetic() || *c == '_'),
            _ => false,
        }
    })
}

View file

@ -0,0 +1,382 @@
use crate::build_tables::item::LookaheadSet;
use crate::grammars::LexicalGrammar;
use crate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use hashbrown::HashSet;
use std::cmp::Ordering;
use std::fmt;
/// The ways in which one token can interfere with another, as computed by
/// the pairwise NFA simulation in `compute_conflict_status`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct TokenConflictStatus {
    // This token can match a string that overlaps a string matched by the
    // other token.
    does_overlap: bool,
    // After the other token completes, this token could continue matching
    // characters that can validly follow the other token.
    does_match_valid_continuation: bool,
    // This token can continue matching via a separator transition.
    does_match_separators: bool,
    // Both tokens can match the exact same string, with this token winning.
    matches_same_string: bool,
}
/// A precomputed `n x n` matrix describing, for every ordered pair of tokens
/// in the lexical grammar, how the first token can conflict with the second.
pub(crate) struct TokenConflictMap<'a> {
    // Number of tokens (lexical variables) in the grammar.
    n: usize,
    // Flattened n*n matrix; see `matrix_index`.
    status_matrix: Vec<TokenConflictStatus>,
    // For each token, the characters it can start with.
    starting_chars_by_index: Vec<CharacterSet>,
    // For each token, the characters that can appear immediately after it.
    following_chars_by_index: Vec<CharacterSet>,
    grammar: &'a LexicalGrammar,
}
impl<'a> TokenConflictMap<'a> {
    /// Builds the conflict matrix by simulating every pair of tokens' NFAs
    /// together. `following_tokens[i]` is the set of tokens that can follow
    /// token `i` in the grammar.
    pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
        let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
        let starting_chars = get_starting_chars(&mut cursor, grammar);
        let following_chars = get_following_chars(&starting_chars, following_tokens);
        let n = grammar.variables.len();
        let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
        // Each pairwise simulation yields the status for both orderings of
        // the pair, so only the lower triangle needs to be visited.
        for i in 0..grammar.variables.len() {
            for j in 0..i {
                let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
                status_matrix[matrix_index(n, i, j)] = status.0;
                status_matrix[matrix_index(n, j, i)] = status.1;
            }
        }
        TokenConflictMap {
            n,
            status_matrix,
            starting_chars_by_index: starting_chars,
            following_chars_by_index: following_chars,
            grammar,
        }
    }
    /// True if tokens `a` and `b` conflict with token `other` in exactly the
    /// same ways.
    pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
        let left = &self.status_matrix[matrix_index(self.n, a, other)];
        let right = &self.status_matrix[matrix_index(self.n, b, other)];
        left == right
    }
    /// True if there is a string matched by both tokens for which token `i`
    /// is preferred.
    pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
        self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
    }
    /// True if token `i` can interfere with token `j` by continuing to match
    /// into characters that can validly follow `j`, or into separators.
    pub fn does_conflict(&self, i: usize, j: usize) -> bool {
        let entry = &self.status_matrix[matrix_index(self.n, i, j)];
        entry.does_match_valid_continuation || entry.does_match_separators
    }
    /// True if token `i` can match a string that overlaps one matched by `j`.
    pub fn does_overlap(&self, i: usize, j: usize) -> bool {
        self.status_matrix[matrix_index(self.n, i, j)].does_overlap
    }
    /// Decides which of two completed tokens wins, given `(precedence, id)`
    /// pairs: higher precedence first, then higher implicit precedence, then
    /// the token declared earlier in the grammar.
    pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
        if left.0 > right.0 {
            return true;
        } else if left.0 < right.0 {
            return false;
        }
        match grammar.variables[left.1]
            .implicit_precedence
            .cmp(&grammar.variables[right.1].implicit_precedence)
        {
            Ordering::Less => false,
            Ordering::Greater => true,
            Ordering::Equal => left.1 < right.1,
        }
    }
}
// Verbose multi-line debug output used when diagnosing token conflicts.
impl<'a> fmt::Debug for TokenConflictMap<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "TokenConflictMap {{\n")?;
        write!(f, "  starting_characters: {{\n")?;
        for i in 0..self.n {
            write!(f, "    {}: {:?},\n", i, self.starting_chars_by_index[i])?;
        }
        write!(f, "  }},\n")?;
        write!(f, "  following_characters: {{\n")?;
        for i in 0..self.n {
            write!(
                f,
                "    {}: {:?},\n",
                self.grammar.variables[i].name, self.following_chars_by_index[i]
            )?;
        }
        write!(f, "  }},\n")?;
        write!(f, "  status_matrix: {{\n")?;
        for i in 0..self.n {
            write!(f, "    {}: {{\n", self.grammar.variables[i].name)?;
            for j in 0..self.n {
                write!(
                    f,
                    "      {}: {:?},\n",
                    self.grammar.variables[j].name,
                    self.status_matrix[matrix_index(self.n, i, j)]
                )?;
            }
            write!(f, "    }},\n")?;
        }
        write!(f, "  }},")?;
        write!(f, "}}")?;
        Ok(())
    }
}
/// Maps a (row, column) pair into the flattened `n * n` conflict matrix.
fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
    i * variable_count + j
}
/// For each lexical variable, computes the set of characters its token can
/// start with, by unioning the transition characters out of the variable's
/// NFA start state.
fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
    let mut result = Vec::with_capacity(grammar.variables.len());
    for variable in &grammar.variables {
        cursor.reset(vec![variable.start_state]);
        let mut all_chars = CharacterSet::empty();
        for (chars, _) in cursor.transition_chars() {
            all_chars = all_chars.add(chars);
        }
        result.push(all_chars);
    }
    result
}
/// For each token, computes the set of characters that can appear immediately
/// after it, by unioning the starting characters of every terminal token that
/// can follow it.
fn get_following_chars(
    starting_chars: &Vec<CharacterSet>,
    following_tokens: Vec<LookaheadSet>,
) -> Vec<CharacterSet> {
    let mut result = Vec::with_capacity(following_tokens.len());
    for token_set in following_tokens {
        let mut chars = CharacterSet::empty();
        for token in token_set.iter() {
            if token.is_terminal() {
                chars = chars.add(&starting_chars[token.index]);
            }
        }
        result.push(chars);
    }
    result
}
/// Simulates the NFAs of tokens `i` and `j` in lockstep to determine all of
/// the ways they can interfere with each other. Returns the conflict status
/// of `i` with respect to `j` and of `j` with respect to `i`.
fn compute_conflict_status(
    cursor: &mut NfaCursor,
    grammar: &LexicalGrammar,
    following_chars: &Vec<CharacterSet>,
    i: usize,
    j: usize,
) -> (TokenConflictStatus, TokenConflictStatus) {
    let mut visited_state_sets = HashSet::new();
    // Search over sets of NFA states reachable while both tokens are still
    // in play, starting from both tokens' start states.
    let mut state_set_queue = vec![vec![
        grammar.variables[i].start_state,
        grammar.variables[j].start_state,
    ]];
    let mut result = (
        TokenConflictStatus::default(),
        TokenConflictStatus::default(),
    );
    while let Some(state_set) = state_set_queue.pop() {
        // Don't pursue states where there's no potential for conflict.
        if variable_ids_for_states(&state_set, grammar).count() > 1 {
            cursor.reset(state_set);
        } else {
            continue;
        }
        // Examine the tokens that could complete at this point, tracking the
        // winning completion as `(token id, precedence)`.
        let mut completion = None;
        for (id, precedence) in cursor.completions() {
            if let Some((prev_id, prev_precedence)) = completion {
                if id == prev_id {
                    continue;
                }
                // Prefer tokens with higher precedence. For tokens with equal precedence,
                // prefer those listed earlier in the grammar.
                let winning_id;
                if TokenConflictMap::prefer_token(
                    grammar,
                    (prev_precedence, prev_id),
                    (precedence, id),
                ) {
                    winning_id = prev_id;
                } else {
                    winning_id = id;
                    completion = Some((id, precedence));
                }
                // Both tokens matched the same string; record which one wins.
                if winning_id == i {
                    result.0.matches_same_string = true;
                    result.0.does_overlap = true;
                } else {
                    result.1.matches_same_string = true;
                    result.1.does_overlap = true;
                }
            } else {
                completion = Some((id, precedence));
            }
        }
        // Follow each outgoing transition, checking whether advancing one
        // token would override a completion of the other.
        for NfaTransition {
            characters,
            precedence,
            states,
            is_separator,
        } in cursor.transitions()
        {
            let mut can_advance = true;
            if let Some((completed_id, completed_precedence)) = completion {
                // Determine whether the successor states still include the
                // completed token, and find some other token that advances.
                let mut other_id = None;
                let mut successor_contains_completed_id = false;
                for variable_id in variable_ids_for_states(&states, grammar) {
                    if variable_id == completed_id {
                        successor_contains_completed_id = true;
                        break;
                    } else {
                        other_id = Some(variable_id);
                    }
                }
                if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
                    // The completed token's precedence can block the other
                    // token from continuing past this point.
                    let winning_id;
                    if precedence < completed_precedence {
                        winning_id = completed_id;
                        can_advance = false;
                    } else {
                        winning_id = other_id;
                    }
                    if winning_id == i {
                        result.0.does_overlap = true;
                        if characters.does_intersect(&following_chars[j]) {
                            result.0.does_match_valid_continuation = true;
                        }
                        if is_separator {
                            result.0.does_match_separators = true;
                        }
                    } else {
                        // NOTE(review): unlike the branch above, this branch
                        // never sets `does_match_separators` for the second
                        // token — presumably intentional, but worth confirming.
                        result.1.does_overlap = true;
                        if characters.does_intersect(&following_chars[i]) {
                            result.1.does_match_valid_continuation = true;
                        }
                    }
                }
            }
            if can_advance && visited_state_sets.insert(states.clone()) {
                state_set_queue.push(states);
            }
        }
    }
    result
}
/// Iterates over the lexical variables that own the given NFA states,
/// collapsing consecutive runs of states belonging to the same variable.
fn variable_ids_for_states<'a>(
    state_ids: &'a Vec<u32>,
    grammar: &'a LexicalGrammar,
) -> impl Iterator<Item = usize> + 'a {
    let mut last_emitted = None;
    state_ids.iter().filter_map(move |state_id| {
        let variable_id = grammar.variable_index_for_nfa_state(*state_id);
        if last_emitted == Some(variable_id) {
            None
        } else {
            last_emitted = Some(variable_id);
            Some(variable_id)
        }
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::{Variable, VariableType};
    use crate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
    use crate::rules::{Rule, Symbol};
    // Verifies that each token's starting-character set is derived from its
    // NFA start state.
    #[test]
    fn test_starting_characters() {
        let grammar = expand_tokens(ExtractedLexicalGrammar {
            separators: Vec::new(),
            variables: vec![
                Variable {
                    name: "token_0".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::pattern("[a-f]1|0x\\d"),
                },
                Variable {
                    name: "token_1".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::pattern("d*ef"),
                },
            ],
        })
        .unwrap();
        let token_map = TokenConflictMap::new(&grammar, Vec::new());
        assert_eq!(
            token_map.starting_chars_by_index[0],
            CharacterSet::empty().add_range('a', 'f').add_char('0')
        );
        assert_eq!(
            token_map.starting_chars_by_index[1],
            CharacterSet::empty().add_range('d', 'e')
        );
    }
    // Verifies same-string matching and conflict detection using a
    // keyword-vs-identifier style grammar.
    #[test]
    fn test_token_conflicts() {
        let grammar = expand_tokens(ExtractedLexicalGrammar {
            separators: Vec::new(),
            variables: vec![
                Variable {
                    name: "in".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::string("in"),
                },
                Variable {
                    name: "identifier".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::pattern("\\w+"),
                },
                Variable {
                    name: "instanceof".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::string("instanceof"),
                },
            ],
        })
        .unwrap();
        let var = |name| index_of_var(&grammar, name);
        let token_map = TokenConflictMap::new(
            &grammar,
            vec![
                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
                LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
            ],
        );
        // Given the string "in", the `in` token is preferred over the `identifier` token
        assert!(token_map.does_match_same_string(var("in"), var("identifier")));
        assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
        // Depending on what character follows, the string "in" may be treated as part of an
        // `identifier` token.
        assert!(token_map.does_conflict(var("identifier"), var("in")));
        // Depending on what character follows, the string "instanceof" may be treated as part of
        // an `identifier` token.
        assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
        assert!(token_map.does_conflict(var("instanceof"), var("in")));
    }
    // Helper: finds a variable's index by name.
    fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
        grammar
            .variables
            .iter()
            .position(|v| v.name == name)
            .unwrap()
    }
}

24
cli/src/error.rs Normal file
View file

@ -0,0 +1,24 @@
/// A simple string-based error type used throughout the CLI.
#[derive(Debug)]
pub struct Error(pub String);
/// Convenience alias for results whose error type is `Error`.
pub type Result<T> = std::result::Result<T, Error>;
impl Error {
pub fn grammar(message: &str) -> Self {
Error(format!("Grammar error: {}", message))
}
pub fn regex(message: &str) -> Self {
Error(format!("Regex error: {}", message))
}
pub fn undefined_symbol(name: &str) -> Self {
Error(format!("Undefined symbol `{}`", name))
}
}
// Allows `?` to be used on `serde_json` results when parsing grammar JSON.
impl From<serde_json::Error> for Error {
    fn from(error: serde_json::Error) -> Self {
        Error(error.to_string())
    }
}

34
cli/src/generate.rs Normal file
View file

@ -0,0 +1,34 @@
use crate::build_tables::build_tables;
use crate::error::Result;
use crate::parse_grammar::parse_grammar;
use crate::prepare_grammar::prepare_grammar;
use crate::render::render_c_code;
/// Generates the complete C source for a parser from the JSON grammar
/// description in `input`.
///
/// Pipeline: parse the grammar, prepare it (extracting the syntax and
/// lexical grammars, inlined productions, and simple aliases), build the
/// parse and lex tables, then render everything as C code.
///
/// * `minimize` - whether to minimize the parse table before rendering.
/// * `state_ids_to_log` - parse state ids to log diagnostics for while the
///   parse table is built.
pub fn generate_parser_for_grammar(
    input: &str,
    minimize: bool,
    state_ids_to_log: Vec<usize>,
) -> Result<String> {
    let input_grammar = parse_grammar(input)?;
    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
        prepare_grammar(&input_grammar)?;
    let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
        &syntax_grammar,
        &lexical_grammar,
        &simple_aliases,
        &inlines,
        minimize,
        state_ids_to_log,
    )?;
    let c_code = render_c_code(
        &input_grammar.name,
        parse_table,
        main_lex_table,
        keyword_lex_table,
        keyword_capture_token,
        syntax_grammar,
        lexical_grammar,
        simple_aliases,
    );
    Ok(c_code)
}

204
cli/src/grammars.rs Normal file
View file

@ -0,0 +1,204 @@
use crate::nfa::Nfa;
use crate::rules::{Alias, Associativity, Rule, Symbol};
use hashbrown::HashMap;
/// How a grammar variable is represented in the resulting syntax tree.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum VariableType {
    Hidden,
    Auxiliary,
    Anonymous,
    Named,
}
// Input grammar
/// A single named rule from the input grammar, before any preparation.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct Variable {
    pub name: String,
    pub kind: VariableType,
    pub rule: Rule,
}
/// The grammar as parsed from the user's input, before preparation.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct InputGrammar {
    pub name: String,
    pub variables: Vec<Variable>,
    pub extra_tokens: Vec<Rule>,
    pub expected_conflicts: Vec<Vec<String>>,
    pub external_tokens: Vec<Rule>,
    pub variables_to_inline: Vec<String>,
    pub word_token: Option<String>,
}
// Extracted lexical grammar
/// A token in the lexical grammar; its rule has been compiled into the
/// shared NFA, beginning at `start_state`.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexicalVariable {
    pub name: String,
    pub kind: VariableType,
    pub implicit_precedence: i32,
    pub start_state: u32,
}
/// All of the grammar's tokens, sharing a single NFA.
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
    pub nfa: Nfa,
    pub variables: Vec<LexicalVariable>,
}
// Extracted syntax grammar
/// One symbol within a production, together with the precedence,
/// associativity, and alias that apply at that position.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct ProductionStep {
    pub symbol: Symbol,
    pub precedence: i32,
    pub associativity: Option<Associativity>,
    pub alias: Option<Alias>,
}
/// A sequence of steps that a syntax variable can match.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct Production {
    pub steps: Vec<ProductionStep>,
    pub dynamic_precedence: i32,
}
/// Maps a (production, step index) pair to the expanded productions created
/// by inlining the variable at that step.
// NOTE(review): productions are keyed by address (`*const Production`), so
// the original productions must not move while this map is in use.
pub(crate) struct InlinedProductionMap {
    pub productions: Vec<Production>,
    pub production_map: HashMap<(*const Production, u32), Vec<usize>>,
}
/// A non-terminal in the prepared syntax grammar.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct SyntaxVariable {
    pub name: String,
    pub kind: VariableType,
    pub productions: Vec<Production>,
}
/// A token provided by an external scanner, optionally linked to an internal
/// token that matches the same construct.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ExternalToken {
    pub name: String,
    pub kind: VariableType,
    pub corresponding_internal_token: Option<Symbol>,
}
/// The prepared context-free portion of the grammar.
#[derive(Debug)]
pub(crate) struct SyntaxGrammar {
    pub variables: Vec<SyntaxVariable>,
    pub extra_tokens: Vec<Symbol>,
    pub expected_conflicts: Vec<Vec<Symbol>>,
    pub external_tokens: Vec<ExternalToken>,
    pub variables_to_inline: Vec<Symbol>,
    pub word_token: Option<Symbol>,
}
// Test-only builder helpers for constructing production steps fluently.
#[cfg(test)]
impl ProductionStep {
    /// Creates a step for `symbol` with default precedence and associativity
    /// and no alias.
    pub(crate) fn new(symbol: Symbol) -> Self {
        Self {
            symbol,
            precedence: 0,
            associativity: None,
            alias: None,
        }
    }
    /// Returns a copy of this step with the given precedence/associativity.
    pub(crate) fn with_prec(self, precedence: i32, associativity: Option<Associativity>) -> Self {
        Self {
            precedence,
            associativity,
            ..self
        }
    }
    /// Returns a copy of this step with the given alias.
    pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self {
        let alias = Alias {
            value: value.to_string(),
            is_named,
        };
        Self {
            alias: Some(alias),
            ..self
        }
    }
}
impl Production {
    /// Returns the symbol of this production's first step, if the production
    /// is non-empty.
    pub fn first_symbol(&self) -> Option<Symbol> {
        self.steps.iter().next().map(|step| step.symbol.clone())
    }
}
impl Default for Production {
    fn default() -> Self {
        // An empty production with neutral dynamic precedence.
        Production {
            dynamic_precedence: 0,
            steps: Vec::new(),
        }
    }
}
// Test-only constructors, one for each kind of grammar variable.
#[cfg(test)]
impl Variable {
    // Shared constructor backing the public helpers below.
    fn with_kind(name: &str, kind: VariableType, rule: Rule) -> Self {
        Self {
            name: name.to_string(),
            kind,
            rule,
        }
    }
    pub fn named(name: &str, rule: Rule) -> Self {
        Self::with_kind(name, VariableType::Named, rule)
    }
    pub fn auxiliary(name: &str, rule: Rule) -> Self {
        Self::with_kind(name, VariableType::Auxiliary, rule)
    }
    pub fn hidden(name: &str, rule: Rule) -> Self {
        Self::with_kind(name, VariableType::Hidden, rule)
    }
    pub fn anonymous(name: &str, rule: Rule) -> Self {
        Self::with_kind(name, VariableType::Anonymous, rule)
    }
}
impl LexicalGrammar {
    /// Returns the index of the first variable whose `start_state` is at or
    /// after `state_id`.
    // NOTE(review): this assumes `variables` is ordered by `start_state`, and
    // panics (via `unwrap`) if no variable satisfies the predicate — confirm
    // both invariants hold for all NFA state ids passed in.
    pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize {
        self.variables.iter().position(|v| v.start_state >= state_id).unwrap()
    }
}
impl SyntaxVariable {
    /// Whether this variable's kind is `Auxiliary`.
    pub fn is_auxiliary(&self) -> bool {
        match self.kind {
            VariableType::Auxiliary => true,
            _ => false,
        }
    }
}
impl InlinedProductionMap {
    /// Looks up the inlined productions, if any, that replace `production` at
    /// the given step. The production is identified by its address, so it
    /// must be the same allocation that was used to build the map.
    pub fn inlined_productions<'a>(
        &'a self,
        production: &Production,
        step_index: u32,
    ) -> Option<impl Iterator<Item = &'a Production> + 'a> {
        self.production_map
            .get(&(production as *const Production, step_index))
            .map(|production_indices| {
                production_indices
                    .iter()
                    .cloned()
                    .map(move |index| &self.productions[index])
            })
    }
}

334
cli/src/js/dsl.js Normal file
View file

@ -0,0 +1,334 @@
// Matches \uXXXX escapes so they can be replaced with the literal character.
const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
// Matches escaped slashes (\/), needed inside /.../ literals but not in the
// serialized pattern source.
const DELIMITER_ESCAPE_PATTERN = /\\\//g;
// Wraps a rule so that the nodes it produces appear in the syntax tree under
// a different name. `value` may be a string (anonymous alias), a symbol
// object, or a ReferenceError produced by the rule builder (named alias).
function alias(rule, value) {
  const result = {
    type: "ALIAS",
    content: normalize(rule),
    named: false,
    value: null
  };
  switch (value.constructor) {
    case String:
      result.named = false;
      result.value = value;
      return result;
    case ReferenceError:
      // The rule builder returns a ReferenceError (carrying `.symbol`) for
      // names with no rule definition; aliasing to one is allowed.
      result.named = true;
      result.value = value.symbol.name;
      return result;
    case Object:
      if (typeof value.type === 'string' && value.type === 'SYMBOL') {
        result.named = true;
        result.value = value.name;
        return result;
      }
  }
  throw new Error('Invalid alias value ' + value);
}
// Creates a rule that matches the empty string.
function blank() {
  return { type: "BLANK" };
}
// Creates a rule that matches any one of the given sub-rules.
function choice(...elements) {
  const members = elements.map(element => normalize(element));
  return { type: "CHOICE", members };
}
// Creates a rule that matches `value` or nothing.
function optional(value) {
  return choice(value, blank());
}
// Assigns precedence `number` to `rule`. When called with a single argument,
// that argument is the rule and the precedence defaults to 0.
function prec(number, rule) {
  if (rule == null) {
    rule = number;
    number = 0;
  }
  return {
    type: "PREC",
    value: number,
    content: normalize(rule)
  };
}
// Like `prec`, but marks the rule as left-associative.
prec.left = function(number, rule) {
  if (rule == null) {
    rule = number;
    number = 0;
  }
  return {
    type: "PREC_LEFT",
    value: number,
    content: normalize(rule)
  };
}
// Like `prec`, but marks the rule as right-associative.
prec.right = function(number, rule) {
  if (rule == null) {
    rule = number;
    number = 0;
  }
  return {
    type: "PREC_RIGHT",
    value: number,
    content: normalize(rule)
  };
}
// Assigns dynamic precedence. Note: unlike the variants above, both
// arguments are required here.
prec.dynamic = function(number, rule) {
  return {
    type: "PREC_DYNAMIC",
    value: number,
    content: normalize(rule)
  };
}
// Creates a rule that matches zero or more occurrences of `rule`.
function repeat(rule) {
  return {
    type: "REPEAT",
    content: normalize(rule)
  };
}
// Creates a rule that matches one or more occurrences of `rule`.
function repeat1(rule) {
  return {
    type: "REPEAT1",
    content: normalize(rule)
  };
}
// Creates a rule that matches the given sub-rules in order.
function seq(...elements) {
  return {
    type: "SEQ",
    members: elements.map(normalize)
  };
}
// Creates a reference to another rule by name.
function sym(name) {
  return { type: "SYMBOL", name };
}
// Wraps `value` so that it is treated as a single token.
function token(value) {
  return {
    type: "TOKEN",
    content: normalize(value)
  };
}
// Variant of `token` that produces an IMMEDIATE_TOKEN node.
token.immediate = function(value) {
  return {
    type: "IMMEDIATE_TOKEN",
    content: normalize(value)
  };
}
// Converts a user-supplied rule value (string, regex, symbol, or rule
// object) into its normalized JSON rule representation.
function normalize(value) {
  if (typeof value == "undefined")
    throw new Error("Undefined symbol");
  switch (value.constructor) {
    case String:
      // Bare strings become literal-string rules.
      return {
        type: 'STRING',
        value
      };
    case RegExp:
      // Regex literals become pattern rules: unescape the `/` delimiter and
      // expand \uXXXX escapes into literal characters.
      return {
        type: 'PATTERN',
        value: value.source
          .replace(
            DELIMITER_ESCAPE_PATTERN,
            '/'
          )
          .replace(
            UNICODE_ESCAPE_PATTERN,
            (match, group) => String.fromCharCode(parseInt(group, 16))
          )
      };
    case ReferenceError:
      // The rule builder produces ReferenceErrors for undefined symbols;
      // surface them to the user at the point of use.
      throw value
    default:
      if (typeof value.type === 'string') {
        return value;
      } else {
        throw new TypeError("Invalid rule: " + value.toString());
      }
  }
}
// Returns a proxy whose property accesses produce symbol rules. When
// `ruleMap` is given, accessing a name that isn't a known rule yields a
// ReferenceError (carrying the symbol) instead of a symbol, so the error is
// only thrown if the symbol is actually used.
function RuleBuilder(ruleMap) {
  return new Proxy({}, {
    get(target, propertyName) {
      const symbol = {
        type: 'SYMBOL',
        name: propertyName
      };
      if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
        return symbol;
      } else {
        const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
        error.symbol = symbol;
        return error;
      }
    }
  })
}
// Builds a complete grammar object from `options`, optionally extending
// `baseGrammar`. Validates every property and normalizes all rules. When
// called with a single argument, an empty base grammar is used.
function grammar(baseGrammar, options) {
  if (!options) {
    options = baseGrammar;
    baseGrammar = {
      name: null,
      rules: {},
      extras: [normalize(/\s/)],
      conflicts: [],
      externals: [],
      inline: []
    };
  }
  // `externals` is a function receiving a rule builder; its result replaces
  // the base grammar's externals.
  let externals = baseGrammar.externals;
  if (options.externals) {
    if (typeof options.externals !== "function") {
      throw new Error("Grammar's 'externals' property must be a function.");
    }
    const externalsRuleBuilder = RuleBuilder(null)
    const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals);
    if (!Array.isArray(externalRules)) {
      throw new Error("Grammar's 'externals' property must return an array of rules.");
    }
    externals = externalRules.map(normalize);
  }
  // Collect every known rule name (new rules, inherited rules, and named
  // externals) so the rule builder can detect undefined symbols.
  const ruleMap = {};
  for (const key in options.rules) {
    ruleMap[key] = true;
  }
  for (const key in baseGrammar.rules) {
    ruleMap[key] = true;
  }
  for (const external of externals) {
    if (typeof external.name === 'string') {
      ruleMap[external.name] = true;
    }
  }
  const ruleBuilder = RuleBuilder(ruleMap);
  const name = options.name;
  if (typeof name !== "string") {
    throw new Error("Grammar's 'name' property must be a string.");
  }
  if (!/^[a-zA-Z_]\w*$/.test(name)) {
    throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
  }
  // Each rule is a function of (builder, previous definition); its result is
  // normalized and merged over the base grammar's rules.
  let rules = Object.assign({}, baseGrammar.rules);
  if (options.rules) {
    if (typeof options.rules !== "object") {
      throw new Error("Grammar's 'rules' property must be an object.");
    }
    for (const ruleName in options.rules) {
      const ruleFn = options.rules[ruleName];
      if (typeof ruleFn !== "function") {
        throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
      }
      rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
    }
  }
  // `extras` - rules allowed anywhere between tokens (defaults to /\s/).
  let extras = baseGrammar.extras.slice();
  if (options.extras) {
    if (typeof options.extras !== "function") {
      throw new Error("Grammar's 'extras' property must be a function.");
    }
    extras = options.extras
      .call(ruleBuilder, ruleBuilder, baseGrammar.extras)
      .map(normalize);
  }
  // `word` - the name of the rule used as the word token.
  let word = baseGrammar.word;
  if (options.word) {
    word = options.word.call(ruleBuilder, ruleBuilder).name;
    if (typeof word != 'string') {
      throw new Error("Grammar's 'word' property must be a named rule.");
    }
  }
  // `conflicts` - an array of arrays of rule names that are expected to
  // conflict; stored as names, passed to the callback as symbols.
  let conflicts = baseGrammar.conflicts;
  if (options.conflicts) {
    if (typeof options.conflicts !== "function") {
      throw new Error("Grammar's 'conflicts' property must be a function.");
    }
    const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym));
    const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules);
    if (!Array.isArray(conflictRules)) {
      throw new Error("Grammar's conflicts must be an array of arrays of rules.");
    }
    conflicts = conflictRules.map(conflictSet => {
      if (!Array.isArray(conflictSet)) {
        throw new Error("Grammar's conflicts must be an array of arrays of rules.");
      }
      return conflictSet.map(symbol => symbol.name);
    });
  }
  // `inline` - rule names whose definitions should be inlined at use sites.
  let inline = baseGrammar.inline;
  if (options.inline) {
    if (typeof options.inline !== "function") {
      throw new Error("Grammar's 'inline' property must be a function.");
    }
    const baseInlineRules = baseGrammar.inline.map(sym);
    const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules);
    if (!Array.isArray(inlineRules)) {
      throw new Error("Grammar's inline must be an array of rules.");
    }
    inline = inlineRules.map(symbol => symbol.name);
  }
  if (Object.keys(rules).length == 0) {
    throw new Error("Grammar must have at least one rule.");
  }
  return {name, word, rules, extras, conflicts, externals, inline};
}
// Expose the DSL functions as globals so that `grammar.js` files can use
// them without importing anything.
global.alias = alias;
global.blank = blank;
global.choice = choice;
global.optional = optional;
global.prec = prec;
global.repeat = repeat;
global.repeat1 = repeat1;
global.seq = seq;
global.sym = sym;
global.token = token;
global.grammar = grammar;

29
cli/src/logger.rs Normal file
View file

@ -0,0 +1,29 @@
use log::{LevelFilter, Log, Metadata, Record};
// A logger that writes every record to stderr.
struct Logger {
    // NOTE(review): `filter` is never read by `log` and is always `None`
    // (see `init`) — presumably reserved for future module filtering;
    // confirm before removing.
    pub filter: Option<String>,
}
impl Log for Logger {
    fn enabled(&self, _: &Metadata) -> bool {
        // Accept every record; level limiting is done globally via
        // `log::set_max_level` in `init`.
        true
    }
    fn log(&self, record: &Record) {
        // Prefix each message with its module path, minus the crate-name
        // prefix, so output shows which compilation stage produced it.
        eprintln!(
            "[{}] {}",
            record
                .module_path()
                .unwrap_or_default()
                .trim_start_matches("rust_tree_sitter_cli::"),
            record.args()
        );
    }
    fn flush(&self) {}
}
/// Installs the stderr logger as the global logger with `Info` as the
/// maximum level. The `unwrap` panics if a logger was already installed.
pub(crate) fn init() {
    log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap();
    log::set_max_level(LevelFilter::Info);
}

119
cli/src/main.rs Normal file
View file

@ -0,0 +1,119 @@
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate log;
#[macro_use]
extern crate serde_derive;
extern crate hashbrown;
extern crate serde_json;
use clap::{App, Arg, SubCommand};
use std::env;
use std::io::Write;
use std::path::PathBuf;
use std::process::{exit, Command, Stdio};
use std::usize;
mod build_tables;
mod error;
mod generate;
mod grammars;
mod logger;
mod nfa;
mod parse_grammar;
mod prepare_grammar;
mod render;
mod rules;
mod tables;
/// CLI entry point: run the application, printing the error message and
/// exiting with a non-zero status on failure.
fn main() {
    match run() {
        Ok(()) => {}
        Err(error) => {
            eprintln!("{}", error.0);
            exit(1);
        }
    }
}
/// Build the command-line interface and dispatch the chosen subcommand.
///
/// NOTE(review): the `parse` and `test` subcommands are declared for the
/// CLI but have no handlers below — only `generate` is implemented so far.
fn run() -> error::Result<()> {
    let matches = App::new("tree-sitter")
        .version("0.1")
        .author("Max Brunsfeld <maxbrunsfeld@gmail.com>")
        .about("Generates and tests parsers")
        .subcommand(
            SubCommand::with_name("generate")
                .about("Generate a parser")
                .arg(Arg::with_name("log").long("log"))
                .arg(
                    Arg::with_name("state-ids-to-log")
                        .long("log-state")
                        .takes_value(true),
                )
                .arg(Arg::with_name("no-minimize").long("no-minimize")),
        )
        .subcommand(
            SubCommand::with_name("parse")
                .about("Parse a file")
                .arg(Arg::with_name("path").index(1)),
        )
        .subcommand(
            SubCommand::with_name("test")
                .about("Run a parser's tests")
                .arg(Arg::with_name("path").index(1).required(true))
                .arg(Arg::with_name("line").index(2).required(true))
                .arg(Arg::with_name("column").index(3).required(true)),
        )
        .get_matches();
    if let Some(matches) = matches.subcommand_matches("generate") {
        // Logging is opt-in via `--log`.
        if matches.is_present("log") {
            logger::init();
        }
        let minimize = !matches.is_present("no-minimize");
        // Parse `--log-state` values as decimal state ids, silently
        // dropping anything that isn't a number.
        let state_ids_to_log = matches
            .values_of("state-ids-to-log")
            .map_or(Vec::new(), |ids| {
                ids.filter_map(|id| usize::from_str_radix(id, 10).ok())
                    .collect()
            });
        // The grammar is always loaded from `grammar.js` in the current
        // working directory.
        let mut grammar_path = env::current_dir().expect("Failed to read CWD");
        grammar_path.push("grammar.js");
        let grammar_json = load_js_grammar_file(grammar_path);
        // The generated parser source is written to stdout.
        let code =
            generate::generate_parser_for_grammar(&grammar_json, minimize, state_ids_to_log)?;
        println!("{}", code);
    }
    Ok(())
}
fn load_js_grammar_file(grammar_path: PathBuf) -> String {
let mut node_process = Command::new("node")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.expect("Failed to run `node`");
let js_prelude = include_str!("./js/dsl.js");
let mut node_stdin = node_process
.stdin
.take()
.expect("Failed to open stdin for node");
write!(
node_stdin,
"{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n",
js_prelude,
grammar_path.to_str().unwrap()
)
.expect("Failed to write to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()
.expect("Failed to read output from node");
match output.status.code() {
None => panic!("Node process was killed"),
Some(0) => {}
Some(code) => panic!(format!("Node process exited with status {}", code)),
}
String::from_utf8(output.stdout).expect("Got invalid UTF8 from node")
}

771
cli/src/nfa.rs Normal file
View file

@ -0,0 +1,771 @@
use std::char;
use std::cmp::max;
use std::cmp::Ordering;
use std::fmt;
use std::mem::swap;
/// A set of characters, stored either as a sorted whitelist (`Include`) or
/// a sorted blacklist (`Exclude`: "every character except these").
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CharacterSet {
    Include(Vec<char>),
    Exclude(Vec<char>),
}
/// One state of the NFA.
#[derive(Debug, PartialEq, Eq)]
pub enum NfaState {
    /// Consume a character in `chars` and move to `state_id`.
    Advance {
        chars: CharacterSet,
        state_id: u32,
        is_sep: bool,
        precedence: i32,
    },
    /// Epsilon transition to both of the given states.
    Split(u32, u32),
    /// Accepting state for the token variable `variable_index`.
    Accept {
        variable_index: usize,
        precedence: i32,
    },
}
/// A non-deterministic finite automaton, stored as a flat vector of states
/// addressed by index.
#[derive(PartialEq, Eq)]
pub struct Nfa {
    pub states: Vec<NfaState>,
}
/// A simulation of an `Nfa`: the sorted set of state ids that are currently
/// active.
#[derive(Debug)]
pub struct NfaCursor<'a> {
    pub(crate) state_ids: Vec<u32>,
    nfa: &'a Nfa,
}
/// A grouped transition out of a set of NFA states: the characters that
/// trigger it and the successor states it leads to.
#[derive(Debug, PartialEq, Eq)]
pub struct NfaTransition {
    pub characters: CharacterSet,
    pub is_separator: bool,
    pub precedence: i32,
    pub states: Vec<u32>,
}
impl Default for Nfa {
    /// An NFA with no states (same as `Nfa::new`).
    fn default() -> Self {
        Self { states: Vec::new() }
    }
}
impl CharacterSet {
    /// The empty set: a whitelist containing no characters.
    pub fn empty() -> Self {
        CharacterSet::Include(Vec::new())
    }

    /// The universal set: a blacklist excluding no characters.
    pub fn all() -> Self {
        CharacterSet::Exclude(Vec::new())
    }

    /// Invert the set: a whitelist becomes a blacklist of the same
    /// characters, and vice versa.
    pub fn negate(self) -> CharacterSet {
        match self {
            CharacterSet::Include(chars) => CharacterSet::Exclude(chars),
            CharacterSet::Exclude(chars) => CharacterSet::Include(chars),
        }
    }

    /// Add a single character to a whitelist, keeping it sorted and
    /// deduplicated. Panics if called on a blacklist.
    pub fn add_char(self, c: char) -> Self {
        if let CharacterSet::Include(mut chars) = self {
            if let Err(i) = chars.binary_search(&c) {
                chars.insert(i, c);
            }
            CharacterSet::Include(chars)
        } else {
            panic!("Called add with a negated character set");
        }
    }

    /// Add the inclusive range of characters `start..=end` to a whitelist.
    /// Panics if called on a blacklist.
    pub fn add_range(self, start: char, end: char) -> Self {
        if let CharacterSet::Include(mut chars) = self {
            let mut c = start as u32;
            while c <= end as u32 {
                // Skip code points that are not valid `char`s (the surrogate
                // range U+D800..=U+DFFF): the previous `unwrap` here would
                // panic on any range that crossed it.
                if let Some(ch) = char::from_u32(c) {
                    chars.push(ch);
                }
                c += 1;
            }
            chars.sort_unstable();
            chars.dedup();
            CharacterSet::Include(chars)
        } else {
            panic!("Called add with a negated character set");
        }
    }

    /// Union `other` into this set. The result is a whitelist only when
    /// both inputs are whitelists.
    pub fn add(self, other: &CharacterSet) -> Self {
        match self {
            CharacterSet::Include(mut chars) => match other {
                CharacterSet::Include(other_chars) => {
                    chars.extend(other_chars);
                    chars.sort_unstable();
                    chars.dedup();
                    CharacterSet::Include(chars)
                }
                CharacterSet::Exclude(other_chars) => {
                    // Include(a) ∪ Exclude(b) excludes exactly b \ a.
                    let excluded_chars = other_chars
                        .iter()
                        .cloned()
                        .filter(|c| !chars.contains(&c))
                        .collect();
                    CharacterSet::Exclude(excluded_chars)
                }
            },
            CharacterSet::Exclude(mut chars) => match other {
                CharacterSet::Include(other_chars) => {
                    // Exclude(a) ∪ Include(b) excludes a \ b.
                    chars.retain(|c| !other_chars.contains(&c));
                    CharacterSet::Exclude(chars)
                }
                CharacterSet::Exclude(other_chars) => {
                    // Exclude(a) ∪ Exclude(b) excludes a ∩ b.
                    chars.retain(|c| other_chars.contains(&c));
                    CharacterSet::Exclude(chars)
                }
            },
        }
    }

    /// Whether the two sets share at least one character.
    pub fn does_intersect(&self, other: &CharacterSet) -> bool {
        match self {
            CharacterSet::Include(chars) => match other {
                CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).common,
                CharacterSet::Exclude(other_chars) => compare_chars(chars, other_chars).left_only,
            },
            CharacterSet::Exclude(chars) => match other {
                CharacterSet::Include(other_chars) => compare_chars(chars, other_chars).right_only,
                // Two blacklists over an unbounded alphabet always share a
                // character.
                CharacterSet::Exclude(_) => true,
            },
        }
    }

    /// Remove the intersection of `self` and `other` from both sets,
    /// returning the intersection itself.
    pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
        match self {
            CharacterSet::Include(chars) => match other {
                CharacterSet::Include(other_chars) => {
                    CharacterSet::Include(remove_chars(chars, other_chars, true))
                }
                CharacterSet::Exclude(other_chars) => {
                    // The intersection is the whitelist minus the excluded
                    // characters; the whitelist characters that survived are
                    // then also added to the exclusion list.
                    let mut removed = remove_chars(chars, other_chars, false);
                    add_chars(other_chars, chars);
                    swap(&mut removed, chars);
                    CharacterSet::Include(removed)
                }
            },
            CharacterSet::Exclude(chars) => match other {
                CharacterSet::Include(other_chars) => {
                    // Mirror image of the case above.
                    let mut removed = remove_chars(other_chars, chars, false);
                    add_chars(chars, other_chars);
                    swap(&mut removed, other_chars);
                    CharacterSet::Include(removed)
                }
                CharacterSet::Exclude(other_chars) => {
                    // The intersection of two blacklists excludes the union
                    // of their exclusions; each side is left as a whitelist
                    // of the characters the other excludes but it does not.
                    let mut result_exclusion = chars.clone();
                    result_exclusion.extend(other_chars.iter().cloned());
                    result_exclusion.sort_unstable();
                    result_exclusion.dedup();
                    remove_chars(chars, other_chars, true);
                    let mut included_characters = Vec::new();
                    let mut other_included_characters = Vec::new();
                    swap(&mut included_characters, other_chars);
                    swap(&mut other_included_characters, chars);
                    *self = CharacterSet::Include(included_characters);
                    *other = CharacterSet::Include(other_included_characters);
                    CharacterSet::Exclude(result_exclusion)
                }
            },
        }
    }

    /// True only for an empty whitelist; a blacklist is never empty.
    pub fn is_empty(&self) -> bool {
        if let CharacterSet::Include(c) = self {
            c.is_empty()
        } else {
            false
        }
    }

    /// Whether the set contains the character `c`.
    pub fn contains(&self, c: char) -> bool {
        match self {
            CharacterSet::Include(chars) => chars.contains(&c),
            CharacterSet::Exclude(chars) => !chars.contains(&c),
        }
    }
}
impl Ord for CharacterSet {
    /// Total order over character sets: all whitelists sort before all
    /// blacklists; within a variant, sets are ordered by `order_chars`.
    fn cmp(&self, other: &CharacterSet) -> Ordering {
        match (self, other) {
            (CharacterSet::Include(a), CharacterSet::Include(b)) => order_chars(a, b),
            (CharacterSet::Include(_), CharacterSet::Exclude(_)) => Ordering::Less,
            (CharacterSet::Exclude(a), CharacterSet::Exclude(b)) => order_chars(a, b),
            (CharacterSet::Exclude(_), CharacterSet::Include(_)) => Ordering::Greater,
        }
    }
}
impl PartialOrd for CharacterSet {
    /// Delegates to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &CharacterSet) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
/// Insert every character of `right` into the sorted vector `left`,
/// preserving sort order and skipping characters already present.
fn add_chars(left: &mut Vec<char>, right: &Vec<char>) {
    for &c in right {
        if let Err(insert_index) = left.binary_search(&c) {
            left.insert(insert_index, c);
        }
    }
}
/// Remove from `left` every character that also appears in `right`,
/// returning the removed characters (the intersection, in `right`'s order).
///
/// When `mutate_right` is true the shared characters are also removed from
/// `right`; otherwise `right` is left unchanged.
fn remove_chars(left: &mut Vec<char>, right: &mut Vec<char>, mutate_right: bool) -> Vec<char> {
    let mut result = Vec::new();
    right.retain(|right_char| {
        if let Some(index) = left.iter().position(|left_char| *left_char == *right_char) {
            left.remove(index);
            result.push(*right_char);
            // Keep the character in `right` only when not mutating it.
            // (Was the redundant `false || !mutate_right`.)
            !mutate_right
        } else {
            true
        }
    });
    result
}
/// Result of comparing two sorted character vectors: which sides hold
/// characters the other does not, and whether they share any.
// NOTE(review): the name is a typo for `SetComparison`; kept as-is since it
// is referenced by `compare_chars` below.
struct SetComparision {
    left_only: bool,
    common: bool,
    right_only: bool,
}

/// Walk two *sorted* character vectors in lockstep (a merge scan) and
/// report which sides have exclusive characters and whether any overlap.
fn compare_chars(left: &Vec<char>, right: &Vec<char>) -> SetComparision {
    let mut result = SetComparision {
        left_only: false,
        common: false,
        right_only: false,
    };
    let mut left = left.iter().cloned();
    let mut right = right.iter().cloned();
    let mut i = left.next();
    let mut j = right.next();
    while let (Some(left_char), Some(right_char)) = (i, j) {
        if left_char < right_char {
            i = left.next();
            result.left_only = true;
        } else if left_char > right_char {
            j = right.next();
            result.right_only = true;
        } else {
            i = left.next();
            j = right.next();
            result.common = true;
        }
    }
    // Characters remaining on either side after the other is exhausted are
    // exclusive to that side. Without this, comparing e.g. ['b'] against
    // ['a'] never set `left_only`, which made
    // `Include(['b']).does_intersect(&Exclude(['a']))` wrongly report false.
    if i.is_some() {
        result.left_only = true;
    }
    if j.is_some() {
        result.right_only = true;
    }
    result
}
/// Order two sorted character vectors: shorter vectors sort first, and
/// equal-length vectors are compared lexicographically. Used to give
/// `CharacterSet` a total order.
fn order_chars(chars: &Vec<char>, other_chars: &Vec<char>) -> Ordering {
    chars
        .len()
        .cmp(&other_chars.len())
        .then_with(|| chars.iter().cmp(other_chars.iter()))
}
impl Nfa {
    /// Create an empty NFA.
    pub fn new() -> Self {
        Nfa { states: Vec::new() }
    }

    /// Id of the most recently pushed state.
    ///
    /// NOTE(review): underflows (panics in debug builds) if `states` is
    /// empty — callers must push at least one state first.
    pub fn last_state_id(&self) -> u32 {
        self.states.len() as u32 - 1
    }
}
impl fmt::Debug for Nfa {
    /// Multi-line listing of the NFA's states, one per line, indexed by id.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Nfa {{ states: {{")?;
        for (i, state) in self.states.iter().enumerate() {
            writeln!(f, "  {}: {:?},", i, state)?;
        }
        write!(f, "}} }}")
    }
}
impl<'a> NfaCursor<'a> {
    /// Create a cursor positioned at the given states (expanding any
    /// `Split` states into their targets).
    pub fn new(nfa: &'a Nfa, mut states: Vec<u32>) -> Self {
        let mut result = Self {
            nfa,
            state_ids: Vec::new(),
        };
        result.add_states(&mut states);
        result
    }

    /// Reposition the cursor at the given states, expanding `Split`s.
    pub fn reset(&mut self, mut states: Vec<u32>) {
        self.state_ids.clear();
        self.add_states(&mut states);
    }

    /// Reposition the cursor at exactly the given states, without
    /// expanding `Split` states.
    pub fn force_reset(&mut self, states: Vec<u32>) {
        self.state_ids = states
    }

    /// The character set (and separator flag) of every outgoing `Advance`
    /// transition, without any grouping.
    pub fn transition_chars(&self) -> impl Iterator<Item = (&CharacterSet, bool)> {
        self.raw_transitions().map(|t| (t.0, t.1))
    }

    /// All outgoing transitions, grouped so the returned character sets
    /// are mutually disjoint.
    pub fn transitions(&self) -> Vec<NfaTransition> {
        Self::group_transitions(self.raw_transitions())
    }

    // (chars, is_separator, precedence, successor) for each active state
    // that is an `Advance`.
    fn raw_transitions(&self) -> impl Iterator<Item = (&CharacterSet, bool, i32, u32)> {
        self.state_ids.iter().filter_map(move |id| {
            if let NfaState::Advance {
                chars,
                state_id,
                precedence,
                is_sep,
            } = &self.nfa.states[*id as usize]
            {
                Some((chars, *is_sep, *precedence, *state_id))
            } else {
                None
            }
        })
    }

    /// Combine raw transitions into transitions over *disjoint* character
    /// sets: overlapping sets are split apart, and each overlap merges the
    /// successor states, separator flags, and precedences of its sources.
    fn group_transitions<'b>(
        iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>,
    ) -> Vec<NfaTransition> {
        let mut result: Vec<NfaTransition> = Vec::new();
        for (chars, is_sep, prec, state) in iter {
            let mut chars = chars.clone();
            let mut i = 0;
            while i < result.len() && !chars.is_empty() {
                // Carve the overlap out of both the existing entry and the
                // incoming set.
                let intersection = result[i].characters.remove_intersection(&mut chars);
                if !intersection.is_empty() {
                    // Merge this successor into the overlap's sorted state
                    // list.
                    let mut intersection_states = result[i].states.clone();
                    match intersection_states.binary_search(&state) {
                        Err(j) => intersection_states.insert(j, state),
                        _ => {}
                    }
                    let intersection_transition = NfaTransition {
                        characters: intersection,
                        is_separator: result[i].is_separator || is_sep,
                        precedence: max(result[i].precedence, prec),
                        states: intersection_states,
                    };
                    if result[i].characters.is_empty() {
                        // The existing entry was entirely consumed by the
                        // overlap; replace it in place.
                        result[i] = intersection_transition;
                    } else {
                        result.insert(i, intersection_transition);
                        i += 1;
                    }
                }
                i += 1;
            }
            // Whatever remains of the incoming set overlaps nothing seen
            // so far.
            if !chars.is_empty() {
                result.push(NfaTransition {
                    characters: chars,
                    precedence: prec,
                    states: vec![state],
                    is_separator: is_sep,
                });
            }
        }
        result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
        result
    }

    /// (variable_index, precedence) for every active `Accept` state.
    pub fn completions(&self) -> impl Iterator<Item = (usize, i32)> + '_ {
        self.state_ids.iter().filter_map(move |state_id| {
            if let NfaState::Accept {
                variable_index,
                precedence,
            } = self.nfa.states[*state_id as usize]
            {
                Some((variable_index, precedence))
            } else {
                None
            }
        })
    }

    /// Insert the given states into the cursor's sorted state set,
    /// iteratively expanding `Split` states into their two targets.
    pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
        let mut i = 0;
        while i < new_state_ids.len() {
            let state_id = new_state_ids[i];
            let state = &self.nfa.states[state_id as usize];
            if let NfaState::Split(left, right) = state {
                // Enqueue each branch at most once.
                let mut has_left = false;
                let mut has_right = false;
                for new_state_id in new_state_ids.iter() {
                    if *new_state_id == *left {
                        has_left = true;
                    }
                    if *new_state_id == *right {
                        has_right = true;
                    }
                }
                if !has_left {
                    new_state_ids.push(*left);
                }
                if !has_right {
                    new_state_ids.push(*right);
                }
            } else if let Err(i) = self.state_ids.binary_search(&state_id) {
                self.state_ids.insert(i, state_id);
            }
            i += 1;
        }
    }
}
// Unit tests for transition grouping and `CharacterSet` set algebra.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_group_transitions() {
        // Each row is (raw transitions, expected grouped transitions).
        let table = [
            // overlapping character classes
            (
                vec![
                    (CharacterSet::empty().add_range('a', 'f'), false, 0, 1),
                    (CharacterSet::empty().add_range('d', 'i'), false, 1, 2),
                ],
                vec![
                    NfaTransition {
                        characters: CharacterSet::empty().add_range('a', 'c'),
                        is_separator: false,
                        precedence: 0,
                        states: vec![1],
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_range('d', 'f'),
                        is_separator: false,
                        precedence: 1,
                        states: vec![1, 2],
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_range('g', 'i'),
                        is_separator: false,
                        precedence: 1,
                        states: vec![2],
                    },
                ],
            ),
            // large character class followed by many individual characters
            (
                vec![
                    (CharacterSet::empty().add_range('a', 'z'), false, 0, 1),
                    (CharacterSet::empty().add_char('d'), false, 0, 2),
                    (CharacterSet::empty().add_char('i'), false, 0, 3),
                    (CharacterSet::empty().add_char('f'), false, 0, 4),
                ],
                vec![
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('d'),
                        is_separator: false,
                        precedence: 0,
                        states: vec![1, 2],
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('f'),
                        is_separator: false,
                        precedence: 0,
                        states: vec![1, 4],
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('i'),
                        is_separator: false,
                        precedence: 0,
                        states: vec![1, 3],
                    },
                    NfaTransition {
                        characters: CharacterSet::empty()
                            .add_range('a', 'c')
                            .add_char('e')
                            .add_range('g', 'h')
                            .add_range('j', 'z'),
                        is_separator: false,
                        precedence: 0,
                        states: vec![1],
                    },
                ],
            ),
            // negated character class followed by an individual character
            (
                vec![
                    (CharacterSet::empty().add_char('0'), false, 0, 1),
                    (CharacterSet::empty().add_char('b'), false, 0, 2),
                    (
                        CharacterSet::empty().add_range('a', 'f').negate(),
                        false,
                        0,
                        3,
                    ),
                    (CharacterSet::empty().add_char('c'), false, 0, 4),
                ],
                vec![
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('0'),
                        precedence: 0,
                        states: vec![1, 3],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('b'),
                        precedence: 0,
                        states: vec![2],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::empty().add_char('c'),
                        precedence: 0,
                        states: vec![4],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::empty()
                            .add_range('a', 'f')
                            .add_char('0')
                            .negate(),
                        precedence: 0,
                        states: vec![3],
                        is_separator: false,
                    },
                ],
            ),
            // multiple negated character classes
            (
                vec![
                    (CharacterSet::Include(vec!['a']), false, 0, 1),
                    (CharacterSet::Exclude(vec!['a', 'b', 'c']), false, 0, 2),
                    (CharacterSet::Include(vec!['g']), false, 0, 6),
                    (CharacterSet::Exclude(vec!['d', 'e', 'f']), false, 0, 3),
                    (CharacterSet::Exclude(vec!['g', 'h', 'i']), false, 0, 4),
                    (CharacterSet::Include(vec!['g']), false, 0, 5),
                ],
                vec![
                    NfaTransition {
                        characters: CharacterSet::Include(vec!['a']),
                        precedence: 0,
                        states: vec![1, 3, 4],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::Include(vec!['g']),
                        precedence: 0,
                        states: vec![2, 3, 5, 6],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::Include(vec!['b', 'c']),
                        precedence: 0,
                        states: vec![3, 4],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::Include(vec!['h', 'i']),
                        precedence: 0,
                        states: vec![2, 3],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::Include(vec!['d', 'e', 'f']),
                        precedence: 0,
                        states: vec![2, 4],
                        is_separator: false,
                    },
                    NfaTransition {
                        characters: CharacterSet::Exclude(vec![
                            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                        ]),
                        precedence: 0,
                        states: vec![2, 3, 4],
                        is_separator: false,
                    },
                ],
            ),
        ];

        for row in table.iter() {
            assert_eq!(
                NfaCursor::group_transitions(row.0.iter().map(|(c, sep, p, s)| (c, *sep, *p, *s))),
                row.1
            );
        }
    }

    #[test]
    fn test_character_set_remove_intersection() {
        // A whitelist and an overlapping whitelist.
        // Both sets contain 'c', 'd', and 'f'
        let mut a = CharacterSet::empty().add_range('a', 'f');
        let mut b = CharacterSet::empty().add_range('c', 'h');
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::empty().add_range('c', 'f')
        );
        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));

        // Same pair, removing in the opposite direction — symmetric result.
        let mut a = CharacterSet::empty().add_range('a', 'f');
        let mut b = CharacterSet::empty().add_range('c', 'h');
        assert_eq!(
            b.remove_intersection(&mut a),
            CharacterSet::empty().add_range('c', 'f')
        );
        assert_eq!(a, CharacterSet::empty().add_range('a', 'b'));
        assert_eq!(b, CharacterSet::empty().add_range('g', 'h'));

        // A whitelist and a larger whitelist.
        let mut a = CharacterSet::empty().add_char('c');
        let mut b = CharacterSet::empty().add_range('a', 'e');
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::empty().add_char('c')
        );
        assert_eq!(a, CharacterSet::empty());
        assert_eq!(
            b,
            CharacterSet::empty()
                .add_range('a', 'b')
                .add_range('d', 'e')
        );

        let mut a = CharacterSet::empty().add_char('c');
        let mut b = CharacterSet::empty().add_range('a', 'e');
        assert_eq!(
            b.remove_intersection(&mut a),
            CharacterSet::empty().add_char('c')
        );
        assert_eq!(a, CharacterSet::empty());
        assert_eq!(
            b,
            CharacterSet::empty()
                .add_range('a', 'b')
                .add_range('d', 'e')
        );

        // A whitelist and an intersecting blacklist.
        // Both sets contain 'e', 'f', and 'm'
        let mut a = CharacterSet::empty()
            .add_range('c', 'h')
            .add_range('k', 'm');
        let mut b = CharacterSet::empty()
            .add_range('a', 'd')
            .add_range('g', 'l')
            .negate();
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::Include(vec!['e', 'f', 'm'])
        );
        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());

        let mut a = CharacterSet::empty()
            .add_range('c', 'h')
            .add_range('k', 'm');
        let mut b = CharacterSet::empty()
            .add_range('a', 'd')
            .add_range('g', 'l')
            .negate();
        assert_eq!(
            b.remove_intersection(&mut a),
            CharacterSet::Include(vec!['e', 'f', 'm'])
        );
        assert_eq!(a, CharacterSet::Include(vec!['c', 'd', 'g', 'h', 'k', 'l']));
        assert_eq!(b, CharacterSet::empty().add_range('a', 'm').negate());

        // A blacklist and an overlapping blacklist.
        // Both sets exclude 'c', 'd', and 'e'
        let mut a = CharacterSet::empty().add_range('a', 'e').negate();
        let mut b = CharacterSet::empty().add_range('c', 'h').negate();
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::empty().add_range('a', 'h').negate(),
        );
        assert_eq!(a, CharacterSet::Include(vec!['f', 'g', 'h']));
        assert_eq!(b, CharacterSet::Include(vec!['a', 'b']));

        // A blacklist and a larger blacklist.
        let mut a = CharacterSet::empty().add_range('b', 'c').negate();
        let mut b = CharacterSet::empty().add_range('a', 'd').negate();
        assert_eq!(
            a.remove_intersection(&mut b),
            CharacterSet::empty().add_range('a', 'd').negate(),
        );
        assert_eq!(a, CharacterSet::empty().add_char('a').add_char('d'));
        assert_eq!(b, CharacterSet::empty());
    }

    #[test]
    fn test_character_set_does_intersect() {
        let (a, b) = (CharacterSet::empty(), CharacterSet::empty());
        assert!(!a.does_intersect(&b));
        assert!(!b.does_intersect(&a));

        let (a, b) = (
            CharacterSet::empty().add_char('a'),
            CharacterSet::empty().add_char('a'),
        );
        assert!(a.does_intersect(&b));
        assert!(b.does_intersect(&a));

        let (a, b) = (
            CharacterSet::empty().add_char('b'),
            CharacterSet::empty().add_char('a').add_char('c'),
        );
        assert!(!a.does_intersect(&b));
        assert!(!b.does_intersect(&a));

        let (a, b) = (
            CharacterSet::Include(vec!['b']),
            CharacterSet::Exclude(vec!['a', 'b', 'c']),
        );
        assert!(!a.does_intersect(&b));
        assert!(!b.does_intersect(&a));

        let (a, b) = (
            CharacterSet::Include(vec!['b']),
            CharacterSet::Exclude(vec!['a', 'c']),
        );
        assert!(a.does_intersect(&b));
        assert!(b.does_intersect(&a));

        // Two blacklists always intersect.
        let (a, b) = (
            CharacterSet::Exclude(vec!['a']),
            CharacterSet::Exclude(vec!['a']),
        );
        assert!(a.does_intersect(&b));
        assert!(b.does_intersect(&a));
    }
}

167
cli/src/parse_grammar.rs Normal file
View file

@ -0,0 +1,167 @@
use serde_json::{Map, Value};
use crate::error::Result;
use crate::grammars::{InputGrammar, Variable, VariableType};
use crate::rules::Rule;
/// JSON representation of a grammar rule, tagged by its `type` field.
/// The variant names match the `type` strings emitted by the JavaScript
/// grammar DSL, hence the non-camel-case allowance.
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
enum RuleJSON {
    ALIAS {
        content: Box<RuleJSON>,
        named: bool,
        value: String,
    },
    BLANK,
    STRING {
        value: String,
    },
    PATTERN {
        value: String,
    },
    SYMBOL {
        name: String,
    },
    CHOICE {
        members: Vec<RuleJSON>,
    },
    SEQ {
        members: Vec<RuleJSON>,
    },
    REPEAT {
        content: Box<RuleJSON>,
    },
    REPEAT1 {
        content: Box<RuleJSON>,
    },
    PREC_DYNAMIC {
        value: i32,
        content: Box<RuleJSON>,
    },
    PREC_LEFT {
        value: i32,
        content: Box<RuleJSON>,
    },
    PREC_RIGHT {
        value: i32,
        content: Box<RuleJSON>,
    },
    PREC {
        value: i32,
        content: Box<RuleJSON>,
    },
    TOKEN {
        content: Box<RuleJSON>,
    },
    IMMEDIATE_TOKEN {
        content: Box<RuleJSON>,
    },
}
/// Top-level JSON structure produced by evaluating a `grammar.js` file.
/// Every section other than `name` and `rules` is optional.
#[derive(Deserialize)]
struct GrammarJSON {
    name: String,
    // Rule bodies are kept as raw JSON values and converted one at a time
    // in `parse_grammar`.
    rules: Map<String, Value>,
    conflicts: Option<Vec<Vec<String>>>,
    externals: Option<Vec<RuleJSON>>,
    extras: Option<Vec<RuleJSON>>,
    inline: Option<Vec<String>>,
    word: Option<String>,
}
/// Parse a grammar's JSON representation into an `InputGrammar`.
///
/// Returns an error if the input is not valid JSON or if any rule does not
/// match the `RuleJSON` schema.
pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
    let grammar_json: GrammarJSON = serde_json::from_str(&input)?;

    let mut variables = Vec::with_capacity(grammar_json.rules.len());
    for (name, value) in grammar_json.rules {
        variables.push(Variable {
            // `name` is already an owned String; the previous
            // `name.to_owned()` cloned it needlessly.
            name,
            kind: VariableType::Named,
            rule: parse_rule(serde_json::from_value(value)?),
        })
    }

    // All of these sections may be absent from the JSON.
    let extra_tokens = grammar_json
        .extras
        .unwrap_or_default()
        .into_iter()
        .map(parse_rule)
        .collect();
    let external_tokens = grammar_json
        .externals
        .unwrap_or_default()
        .into_iter()
        .map(parse_rule)
        .collect();
    let expected_conflicts = grammar_json.conflicts.unwrap_or_default();
    let variables_to_inline = grammar_json.inline.unwrap_or_default();

    Ok(InputGrammar {
        name: grammar_json.name,
        word_token: grammar_json.word,
        variables,
        extra_tokens,
        expected_conflicts,
        external_tokens,
        variables_to_inline,
    })
}
/// Convert a deserialized `RuleJSON` tree into the internal `Rule` type.
/// `REPEAT` (zero-or-more) is desugared into a choice between `REPEAT1`'s
/// expansion and a blank rule.
fn parse_rule(json: RuleJSON) -> Rule {
    match json {
        RuleJSON::ALIAS { content, value, named } => Rule::alias(parse_rule(*content), value, named),
        RuleJSON::BLANK => Rule::Blank,
        RuleJSON::STRING { value } => Rule::String(value),
        RuleJSON::PATTERN { value } => Rule::Pattern(value),
        RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
        RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
        RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()),
        RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)),
        RuleJSON::REPEAT { content } => Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank]),
        RuleJSON::PREC { value, content } => Rule::prec(value, parse_rule(*content)),
        RuleJSON::PREC_LEFT { value, content } => Rule::prec_left(value, parse_rule(*content)),
        RuleJSON::PREC_RIGHT { value, content } => Rule::prec_right(value, parse_rule(*content)),
        RuleJSON::PREC_DYNAMIC { value, content } => Rule::prec_dynamic(value, parse_rule(*content)),
        RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
        RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
    }
}
// Round-trip check: JSON grammar text -> InputGrammar variables.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_grammar() {
        let grammar = parse_grammar(r#"{
            "name": "my_lang",
            "rules": {
                "file": {
                    "type": "REPEAT1",
                    "content": {
                        "type": "SYMBOL",
                        "name": "statement"
                    }
                },
                "statement": {
                    "type": "STRING",
                    "value": "foo"
                }
            }
        }"#).unwrap();

        assert_eq!(grammar.name, "my_lang");
        assert_eq!(grammar.variables, vec![
            Variable {
                name: "file".to_string(),
                kind: VariableType::Named,
                rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
            },
            Variable {
                name: "statement".to_string(),
                kind: VariableType::Named,
                rule: Rule::String("foo".to_string())
            },
        ]);
    }
}

View file

@ -0,0 +1,241 @@
use super::ExtractedSyntaxGrammar;
use crate::grammars::{Variable, VariableType};
use crate::rules::{Rule, Symbol};
use hashbrown::HashMap;
use std::mem;
/// Mutable state used while replacing `Repeat` rules with auxiliary
/// recursive variables.
struct Expander {
    // Name of the variable currently being expanded; used to name its
    // auxiliary repeat variables.
    variable_name: String,
    // How many repeats have been expanded within the current variable.
    repeat_count_in_variable: usize,
    // Number of variables preceding the auxiliary ones; used to compute
    // each new non-terminal symbol's index.
    preceding_symbol_count: usize,
    // Auxiliary variables created so far, appended to the grammar at the end.
    auxiliary_variables: Vec<Variable>,
    // Maps an already-expanded repeated rule to its auxiliary symbol so
    // that identical repeats share one variable.
    existing_repeats: HashMap<Rule, Symbol>,
}
impl Expander {
    /// Expand all repeats inside one variable's rule, temporarily swapping
    /// the rule out of the variable while it is rewritten.
    fn expand_variable(&mut self, variable: &mut Variable) {
        self.variable_name.clear();
        self.variable_name.push_str(&variable.name);
        self.repeat_count_in_variable = 0;
        let mut rule = Rule::Blank;
        mem::swap(&mut rule, &mut variable.rule);
        variable.rule = self.expand_rule(&rule);
    }

    /// Recursively rewrite `rule`, replacing every `Repeat` with a symbol
    /// referring to a (possibly shared) auxiliary variable.
    fn expand_rule(&mut self, rule: &Rule) -> Rule {
        match rule {
            Rule::Choice(elements) => Rule::Choice(
                elements
                    .iter()
                    .map(|element| self.expand_rule(element))
                    .collect(),
            ),
            Rule::Seq(elements) => Rule::Seq(
                elements
                    .iter()
                    .map(|element| self.expand_rule(element))
                    .collect(),
            ),
            Rule::Repeat(content) => {
                // Expand inner repeats first so deduplication keys on the
                // fully-expanded rule.
                let inner_rule = self.expand_rule(content);
                // Reuse the auxiliary variable if this exact rule has
                // already been expanded anywhere in the grammar.
                if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) {
                    return Rule::Symbol(*existing_symbol);
                }
                self.repeat_count_in_variable += 1;
                let rule_name = format!(
                    "{}_repeat{}",
                    self.variable_name, self.repeat_count_in_variable
                );
                // The new symbol's index follows all original variables
                // plus the auxiliary variables created so far.
                let repeat_symbol = Symbol::non_terminal(
                    self.preceding_symbol_count + self.auxiliary_variables.len(),
                );
                self.existing_repeats
                    .insert(inner_rule.clone(), repeat_symbol);
                // aux -> (aux aux) | inner : matches one or more `inner`s
                // via balanced self-recursion.
                self.auxiliary_variables.push(Variable {
                    name: rule_name,
                    kind: VariableType::Auxiliary,
                    rule: Rule::Choice(vec![
                        Rule::Seq(vec![
                            Rule::Symbol(repeat_symbol),
                            Rule::Symbol(repeat_symbol),
                        ]),
                        inner_rule,
                    ]),
                });
                Rule::Symbol(repeat_symbol)
            }
            Rule::Metadata { rule, params } => Rule::Metadata {
                rule: Box::new(self.expand_rule(rule)),
                params: params.clone(),
            },
            _ => rule.clone(),
        }
    }
}
/// Replace every `Rule::Repeat` in the grammar with an auxiliary recursive
/// variable, so later stages only deal with sequences and choices.
///
/// Identical repeated rules share one auxiliary variable, and all auxiliary
/// variables are appended after the grammar's original variables.
pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSyntaxGrammar {
    let mut expander = Expander {
        variable_name: String::new(),
        repeat_count_in_variable: 0,
        // Auxiliary symbols are numbered after the existing variables.
        preceding_symbol_count: grammar.variables.len(),
        auxiliary_variables: Vec::new(),
        existing_repeats: HashMap::new(),
    };
    // `iter_mut` already yields `&mut Variable`; the old
    // `for mut variable in ...` took a needless `mut` binding and re-borrow.
    for variable in grammar.variables.iter_mut() {
        expander.expand_variable(variable);
    }
    grammar
        .variables
        .extend(expander.auxiliary_variables.into_iter());
    grammar
}
// Unit tests for repeat expansion: basic expansion, sharing of identical
// repeats, and nested repeats.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_repeat_expansion() {
        // Repeats nested inside of sequences and choices are expanded.
        let grammar = expand_repeats(build_grammar(vec![Variable::named(
            "rule0",
            Rule::seq(vec![
                Rule::terminal(10),
                Rule::choice(vec![
                    Rule::repeat(Rule::terminal(11)),
                    Rule::repeat(Rule::terminal(12)),
                ]),
                Rule::terminal(13),
            ]),
        )]));

        assert_eq!(
            grammar.variables,
            vec![
                Variable::named(
                    "rule0",
                    Rule::seq(vec![
                        Rule::terminal(10),
                        Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]),
                        Rule::terminal(13),
                    ])
                ),
                Variable::auxiliary(
                    "rule0_repeat1",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
                        Rule::terminal(11),
                    ])
                ),
                Variable::auxiliary(
                    "rule0_repeat2",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
                        Rule::terminal(12),
                    ])
                ),
            ]
        );
    }

    #[test]
    fn test_repeat_deduplication() {
        // Terminal 4 appears inside of a repeat in three different places.
        let grammar = expand_repeats(build_grammar(vec![
            Variable::named(
                "rule0",
                Rule::choice(vec![
                    Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]),
                    Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]),
                ]),
            ),
            Variable::named(
                "rule1",
                Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]),
            ),
        ]));

        // Only one auxiliary rule is created for repeating terminal 4.
        assert_eq!(
            grammar.variables,
            vec![
                Variable::named(
                    "rule0",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]),
                        Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]),
                    ])
                ),
                Variable::named(
                    "rule1",
                    Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),])
                ),
                Variable::auxiliary(
                    "rule0_repeat1",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
                        Rule::terminal(4),
                    ])
                )
            ]
        );
    }

    #[test]
    fn test_expansion_of_nested_repeats() {
        let grammar = expand_repeats(build_grammar(vec![Variable::named(
            "rule0",
            Rule::seq(vec![
                Rule::terminal(10),
                Rule::repeat(Rule::seq(vec![
                    Rule::terminal(11),
                    Rule::repeat(Rule::terminal(12)),
                ])),
            ]),
        )]));

        assert_eq!(
            grammar.variables,
            vec![
                Variable::named(
                    "rule0",
                    Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),])
                ),
                Variable::auxiliary(
                    "rule0_repeat1",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
                        Rule::terminal(12),
                    ])
                ),
                Variable::auxiliary(
                    "rule0_repeat2",
                    Rule::choice(vec![
                        Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
                        Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]),
                    ])
                ),
            ]
        );
    }

    // Wrap a variable list in an otherwise-empty grammar for these tests.
    fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
        ExtractedSyntaxGrammar {
            variables,
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            word_token: None,
        }
    }
}

View file

@ -0,0 +1,611 @@
use super::ExtractedLexicalGrammar;
use crate::error::{Error, Result};
use crate::grammars::{LexicalGrammar, LexicalVariable};
use crate::nfa::{CharacterSet, Nfa, NfaState};
use crate::rules::Rule;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
};
use std::i32;
/// Builder that accumulates NFA states while expanding token rules.
struct NfaBuilder {
    nfa: Nfa,
    // Whether the rule currently being expanded is a separator (set around
    // the separator-rule expansion in `expand_tokens`); presumably recorded
    // on the `Advance` states created by `push_advance` — confirm there.
    is_sep: bool,
    // Stack of precedence values from enclosing `Metadata` rules; pushed
    // and popped as metadata rules are entered and left.
    precedence_stack: Vec<i32>,
}
/// Compute a token rule's implicit lexical precedence: string literals get
/// 1 (beating patterns at 0), and each enclosing "main token" metadata
/// wrapper adds 2.
fn get_implicit_precedence(rule: &Rule) -> i32 {
    match rule {
        Rule::String(_) => 1,
        Rule::Metadata { rule, params } => {
            let inner = get_implicit_precedence(rule);
            if params.is_main_token {
                inner + 2
            } else {
                inner
            }
        }
        _ => 0,
    }
}
/// The user-specified precedence on a rule's outermost `Metadata` wrapper,
/// defaulting to zero for all other rules.
fn get_completion_precedence(rule: &Rule) -> i32 {
    if let Rule::Metadata { params, .. } = rule {
        params.precedence.unwrap_or(0)
    } else {
        0
    }
}
/// Compile every token rule in the grammar into NFA states, producing a
/// `LexicalGrammar` whose variables record each token's start state and
/// implicit precedence.
///
/// Each variable's NFA is built in reverse, starting from its accepting
/// state; non-immediate tokens additionally get the separator rule
/// (whitespace etc.) prepended so separators may precede them.
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
    let mut builder = NfaBuilder {
        nfa: Nfa::new(),
        is_sep: true,
        precedence_stack: vec![0],
    };

    // If any separators exist, build a rule matching zero or more of them.
    // (`!is_empty()` replaces the non-idiomatic `len() > 0`.)
    let separator_rule = if !grammar.separators.is_empty() {
        grammar.separators.push(Rule::Blank);
        Rule::repeat(Rule::choice(grammar.separators))
    } else {
        Rule::Blank
    };

    let mut variables = Vec::new();
    for (i, variable) in grammar.variables.into_iter().enumerate() {
        let is_immediate_token = match &variable.rule {
            Rule::Metadata { params, .. } => params.is_main_token,
            _ => false,
        };

        // Push the accepting state first; the rule's states are expanded
        // backwards toward it.
        builder.is_sep = false;
        builder.nfa.states.push(NfaState::Accept {
            variable_index: i,
            precedence: get_completion_precedence(&variable.rule),
        });
        let last_state_id = builder.nfa.last_state_id();
        builder
            .expand_rule(&variable.rule, last_state_id)
            .map_err(|Error(msg)| Error(format!("Rule {} {}", variable.name, msg)))?;

        // Immediate tokens must not be preceded by separators.
        if !is_immediate_token {
            builder.is_sep = true;
            let last_state_id = builder.nfa.last_state_id();
            builder.expand_rule(&separator_rule, last_state_id)?;
        }

        variables.push(LexicalVariable {
            name: variable.name,
            kind: variable.kind,
            implicit_precedence: get_implicit_precedence(&variable.rule),
            start_state: builder.nfa.last_state_id(),
        });
    }

    Ok(LexicalGrammar {
        nfa: builder.nfa,
        variables,
    })
}
impl NfaBuilder {
    /// Expand `rule` into NFA states.
    ///
    /// The NFA is built in reverse: `next_state_id` is the state reached
    /// *after* the rule matches, and after this call
    /// `self.nfa.last_state_id()` is the rule's entry state. Returns
    /// `Ok(true)` if the rule added any states (i.e. can consume input),
    /// `Ok(false)` for blank rules, and an error for rule kinds that should
    /// not appear in a lexical grammar.
    fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
        match rule {
            Rule::Pattern(s) => {
                // Regex patterns are parsed with `regex_syntax` and
                // expanded from the resulting AST.
                let ast = parse::Parser::new()
                    .parse(&s)
                    .map_err(|e| Error(e.to_string()))?;
                self.expand_regex(&ast, next_state_id)
            }
            Rule::String(s) => {
                // Build the chain of character states in reverse order.
                for c in s.chars().rev() {
                    self.push_advance(CharacterSet::empty().add_char(c), next_state_id);
                    next_state_id = self.nfa.last_state_id();
                }
                Ok(s.len() > 0)
            }
            Rule::Choice(elements) => {
                // Expand each alternative, then chain `Split` states so
                // every alternative's entry state is reachable from the
                // final entry state.
                let mut alternative_state_ids = Vec::new();
                for element in elements {
                    if self.expand_rule(element, next_state_id)? {
                        alternative_state_ids.push(self.nfa.last_state_id());
                    } else {
                        // A blank alternative jumps straight to the
                        // successor.
                        alternative_state_ids.push(next_state_id);
                    }
                }
                alternative_state_ids.sort_unstable();
                alternative_state_ids.dedup();
                alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
                for alternative_state_id in alternative_state_ids {
                    self.push_split(alternative_state_id);
                }
                Ok(true)
            }
            Rule::Seq(elements) => {
                // Elements are expanded right-to-left, threading each
                // element's entry state in as the previous element's target.
                let mut result = false;
                for element in elements.into_iter().rev() {
                    if self.expand_rule(element, next_state_id)? {
                        result = true;
                    }
                    next_state_id = self.nfa.last_state_id();
                }
                Ok(result)
            }
            Rule::Repeat(rule) => {
                // Reserve a state now so the loop body can target it; it is
                // overwritten with a `Split` once the body is expanded.
                // NOTE(review): if the inner rule is blank, this placeholder
                // `Accept { variable_index: 0 }` is left in the NFA — looks
                // unintended; confirm.
                self.nfa.states.push(NfaState::Accept {
                    variable_index: 0,
                    precedence: 0,
                }); // Placeholder for split
                let split_state_id = self.nfa.last_state_id();
                if self.expand_rule(rule, split_state_id)? {
                    self.nfa.states[split_state_id as usize] =
                        NfaState::Split(self.nfa.last_state_id(), next_state_id);
                    Ok(true)
                } else {
                    Ok(false)
                }
            }
            Rule::Metadata { rule, params } => {
                // Precedence from metadata applies to all states created
                // while the inner rule is being expanded.
                if let Some(precedence) = params.precedence {
                    self.precedence_stack.push(precedence);
                }
                let result = self.expand_rule(rule, next_state_id);
                if params.precedence.is_some() {
                    self.precedence_stack.pop();
                }
                result
            }
            Rule::Blank => Ok(false),
            _ => Err(Error::grammar(&format!("Unexpected rule {:?}", rule))),
        }
    }
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::Include(vec![literal.c]), next_state_id);
Ok(true)
}
Ast::Dot(_) => {
self.push_advance(CharacterSet::Exclude(vec!['\n']), next_state_id);
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(_) => {
Err(Error::regex("Unicode character classes are not supported"))
}
Class::Perl(class) => {
let mut chars = self.expand_perl_character_class(&class.kind);
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
)),
},
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
self.expand_count(&repetition.ast, min, next_state_id)
} else {
Ok(false)
}
}
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
for _ in min..max {
if result {
next_state_id = self.nfa.last_state_id();
}
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
result = true;
}
}
Ok(result)
}
},
Ast::Group(group) => self.expand_regex(&group.ast, self.nfa.last_state_id()),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if self.expand_regex(&ast, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id);
}
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(&ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
}
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_regex(&ast, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
self.nfa.states.pop();
Ok(false)
}
}
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_regex(ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if self.expand_regex(ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::Include(Vec::new())),
ClassSetItem::Literal(literal) => Ok(CharacterSet::Include(vec![literal.c])),
ClassSetItem::Range(range) => {
Ok(CharacterSet::empty().add_range(range.start.c, range.end.c))
}
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(&item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
_ => Err(Error::regex(&format!(
"Unsupported character class syntax {:?}",
item
))),
}
}
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::empty().add_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
}
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
let precedence = *self.precedence_stack.last().unwrap();
self.nfa.states.push(NfaState::Advance {
chars,
state_id,
precedence,
is_sep: self.is_sep,
});
}
fn push_split(&mut self, state_id: u32) {
let last_state_id = self.nfa.last_state_id();
self.nfa
.states
.push(NfaState::Split(state_id, last_state_id));
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::Variable;
    use crate::nfa::{NfaCursor, NfaTransition};
    // Runs the lexical grammar's NFA against the start of `s`, returning the
    // index of the matched variable and the matched text (with leading
    // separator characters stripped), or `None` if nothing matched. Later
    // and higher-precedence completions replace earlier ones.
    fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
        let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
        let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
        let mut result = None;
        let mut result_precedence = i32::MIN;
        // NOTE(review): these advance by 1 per `char`, but index `s` as byte
        // offsets — assumes the test inputs are ASCII.
        let mut start_char = 0;
        let mut end_char = 0;
        for c in s.chars() {
            // Record any completions available before consuming `c`.
            for (id, precedence) in cursor.completions() {
                if result.is_none() || result_precedence <= precedence {
                    result = Some((id, &s[start_char..end_char]));
                    result_precedence = precedence;
                }
            }
            if let Some(NfaTransition {
                states,
                is_separator,
                ..
            }) = cursor
                .transitions()
                .into_iter()
                .find(|t| t.characters.contains(c) && t.precedence >= result_precedence)
            {
                cursor.reset(states);
                end_char += 1;
                // Separator characters are not part of the token's text.
                if is_separator {
                    start_char = end_char;
                }
            } else {
                break;
            }
        }
        // Also check for completions once the input is exhausted.
        for (id, precedence) in cursor.completions() {
            if result.is_none() || result_precedence <= precedence {
                result = Some((id, &s[start_char..end_char]));
                result_precedence = precedence;
            }
        }
        result
    }
    #[test]
    fn test_rule_expansion() {
        // Each row builds a small lexical grammar and checks what the
        // resulting NFA matches at the start of each example string.
        struct Row {
            rules: Vec<Rule>,
            separators: Vec<Rule>,
            examples: Vec<(&'static str, Option<(usize, &'static str)>)>,
        }
        let table = [
            // regex with sequences and alternatives
            Row {
                rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
                separators: vec![],
                examples: vec![
                    ("ade1", Some((0, "ade"))),
                    ("bdf1", Some((0, "bdf"))),
                    ("bdfh1", Some((0, "bdfh"))),
                    ("ad1", None),
                ],
            },
            // regex with repeats
            Row {
                rules: vec![Rule::pattern("a*")],
                separators: vec![],
                examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
            },
            // regex with repeats in sequences
            Row {
                rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
                separators: vec![],
                examples: vec![
                    ("af1", Some((0, "af"))),
                    ("adedef1", Some((0, "adedef"))),
                    ("abcbcbcf1", Some((0, "abcbcbcf"))),
                    ("a", None),
                ],
            },
            // regex with character ranges
            Row {
                rules: vec![Rule::pattern("[a-fA-F0-9]+")],
                separators: vec![],
                examples: vec![("A1ff0.", Some((0, "A1ff0")))],
            },
            // regex with perl character classes
            Row {
                rules: vec![Rule::pattern("\\w\\d\\s")],
                separators: vec![],
                examples: vec![("_0 ", Some((0, "_0 ")))],
            },
            // string
            Row {
                rules: vec![Rule::string("abc")],
                separators: vec![],
                examples: vec![("abcd", Some((0, "abc"))), ("ab", None)],
            },
            // complex rule containing strings and regexes
            Row {
                rules: vec![Rule::repeat(Rule::seq(vec![
                    Rule::string("{"),
                    Rule::pattern("[a-f]+"),
                    Rule::string("}"),
                ]))],
                separators: vec![],
                examples: vec![
                    ("{a}{", Some((0, "{a}"))),
                    ("{a}{d", Some((0, "{a}"))),
                    ("ab", None),
                ],
            },
            // longest match rule
            Row {
                rules: vec![
                    Rule::pattern("a|bc"),
                    Rule::pattern("aa"),
                    Rule::pattern("bcd"),
                ],
                separators: vec![],
                examples: vec![
                    ("a.", Some((0, "a"))),
                    ("bc.", Some((0, "bc"))),
                    ("aa.", Some((1, "aa"))),
                    ("bcd?", Some((2, "bcd"))),
                    ("b.", None),
                    ("c.", None),
                ],
            },
            // regex with an alternative including the empty string
            Row {
                rules: vec![Rule::pattern("a(b|)+c")],
                separators: vec![],
                examples: vec![
                    ("ac.", Some((0, "ac"))),
                    ("abc.", Some((0, "abc"))),
                    ("abbc.", Some((0, "abbc"))),
                ],
            },
            // separators
            Row {
                rules: vec![Rule::pattern("[a-f]+")],
                separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
                examples: vec![
                    ("  a", Some((0, "a"))),
                    ("  \nb", Some((0, "b"))),
                    ("  \\a", None),
                    ("  \\\na", Some((0, "a"))),
                ],
            },
            // shorter tokens with higher precedence
            Row {
                rules: vec![
                    Rule::prec(2, Rule::pattern("abc")),
                    Rule::prec(1, Rule::pattern("ab[cd]e")),
                    Rule::pattern("[a-e]+"),
                ],
                separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
                examples: vec![
                    ("abceef", Some((0, "abc"))),
                    ("abdeef", Some((1, "abde"))),
                    ("aeeeef", Some((2, "aeeee"))),
                ],
            },
            // immediate tokens with higher precedence
            Row {
                rules: vec![
                    Rule::prec(1, Rule::pattern("[^a]+")),
                    Rule::immediate_token(Rule::prec(2, Rule::pattern("[^ab]+"))),
                ],
                separators: vec![Rule::pattern("\\s")],
                examples: vec![("cccb", Some((1, "ccc")))],
            },
            Row {
                rules: vec![Rule::seq(vec![
                    Rule::string("a"),
                    Rule::choice(vec![Rule::string("b"), Rule::string("c")]),
                    Rule::string("d"),
                ])],
                separators: vec![],
                examples: vec![
                    ("abd", Some((0, "abd"))),
                    ("acd", Some((0, "acd"))),
                    ("abc", None),
                    ("ad", None),
                    ("d", None),
                    ("a", None),
                ],
            },
            // nested choices within sequences
            Row {
                rules: vec![Rule::seq(vec![
                    Rule::pattern("[0-9]+"),
                    Rule::choice(vec![
                        Rule::Blank,
                        Rule::choice(vec![Rule::seq(vec![
                            Rule::choice(vec![Rule::string("e"), Rule::string("E")]),
                            Rule::choice(vec![
                                Rule::Blank,
                                Rule::choice(vec![Rule::string("+"), Rule::string("-")]),
                            ]),
                            Rule::pattern("[0-9]+"),
                        ])]),
                    ]),
                ])],
                separators: vec![],
                examples: vec![
                    ("12", Some((0, "12"))),
                    ("12e", Some((0, "12"))),
                    ("12g", Some((0, "12"))),
                    ("12e3", Some((0, "12e3"))),
                    ("12e+", Some((0, "12"))),
                    ("12E+34 +", Some((0, "12E+34"))),
                    ("12e34", Some((0, "12e34"))),
                ],
            },
        ];
        for Row {
            rules,
            separators,
            examples,
        } in &table
        {
            let grammar = expand_tokens(ExtractedLexicalGrammar {
                separators: separators.clone(),
                variables: rules
                    .into_iter()
                    .map(|rule| Variable::named("", rule.clone()))
                    .collect(),
            })
            .unwrap();
            for (haystack, needle) in examples.iter() {
                assert_eq!(simulate_nfa(&grammar, haystack), *needle);
            }
        }
    }
}

View file

@ -0,0 +1,199 @@
use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
// Tracks, for one grammar symbol, whether it is aliased the same way at
// *every* usage site (a "simple" alias) or not.
#[derive(Clone, Default)]
struct SymbolStatus {
    // The single alias seen so far, if any.
    alias: Option<Alias>,
    // Set once two usage sites disagree, or one site has no alias at all.
    conflicting: bool,
}
/// Finds symbols that are aliased to the same name at *every* usage site,
/// strips those aliases from the individual production steps, and returns
/// them as a map from symbol to its alias.
pub(super) fn extract_simple_aliases(
    syntax_grammar: &mut SyntaxGrammar,
    lexical_grammar: &LexicalGrammar
) -> AliasMap {
    // Determine which symbols in the grammars are *always* aliased to a single name.
    let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
    let mut non_terminal_status_list = vec![SymbolStatus::default(); syntax_grammar.variables.len()];
    let mut external_status_list = vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];
    for variable in syntax_grammar.variables.iter() {
        for production in variable.productions.iter() {
            for step in production.steps.iter() {
                // Fix: the binding itself is never reassigned, so `mut` on it
                // was unnecessary (mutation happens through the reference).
                let status = match step.symbol {
                    Symbol { kind: SymbolType::External, index} => &mut external_status_list[index],
                    Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index],
                    Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index],
                    Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
                };
                // A usage site with no alias disqualifies the symbol.
                if step.alias.is_none() {
                    status.alias = None;
                    status.conflicting = true;
                }
                if !status.conflicting {
                    if status.alias.is_none() {
                        status.alias = step.alias.clone();
                    } else if status.alias != step.alias {
                        // Two usage sites disagree; not a simple alias.
                        status.alias = None;
                        status.conflicting = true;
                    }
                }
            }
        }
    }
    // Remove the aliases for those symbols.
    for variable in syntax_grammar.variables.iter_mut() {
        for production in variable.productions.iter_mut() {
            for step in production.steps.iter_mut() {
                let status = match step.symbol {
                    Symbol { kind: SymbolType::External, index} => &external_status_list[index],
                    Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index],
                    Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index],
                    Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
                };
                if status.alias.is_some() {
                    step.alias = None;
                }
            }
        }
    }
    // Populate a map of the symbols to their aliases.
    let mut result = AliasMap::new();
    for (i, status) in terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::terminal(i), alias);
        }
    }
    for (i, status) in non_terminal_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::non_terminal(i), alias);
        }
    }
    for (i, status) in external_status_list.into_iter().enumerate() {
        if let Some(alias) = status.alias {
            result.insert(Symbol::external(i), alias);
        }
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::{LexicalVariable, SyntaxVariable, VariableType, Production, ProductionStep};
    use crate::nfa::Nfa;
    #[test]
    fn test_extract_simple_aliases() {
        let mut syntax_grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "v1".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![
                                ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                                ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                                ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                            ],
                        },
                    ],
                },
                SyntaxVariable {
                    name: "v2".to_owned(),
                    kind: VariableType::Named,
                    productions: vec![
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![
                                // Token 0 is always aliased as "a1".
                                ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
                                // Token 1 is aliased above, but not here.
                                ProductionStep::new(Symbol::terminal(1)),
                                // Token 2 is aliased differently than above.
                                ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                            ],
                        },
                    ],
                },
            ],
            extra_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        // Only the *number* of lexical variables matters to
        // `extract_simple_aliases`; their contents are unused.
        let lexical_grammar = LexicalGrammar {
            nfa: Nfa::new(),
            variables: vec![
                LexicalVariable {
                    name: "t1".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t2".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                },
                LexicalVariable {
                    name: "t3".to_string(),
                    kind: VariableType::Anonymous,
                    implicit_precedence: 0,
                    start_state: 0,
                }
            ],
        };
        let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
        // Only token 0 is aliased consistently everywhere.
        assert_eq!(simple_aliases.len(), 1);
        assert_eq!(simple_aliases[&Symbol::terminal(0)], Alias {
            value: "a1".to_string(),
            is_named: true,
        });
        assert_eq!(syntax_grammar.variables, vec![
            SyntaxVariable {
                name: "v1".to_owned(),
                kind: VariableType::Named,
                productions: vec![
                    Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // 'Simple' alias removed
                            ProductionStep::new(Symbol::terminal(0)),
                            // Other aliases unchanged
                            ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
                        ],
                    },
                ],
            },
            SyntaxVariable {
                name: "v2".to_owned(),
                kind: VariableType::Named,
                productions: vec![
                    Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(0)),
                            ProductionStep::new(Symbol::terminal(1)),
                            ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true),
                        ],
                    },
                ],
            },
        ]);
    }
}

View file

@ -0,0 +1,525 @@
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::error::{Error, Result};
use crate::grammars::{ExternalToken, Variable, VariableType};
use crate::rules::{MetadataParams, Rule, Symbol, SymbolType};
use hashbrown::HashMap;
use std::mem;
/// Splits an interned grammar into a syntax grammar and a lexical grammar
/// by extracting all token-like rules (strings, patterns, and explicit
/// `token(...)` rules) into lexical variables and rewriting the remaining
/// rules' symbols accordingly.
pub(super) fn extract_tokens(
    mut grammar: InternedGrammar,
) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> {
    let mut extractor = TokenExtractor {
        current_variable_name: String::new(),
        current_variable_token_count: 0,
        extracted_variables: Vec::new(),
        extracted_usage_counts: Vec::new(),
    };
    for mut variable in grammar.variables.iter_mut() {
        extractor.extract_tokens_in_variable(&mut variable);
    }
    for mut variable in grammar.external_tokens.iter_mut() {
        extractor.extract_tokens_in_variable(&mut variable);
    }
    let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len());
    for variable in extractor.extracted_variables {
        lexical_variables.push(Variable {
            name: variable.name,
            kind: variable.kind,
            rule: variable.rule,
        });
    }
    // If a variable's entire rule was extracted as a token and that token didn't
    // appear within any other rule, then remove that variable from the syntax
    // grammar, giving its name to the token in the lexical grammar. Any symbols
    // that pointed to that variable will need to be updated to point to the
    // variable in the lexical grammar. Symbols that pointed to later variables
    // will need to have their indices decremented.
    let mut variables = Vec::new();
    let mut symbol_replacer = SymbolReplacer {
        replacements: HashMap::new(),
    };
    for (i, variable) in grammar.variables.into_iter().enumerate() {
        if let Rule::Symbol(Symbol {
            kind: SymbolType::Terminal,
            index,
        }) = variable.rule
        {
            // Variable 0 is never removed (see `test_start_rule_is_token`):
            // the grammar's first rule stays in the syntax grammar.
            if i > 0 && extractor.extracted_usage_counts[index] == 1 {
                let mut lexical_variable = &mut lexical_variables[index];
                lexical_variable.kind = variable.kind;
                lexical_variable.name = variable.name;
                symbol_replacer.replacements.insert(i, index);
                continue;
            }
        }
        variables.push(variable);
    }
    for variable in variables.iter_mut() {
        variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule);
    }
    // Rewrite the symbols in each expected-conflict list, then deduplicate
    // the list.
    let expected_conflicts = grammar
        .expected_conflicts
        .into_iter()
        .map(|conflict| {
            let mut result: Vec<_> = conflict
                .iter()
                .map(|symbol| symbol_replacer.replace_symbol(*symbol))
                .collect();
            result.sort_unstable();
            result.dedup();
            result
        })
        .collect();
    let variables_to_inline = grammar
        .variables_to_inline
        .into_iter()
        .map(|symbol| symbol_replacer.replace_symbol(symbol))
        .collect();
    // Symbol-valued extras become `extra_tokens`; token rules that match an
    // extracted lexical variable are resolved to that terminal; any other
    // rules become whitespace-like separators for the lexical grammar.
    let mut separators = Vec::new();
    let mut extra_tokens = Vec::new();
    for rule in grammar.extra_tokens {
        if let Rule::Symbol(symbol) = rule {
            let new_symbol = symbol_replacer.replace_symbol(symbol);
            if new_symbol.is_non_terminal() {
                return Err(Error(format!(
                    "Non-token symbol '{}' cannot be used as an extra token",
                    &variables[new_symbol.index].name
                )));
            } else {
                extra_tokens.push(new_symbol);
            }
        } else {
            if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
                extra_tokens.push(Symbol::terminal(index));
            } else {
                separators.push(rule);
            }
        }
    }
    // External tokens must resolve to symbols; associate each with its
    // corresponding internal token where one exists.
    let mut external_tokens = Vec::new();
    for external_token in grammar.external_tokens {
        let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule);
        if let Rule::Symbol(symbol) = rule {
            if symbol.is_non_terminal() {
                return Err(Error(format!(
                    "Rule '{}' cannot be used as both an external token and a non-terminal rule",
                    &variables[symbol.index].name,
                )));
            }
            if symbol.is_external() {
                external_tokens.push(ExternalToken {
                    name: external_token.name,
                    kind: external_token.kind,
                    corresponding_internal_token: None,
                })
            } else {
                external_tokens.push(ExternalToken {
                    name: lexical_variables[symbol.index].name.clone(),
                    kind: external_token.kind,
                    corresponding_internal_token: Some(symbol),
                })
            }
        } else {
            return Err(Error(format!(
                "Non-symbol rules cannot be used as external tokens"
            )));
        }
    }
    // The word token, if configured, must be a token after replacement.
    let mut word_token = None;
    if let Some(token) = grammar.word_token {
        let token = symbol_replacer.replace_symbol(token);
        if token.is_non_terminal() {
            return Err(Error(format!(
                "Non-terminal symbol '{}' cannot be used as the word token",
                &variables[token.index].name
            )));
        }
        word_token = Some(token);
    }
    Ok((
        ExtractedSyntaxGrammar {
            variables,
            expected_conflicts,
            extra_tokens,
            variables_to_inline,
            external_tokens,
            word_token,
        },
        ExtractedLexicalGrammar {
            variables: lexical_variables,
            separators,
        },
    ))
}
// Walks rules, pulling terminal sub-rules (strings, patterns, and explicit
// `token(...)` rules) out into a separate list of lexical variables.
struct TokenExtractor {
    // Name of the syntax variable currently being processed; used to name
    // auxiliary tokens like `<name>_token<N>`.
    current_variable_name: String,
    // Number of auxiliary tokens extracted from the current variable so far.
    current_variable_token_count: usize,
    // The lexical variables created, one per distinct extracted rule.
    extracted_variables: Vec<Variable>,
    // Parallel to `extracted_variables`: how many times each extracted rule
    // has been encountered.
    extracted_usage_counts: Vec<usize>,
}
// Rewrites symbols after some variables have been moved out of the syntax
// grammar: removed non-terminals become terminals, and the indices of the
// remaining non-terminals are shifted down.
struct SymbolReplacer {
    // Map from removed syntax-variable index to its lexical-variable index.
    replacements: HashMap<usize, usize>,
}
impl TokenExtractor {
    /// Rewrites `variable`'s rule in place, replacing each token-like
    /// sub-rule with a terminal `Symbol` and recording the extracted rule.
    fn extract_tokens_in_variable(&mut self, variable: &mut Variable) {
        // Remember the variable's name so auxiliary tokens extracted from it
        // can be named `<name>_token<N>`.
        self.current_variable_name.clear();
        self.current_variable_name.push_str(&variable.name);
        self.current_variable_token_count = 0;
        // Move the rule out temporarily so it can be read while its
        // replacement is being built.
        let mut rule = Rule::Blank;
        mem::swap(&mut rule, &mut variable.rule);
        variable.rule = self.extract_tokens_in_rule(&rule);
    }
    /// Returns a copy of `input` in which every string, pattern, and
    /// `token(...)`-wrapped rule has been replaced by a terminal symbol.
    fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule {
        match input {
            Rule::String(name) => self.extract_token(input, Some(name)).into(),
            Rule::Pattern(..) => self.extract_token(input, None).into(),
            Rule::Metadata { params, rule } => {
                if params.is_token {
                    // Absorb the `token(...)` wrapper: clear the flag, and if
                    // no other metadata remains, extract the bare inner rule.
                    let mut params = params.clone();
                    params.is_token = false;
                    let mut string_value = None;
                    if let Rule::String(value) = rule.as_ref() {
                        string_value = Some(value);
                    }
                    let rule_to_extract = if params == MetadataParams::default() {
                        rule.as_ref()
                    } else {
                        input
                    };
                    self.extract_token(rule_to_extract, string_value).into()
                } else {
                    Rule::Metadata {
                        params: params.clone(),
                        // Fix: pass the reference directly; the previous
                        // `(&rule).clone()` merely copied a `&Box<Rule>` and
                        // relied on the same deref coercion.
                        rule: Box::new(self.extract_tokens_in_rule(rule)),
                    }
                }
            }
            Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))),
            Rule::Seq(elements) => Rule::Seq(
                elements
                    .iter()
                    .map(|e| self.extract_tokens_in_rule(e))
                    .collect(),
            ),
            Rule::Choice(elements) => Rule::Choice(
                elements
                    .iter()
                    .map(|e| self.extract_tokens_in_rule(e))
                    .collect(),
            ),
            // Symbols, blanks, etc. pass through unchanged.
            _ => input.clone(),
        }
    }
    /// Returns the terminal symbol for `rule`, reusing an existing extracted
    /// variable when an identical rule has been seen before.
    fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol {
        for (i, variable) in self.extracted_variables.iter_mut().enumerate() {
            if variable.rule == *rule {
                self.extracted_usage_counts[i] += 1;
                return Symbol::terminal(i);
            }
        }
        let index = self.extracted_variables.len();
        let variable = if let Some(string_value) = string_value {
            // String tokens are anonymous and named after their content.
            Variable {
                name: string_value.clone(),
                kind: VariableType::Anonymous,
                rule: rule.clone()
            }
        } else {
            // Other tokens get a generated auxiliary name.
            self.current_variable_token_count += 1;
            Variable {
                name: format!(
                    "{}_token{}",
                    &self.current_variable_name, self.current_variable_token_count
                ),
                kind: VariableType::Auxiliary,
                rule: rule.clone(),
            }
        };
        self.extracted_variables.push(variable);
        self.extracted_usage_counts.push(1);
        Symbol::terminal(index)
    }
}
impl SymbolReplacer {
    /// Returns a copy of `rule` with every symbol rewritten by
    /// `replace_symbol`.
    fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule {
        match rule {
            Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(),
            Rule::Choice(members) => {
                let replaced = members
                    .iter()
                    .map(|member| self.replace_symbols_in_rule(member))
                    .collect();
                Rule::Choice(replaced)
            }
            Rule::Seq(members) => {
                let replaced = members
                    .iter()
                    .map(|member| self.replace_symbols_in_rule(member))
                    .collect();
                Rule::Seq(replaced)
            }
            Rule::Repeat(inner) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(inner))),
            Rule::Metadata { rule, params } => Rule::Metadata {
                params: params.clone(),
                rule: Box::new(self.replace_symbols_in_rule(rule)),
            },
            _ => rule.clone(),
        }
    }
    /// Maps a non-terminal symbol to its post-extraction identity: either
    /// the terminal that replaced it, or the same non-terminal with its
    /// index shifted down past all of the removed variables.
    fn replace_symbol(&self, symbol: Symbol) -> Symbol {
        if !symbol.is_non_terminal() {
            return symbol;
        }
        if let Some(replacement) = self.replacements.get(&symbol.index) {
            return Symbol::terminal(*replacement);
        }
        let removed_before = self
            .replacements
            .keys()
            .filter(|index| **index < symbol.index)
            .count();
        Symbol::non_terminal(symbol.index - removed_before)
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::grammars::VariableType;
    #[test]
    fn test_extraction() {
        let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![
            Variable::named(
                "rule_0",
                Rule::repeat(Rule::seq(vec![
                    Rule::string("a"),
                    Rule::pattern("b"),
                    Rule::choice(vec![
                        Rule::non_terminal(1),
                        Rule::non_terminal(2),
                        Rule::token(Rule::repeat(Rule::choice(vec![
                            Rule::string("c"),
                            Rule::string("d"),
                        ]))),
                    ]),
                ])),
            ),
            Variable::named("rule_1", Rule::pattern("e")),
            Variable::named("rule_2", Rule::pattern("b")),
            Variable::named(
                "rule_3",
                Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]),
            ),
        ]))
        .unwrap();
        assert_eq!(
            syntax_grammar.variables,
            vec![
                Variable::named(
                    "rule_0",
                    Rule::repeat(Rule::seq(vec![
                        // The string "a" was replaced by a symbol referencing the lexical grammar
                        Rule::terminal(0),
                        // The pattern "b" was replaced by a symbol referencing the lexical grammar
                        Rule::terminal(1),
                        Rule::choice(vec![
                            // The symbol referencing `rule_1` was replaced by a symbol referencing
                            // the lexical grammar.
                            Rule::terminal(3),
                            // The symbol referencing `rule_2` had its index decremented because
                            // `rule_1` was moved to the lexical grammar.
                            Rule::non_terminal(1),
                            // The rule wrapped in `token` was replaced by a symbol referencing
                            // the lexical grammar.
                            Rule::terminal(2),
                        ])
                    ]))
                ),
                // The pattern "e" was only used in once place: as the definition of `rule_1`,
                // so that rule was moved to the lexical grammar. The pattern "b" appeared in
                // two places, so it was not moved into the lexical grammar.
                Variable::named("rule_2", Rule::terminal(1)),
                Variable::named(
                    "rule_3",
                    Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,])
                ),
            ]
        );
        assert_eq!(
            lexical_grammar.variables,
            vec![
                Variable::anonymous("a", Rule::string("a")),
                Variable::auxiliary("rule_0_token1", Rule::pattern("b")),
                Variable::auxiliary(
                    "rule_0_token2",
                    Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),]))
                ),
                Variable::named("rule_1", Rule::pattern("e")),
            ]
        );
    }
    // The grammar's first variable keeps its place in the syntax grammar even
    // when its entire rule is a token.
    #[test]
    fn test_start_rule_is_token() {
        let (syntax_grammar, lexical_grammar) =
            extract_tokens(build_grammar(vec![Variable::named(
                "rule_0",
                Rule::string("hello"),
            )]))
            .unwrap();
        assert_eq!(
            syntax_grammar.variables,
            vec![Variable::named("rule_0", Rule::terminal(0)),]
        );
        assert_eq!(
            lexical_grammar.variables,
            vec![Variable::anonymous("hello", Rule::string("hello")),]
        )
    }
    // Symbol-valued extras become `extra_tokens`; other extra rules become
    // separators in the lexical grammar.
    #[test]
    fn test_extracting_extra_tokens() {
        let mut grammar = build_grammar(vec![
            Variable::named("rule_0", Rule::string("x")),
            Variable::named("comment", Rule::pattern("//.*")),
        ]);
        grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
        let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
        assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
        assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
    }
    #[test]
    fn test_extract_externals() {
        let mut grammar = build_grammar(vec![
            Variable::named(
                "rule_0",
                Rule::seq(vec![
                    Rule::external(0),
                    Rule::string("a"),
                    Rule::non_terminal(1),
                    Rule::non_terminal(2),
                ]),
            ),
            Variable::named("rule_1", Rule::string("b")),
            Variable::named("rule_2", Rule::string("c")),
        ]);
        grammar.external_tokens = vec![
            Variable::named("external_0", Rule::external(0)),
            Variable::anonymous("a", Rule::string("a")),
            Variable::named("rule_2", Rule::non_terminal(2)),
        ];
        let (syntax_grammar, _) = extract_tokens(grammar).unwrap();
        assert_eq!(
            syntax_grammar.external_tokens,
            vec![
                ExternalToken {
                    name: "external_0".to_string(),
                    kind: VariableType::Named,
                    corresponding_internal_token: None,
                },
                ExternalToken {
                    name: "a".to_string(),
                    kind: VariableType::Anonymous,
                    corresponding_internal_token: Some(Symbol::terminal(0)),
                },
                ExternalToken {
                    name: "rule_2".to_string(),
                    kind: VariableType::Named,
                    corresponding_internal_token: Some(Symbol::terminal(2)),
                },
            ]
        );
    }
    #[test]
    fn test_error_on_non_terminal_symbol_extras() {
        let mut grammar = build_grammar(vec![
            Variable::named("rule_0", Rule::non_terminal(1)),
            Variable::named("rule_1", Rule::non_terminal(2)),
            Variable::named("rule_2", Rule::string("x")),
        ]);
        grammar.extra_tokens = vec![Rule::non_terminal(1)];
        match extract_tokens(grammar) {
            Err(Error(s)) => {
                assert_eq!(
                    s,
                    "Non-token symbol 'rule_1' cannot be used as an extra token"
                );
            }
            _ => {
                panic!("Expected an error but got no error");
            }
        }
    }
    #[test]
    fn test_error_on_external_with_same_name_as_non_terminal() {
        let mut grammar = build_grammar(vec![
            Variable::named(
                "rule_0",
                Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
            ),
            Variable::named(
                "rule_1",
                Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]),
            ),
            Variable::named("rule_2", Rule::string("a")),
        ]);
        grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))];
        match extract_tokens(grammar) {
            Err(Error(s)) => {
                assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule");
            }
            _ => {
                panic!("Expected an error but got no error");
            }
        }
    }
    // Helper: wraps a list of variables in an otherwise-empty grammar.
    fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
        InternedGrammar {
            variables,
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            word_token: None,
        }
    }
}

View file

@ -0,0 +1,313 @@
use super::ExtractedSyntaxGrammar;
use crate::error::Result;
use crate::grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable};
use crate::rules::{Alias, Associativity, Rule};
// Builds a single flat `Production` from a choice-free rule, tracking the
// stacks of precedence, associativity, and alias annotations in effect for
// each symbol encountered.
struct RuleFlattener {
    production: Production,
    // Innermost annotation is at the top of each stack.
    precedence_stack: Vec<i32>,
    associativity_stack: Vec<Associativity>,
    alias_stack: Vec<Alias>,
}
impl RuleFlattener {
    fn new() -> Self {
        Self {
            production: Production {
                steps: Vec::new(),
                dynamic_precedence: 0,
            },
            precedence_stack: Vec::new(),
            associativity_stack: Vec::new(),
            alias_stack: Vec::new(),
        }
    }
    /// Consumes the flattener, converting a choice-free rule into a single
    /// flat production.
    fn flatten(mut self, rule: Rule) -> Production {
        self.apply(rule, true);
        self.production
    }
    /// Recursively appends `rule`'s symbols to the production. `at_end` is
    /// true when `rule` extends to the very end of the production.
    fn apply(&mut self, rule: Rule, at_end: bool) {
        match rule {
            Rule::Seq(members) => {
                let last_index = members.len() - 1;
                for (i, member) in members.into_iter().enumerate() {
                    self.apply(member, i == last_index && at_end);
                }
            }
            Rule::Metadata { rule, params } => {
                // Push whichever annotations this wrapper introduces, then
                // pop them again after visiting the wrapped rule.
                let mut has_precedence = false;
                if let Some(precedence) = params.precedence {
                    has_precedence = true;
                    self.precedence_stack.push(precedence);
                }
                let mut has_associativity = false;
                if let Some(associativity) = params.associativity {
                    has_associativity = true;
                    self.associativity_stack.push(associativity);
                }
                let mut has_alias = false;
                if let Some(alias) = params.alias {
                    has_alias = true;
                    self.alias_stack.push(alias);
                }
                // The production keeps the dynamic precedence with the
                // largest magnitude found anywhere in the rule.
                if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() {
                    self.production.dynamic_precedence = params.dynamic_precedence;
                }
                self.apply(*rule, at_end);
                if has_precedence {
                    self.precedence_stack.pop();
                    // Unless the wrapped region reaches the end of the
                    // production, its final step reverts to the enclosing
                    // precedence — presumably because the annotation is
                    // meant to bind *between* the symbols it covers.
                    if !at_end {
                        self.production.steps.last_mut().unwrap().precedence =
                            self.precedence_stack.last().cloned().unwrap_or(0);
                    }
                }
                if has_associativity {
                    self.associativity_stack.pop();
                    // Same between-symbols convention as precedence above.
                    if !at_end {
                        self.production.steps.last_mut().unwrap().associativity =
                            self.associativity_stack.last().cloned();
                    }
                }
                if has_alias {
                    self.alias_stack.pop();
                }
            }
            Rule::Symbol(symbol) => {
                // Record the symbol along with whichever annotations are
                // currently in effect (innermost wins).
                self.production.steps.push(ProductionStep {
                    symbol,
                    precedence: self.precedence_stack.last().cloned().unwrap_or(0),
                    associativity: self.associativity_stack.last().cloned(),
                    alias: self.alias_stack.last().cloned(),
                });
            }
            // Blanks and any other rule types contribute no steps.
            _ => (),
        }
    }
}
/// Expands a rule into the list of choice-free alternatives it denotes, by
/// distributing every `Choice` over the surrounding sequences and metadata
/// wrappers.
fn extract_choices(rule: Rule) -> Vec<Rule> {
    match rule {
        Rule::Seq(elements) => {
            // Cartesian product: combine each alternative of every element
            // with every combination accumulated so far.
            elements
                .into_iter()
                .fold(vec![Rule::Blank], |prefixes, element| {
                    let alternatives = extract_choices(element);
                    let mut combined = Vec::new();
                    for prefix in prefixes {
                        for alternative in alternatives.iter() {
                            combined
                                .push(Rule::Seq(vec![prefix.clone(), alternative.clone()]));
                        }
                    }
                    combined
                })
        }
        Rule::Choice(elements) => elements.into_iter().flat_map(extract_choices).collect(),
        Rule::Metadata { rule, params } => extract_choices(*rule)
            .into_iter()
            .map(|inner| Rule::Metadata {
                rule: Box::new(inner),
                params: params.clone(),
            })
            .collect(),
        _ => vec![rule],
    }
}
fn flatten_variable(variable: Variable) -> Result<SyntaxVariable> {
let mut productions = Vec::new();
for rule in extract_choices(variable.rule) {
let production = RuleFlattener::new().flatten(rule);
if !productions.contains(&production) {
productions.push(production);
}
}
Ok(SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
})
}
pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
let mut variables = Vec::new();
for variable in grammar.variables {
variables.push(flatten_variable(variable)?);
}
Ok(SyntaxGrammar {
extra_tokens: grammar.extra_tokens,
expected_conflicts: grammar.expected_conflicts,
variables_to_inline: grammar.variables_to_inline,
external_tokens: grammar.external_tokens,
word_token: grammar.word_token,
variables,
})
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::VariableType;
    use crate::rules::Symbol;

    #[test]
    fn test_flatten_grammar() {
        // A rule containing nested precedence annotations and a choice in the
        // middle should flatten into one production per choice branch, with
        // each precedence applied exactly to the steps it encloses.
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::seq(vec![
                Rule::non_terminal(1),
                Rule::prec_left(
                    101,
                    Rule::seq(vec![
                        Rule::non_terminal(2),
                        Rule::choice(vec![
                            Rule::prec_right(
                                102,
                                Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
                            ),
                            Rule::non_terminal(5),
                        ]),
                        Rule::non_terminal(6),
                    ]),
                ),
                Rule::non_terminal(7),
            ]),
        })
        .unwrap();
        assert_eq!(
            result.productions,
            vec![
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::non_terminal(1)),
                        ProductionStep::new(Symbol::non_terminal(2))
                            .with_prec(101, Some(Associativity::Left)),
                        // Step 3 is the last step of the *inner* prec_right
                        // scope, so it keeps precedence 102; step 4 closes
                        // that scope and reverts to the outer precedence.
                        ProductionStep::new(Symbol::non_terminal(3))
                            .with_prec(102, Some(Associativity::Right)),
                        ProductionStep::new(Symbol::non_terminal(4))
                            .with_prec(101, Some(Associativity::Left)),
                        ProductionStep::new(Symbol::non_terminal(6)),
                        ProductionStep::new(Symbol::non_terminal(7)),
                    ]
                },
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::non_terminal(1)),
                        ProductionStep::new(Symbol::non_terminal(2))
                            .with_prec(101, Some(Associativity::Left)),
                        ProductionStep::new(Symbol::non_terminal(5))
                            .with_prec(101, Some(Associativity::Left)),
                        ProductionStep::new(Symbol::non_terminal(6)),
                        ProductionStep::new(Symbol::non_terminal(7)),
                    ]
                },
            ]
        );
    }

    #[test]
    fn test_flatten_grammar_with_maximum_dynamic_precedence() {
        // When nested prec_dynamic annotations apply to the same production,
        // the one with the largest magnitude wins.
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::seq(vec![
                Rule::non_terminal(1),
                Rule::prec_dynamic(101, Rule::seq(vec![
                    Rule::non_terminal(2),
                    Rule::choice(vec![
                        Rule::prec_dynamic(102, Rule::seq(vec![
                            Rule::non_terminal(3),
                            Rule::non_terminal(4)
                        ])),
                        Rule::non_terminal(5),
                    ]),
                    Rule::non_terminal(6),
                ])),
                Rule::non_terminal(7),
            ])
        }).unwrap();
        assert_eq!(result.productions, vec![
            Production {
                dynamic_precedence: 102,
                steps: vec![
                    ProductionStep::new(Symbol::non_terminal(1)),
                    ProductionStep::new(Symbol::non_terminal(2)),
                    ProductionStep::new(Symbol::non_terminal(3)),
                    ProductionStep::new(Symbol::non_terminal(4)),
                    ProductionStep::new(Symbol::non_terminal(6)),
                    ProductionStep::new(Symbol::non_terminal(7)),
                ],
            },
            Production {
                dynamic_precedence: 101,
                steps: vec![
                    ProductionStep::new(Symbol::non_terminal(1)),
                    ProductionStep::new(Symbol::non_terminal(2)),
                    ProductionStep::new(Symbol::non_terminal(5)),
                    ProductionStep::new(Symbol::non_terminal(6)),
                    ProductionStep::new(Symbol::non_terminal(7)),
                ],
            },
        ]);
    }

    #[test]
    fn test_flatten_grammar_with_final_precedence() {
        // A precedence scope that extends to the end of the production stays
        // applied to the final step (the `at_end` case in `apply`).
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::prec_left(101, Rule::seq(vec![
                Rule::non_terminal(1),
                Rule::non_terminal(2),
            ])),
        }).unwrap();
        assert_eq!(result.productions, vec![
            Production {
                dynamic_precedence: 0,
                steps: vec![
                    ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)),
                    ProductionStep::new(Symbol::non_terminal(2)).with_prec(101, Some(Associativity::Left)),
                ]
            }
        ]);
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::prec_left(101, Rule::seq(vec![
                Rule::non_terminal(1),
            ])),
        }).unwrap();
        assert_eq!(result.productions, vec![
            Production {
                dynamic_precedence: 0,
                steps: vec![
                    ProductionStep::new(Symbol::non_terminal(1)).with_prec(101, Some(Associativity::Left)),
                ]
            }
        ]);
    }
}

View file

@ -0,0 +1,238 @@
use super::InternedGrammar;
use crate::error::{Error, Result};
use crate::grammars::{InputGrammar, Variable, VariableType};
use crate::rules::{Rule, Symbol};
pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar> {
let interner = Interner { grammar };
if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden {
return Err(Error(
"Grammar's start rule must be visible".to_string(),
));
}
let mut variables = Vec::with_capacity(grammar.variables.len());
for variable in grammar.variables.iter() {
variables.push(Variable {
name: variable.name.clone(),
kind: variable_type_for_name(&variable.name),
rule: interner.intern_rule(&variable.rule)?,
});
}
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len());
for external_token in grammar.external_tokens.iter() {
let rule = interner.intern_rule(&external_token)?;
let (name, kind) = if let Rule::NamedSymbol(name) = external_token {
(name.clone(), variable_type_for_name(&name))
} else {
(String::new(), VariableType::Anonymous)
};
external_tokens.push(Variable { name, kind, rule });
}
let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
for extra_token in grammar.extra_tokens.iter() {
extra_tokens.push(interner.intern_rule(extra_token)?);
}
let mut expected_conflicts = Vec::new();
for conflict in grammar.expected_conflicts.iter() {
let mut interned_conflict = Vec::with_capacity(conflict.len());
for name in conflict {
interned_conflict.push(
interner
.intern_name(&name)
.ok_or_else(|| Error::undefined_symbol(name))?,
);
}
expected_conflicts.push(interned_conflict);
}
let mut variables_to_inline = Vec::new();
for name in grammar.variables_to_inline.iter() {
if let Some(symbol) = interner.intern_name(&name) {
variables_to_inline.push(symbol);
}
}
let mut word_token = None;
if let Some(name) = grammar.word_token.as_ref() {
word_token = Some(
interner
.intern_name(&name)
.ok_or_else(|| Error::undefined_symbol(&name))?,
);
}
Ok(InternedGrammar {
variables,
external_tokens,
extra_tokens,
expected_conflicts,
variables_to_inline,
word_token,
})
}
/// Resolves symbol names against an `InputGrammar`'s variables and external
/// tokens. Holds only a borrow of the grammar; see `intern_symbols`.
struct Interner<'a> {
    grammar: &'a InputGrammar,
}
impl<'a> Interner<'a> {
fn intern_rule(&self, rule: &Rule) -> Result<Rule> {
match rule {
Rule::Choice(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Choice(result))
}
Rule::Seq(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Seq(result))
}
Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))),
Rule::Metadata { rule, params } => Ok(Rule::Metadata {
rule: Box::new(self.intern_rule(rule)?),
params: params.clone(),
}),
Rule::NamedSymbol(name) => {
if let Some(symbol) = self.intern_name(&name) {
Ok(Rule::Symbol(symbol))
} else {
Err(Error::undefined_symbol(name))
}
}
_ => Ok(rule.clone()),
}
}
fn intern_name(&self, symbol: &str) -> Option<Symbol> {
for (i, variable) in self.grammar.variables.iter().enumerate() {
if variable.name == symbol {
return Some(Symbol::non_terminal(i));
}
}
for (i, external_token) in self.grammar.external_tokens.iter().enumerate() {
if let Rule::NamedSymbol(name) = external_token {
if name == symbol {
return Some(Symbol::external(i));
}
}
}
return None;
}
}
/// Names beginning with an underscore denote hidden rules; all other rule
/// names denote named (visible) rules.
fn variable_type_for_name(name: &str) -> VariableType {
    match name.starts_with('_') {
        true => VariableType::Hidden,
        false => VariableType::Named,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_repeat_expansion() {
        // Named references are replaced by non-terminal indices, and names
        // starting with `_` produce hidden variables.
        let grammar = intern_symbols(&build_grammar(vec![
            Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])),
            Variable::named("y", Rule::named("_z")),
            Variable::named("_z", Rule::string("a")),
        ]))
        .unwrap();
        assert_eq!(
            grammar.variables,
            vec![
                Variable::named(
                    "x",
                    Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),])
                ),
                Variable::named("y", Rule::non_terminal(2)),
                Variable::hidden("_z", Rule::string("a")),
            ]
        );
    }

    #[test]
    fn test_interning_external_token_names() {
        // Variable `y` is both an internal and an external token.
        // Variable `z` is just an external token.
        let mut input_grammar = build_grammar(vec![
            Variable::named(
                "w",
                Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]),
            ),
            Variable::named("x", Rule::string("a")),
            Variable::named("y", Rule::string("b")),
        ]);
        input_grammar
            .external_tokens
            .extend(vec![Rule::named("y"), Rule::named("z")]);
        let grammar = intern_symbols(&input_grammar).unwrap();
        // Variable `y` is referred to by its internal index.
        // Variable `z` is referred to by its external index.
        assert_eq!(
            grammar.variables,
            vec![
                Variable::named(
                    "w",
                    Rule::choice(vec![
                        Rule::non_terminal(1),
                        Rule::non_terminal(2),
                        Rule::external(1),
                    ])
                ),
                Variable::named("x", Rule::string("a")),
                Variable::named("y", Rule::string("b")),
            ]
        );
        // The external token for `y` refers back to its internal index.
        assert_eq!(
            grammar.external_tokens,
            vec![
                Variable::named("y", Rule::non_terminal(2)),
                Variable::named("z", Rule::external(1)),
            ]
        );
    }

    #[test]
    fn test_grammar_with_undefined_symbols() {
        // Referencing a name with no definition yields an error, not a panic.
        let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))]));
        match result {
            Err(Error(message)) => assert_eq!(message, "Undefined symbol 'y'"),
            _ => panic!("Expected an error but got none"),
        }
    }

    // Helper: wraps a list of variables in an otherwise-empty grammar.
    fn build_grammar(variables: Vec<Variable>) -> InputGrammar {
        InputGrammar {
            variables,
            name: "the_language".to_string(),
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            expected_conflicts: Vec::new(),
            variables_to_inline: Vec::new(),
            word_token: None,
        }
    }
}

View file

@ -0,0 +1,57 @@
mod expand_repeats;
mod expand_tokens;
mod extract_simple_aliases;
mod extract_tokens;
mod flatten_grammar;
mod intern_symbols;
mod process_inlines;
use self::expand_repeats::expand_repeats;
pub(crate) use self::expand_tokens::expand_tokens;
use self::extract_simple_aliases::extract_simple_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
use self::process_inlines::process_inlines;
use crate::error::Result;
use crate::grammars::{
ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar, Variable,
};
use crate::rules::{AliasMap, Rule, Symbol};
/// A grammar at an intermediate stage of preparation. The type parameters
/// describe how extra tokens (`T`) and external tokens (`U`) are represented
/// at that stage: as full rules early on, as symbols/tokens once extracted.
pub(crate) struct IntermediateGrammar<T, U> {
    variables: Vec<Variable>,
    extra_tokens: Vec<T>,
    expected_conflicts: Vec<Vec<Symbol>>,
    external_tokens: Vec<U>,
    variables_to_inline: Vec<Symbol>,
    word_token: Option<Symbol>,
}

// Produced by `intern_symbols`: token content is still expressed as rules.
pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;

// Produced by `extract_tokens`: tokens have been replaced with symbols.
pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
/// The lexical half of the grammar produced by `extract_tokens`: the token
/// rules themselves plus the rules for text separating tokens (whitespace,
/// comments, etc.).
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ExtractedLexicalGrammar {
    pub variables: Vec<Variable>,
    pub separators: Vec<Rule>,
}
/// Runs the full grammar-preparation pipeline, transforming a parsed
/// `InputGrammar` into the syntax and lexical grammars used by table
/// construction, along with inlining and aliasing metadata.
///
/// Stages, in order: intern symbol names into numeric symbols, split the
/// rules into syntactic vs. lexical grammars, expand repeats, flatten nested
/// rules into productions, expand token rules, extract simple aliases, and
/// precompute inlined productions.
pub(crate) fn prepare_grammar(
    input_grammar: &InputGrammar,
) -> Result<(
    SyntaxGrammar,
    LexicalGrammar,
    InlinedProductionMap,
    AliasMap,
)> {
    let interned_grammar = intern_symbols(input_grammar)?;
    let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
    let syntax_grammar = expand_repeats(syntax_grammar);
    // `flatten_grammar` needs `mut` because `extract_simple_aliases` strips
    // the aliases it extracts out of the grammar in place.
    let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
    let lexical_grammar = expand_tokens(lexical_grammar)?;
    let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar);
    let inlines = process_inlines(&syntax_grammar);
    Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases))
}

View file

@ -0,0 +1,479 @@
use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar};
use hashbrown::HashMap;
/// Identifies a single step of a single production, either within one of the
/// grammar's variables or within the builder's list of generated productions.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct ProductionStepId {
    // A `None` value here means that the production itself was produced via inlining,
    // and is stored in the builder's `productions` vector, as opposed to being
    // stored in one of the grammar's variables.
    variable_index: Option<usize>,
    production_index: usize,
    step_index: usize,
}
/// Accumulates the productions generated by inlining, plus a map from each
/// inlined step to the indices of the productions that replace it.
struct InlinedProductionMapBuilder {
    production_indices_by_step_id: HashMap<ProductionStepId, Vec<usize>>,
    productions: Vec<Production>,
}
impl InlinedProductionMapBuilder {
    /// Walks every production step in the grammar, expanding each step whose
    /// symbol appears in `variables_to_inline`, and returns a map from
    /// (production, step index) pairs to the productions created by inlining.
    fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap {
        let mut step_ids_to_process = Vec::new();
        for (variable_index, variable) in grammar.variables.iter().enumerate() {
            for production_index in 0..variable.productions.len() {
                step_ids_to_process.push(ProductionStepId {
                    variable_index: Some(variable_index),
                    production_index,
                    step_index: 0,
                });
                // Drain the worklist: each entry is either advanced to its
                // next step, replaced by ids for newly inlined productions,
                // or removed once its production has no more steps.
                while !step_ids_to_process.is_empty() {
                    let mut i = 0;
                    while i < step_ids_to_process.len() {
                        let step_id = step_ids_to_process[i];
                        if let Some(step) = self.production_step_for_id(step_id, grammar) {
                            if grammar.variables_to_inline.contains(&step.symbol) {
                                // Replace this entry with one entry per
                                // production generated by inlining this step.
                                let inlined_step_ids = self
                                    .inline_production_at_step(step_id, grammar)
                                    .into_iter()
                                    .cloned()
                                    .map(|production_index| ProductionStepId {
                                        variable_index: None,
                                        production_index,
                                        step_index: step_id.step_index,
                                    });
                                step_ids_to_process.splice(i..i + 1, inlined_step_ids);
                            } else {
                                // Nothing to inline here; move to the next step.
                                step_ids_to_process[i] = ProductionStepId {
                                    variable_index: step_id.variable_index,
                                    production_index: step_id.production_index,
                                    step_index: step_id.step_index + 1,
                                };
                                i += 1;
                            }
                        } else {
                            // Walked past the end of this production.
                            step_ids_to_process.remove(i);
                        }
                    }
                }
            }
        }

        // Re-key the map by raw production pointer + step index, which is how
        // lookups are performed once the grammar is frozen.
        let productions = self.productions;
        let production_indices_by_step_id = self.production_indices_by_step_id;
        let production_map = production_indices_by_step_id
            .into_iter()
            .map(|(step_id, production_indices)| {
                let production = if let Some(variable_index) = step_id.variable_index {
                    &grammar.variables[variable_index].productions[step_id.production_index]
                } else {
                    &productions[step_id.production_index]
                } as *const Production;
                ((production, step_id.step_index as u32), production_indices)
            })
            .collect();
        InlinedProductionMap {
            productions,
            production_map,
        }
    }

    /// Computes (and caches) the productions obtained by inlining the symbol
    /// at `step_id`, returning the indices of those productions within
    /// `self.productions`. Inlining is applied repeatedly at the same step
    /// position, so nested inlinable symbols are fully expanded.
    fn inline_production_at_step<'a>(
        &'a mut self,
        step_id: ProductionStepId,
        grammar: &'a SyntaxGrammar,
    ) -> &'a Vec<usize> {
        // Build a list of productions produced by inlining rules.
        let mut i = 0;
        let step_index = step_id.step_index;
        let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()];
        while i < productions_to_add.len() {
            if let Some(step) = productions_to_add[i].steps.get(step_index) {
                let symbol = step.symbol.clone();
                if grammar.variables_to_inline.contains(&symbol) {
                    // Remove the production from the vector, replacing it with a placeholder.
                    let production = productions_to_add
                        .splice(i..i + 1, [Production::default()].iter().cloned())
                        .next()
                        .unwrap();

                    // Replace the placeholder with one copy of the production
                    // per production of the inlined variable, splicing that
                    // variable's steps in place of the inlined step.
                    productions_to_add.splice(
                        i..i + 1,
                        grammar.variables[symbol.index].productions.iter().map(|p| {
                            let mut production = production.clone();
                            let removed_step = production
                                .steps
                                .splice(step_index..(step_index + 1), p.steps.iter().cloned())
                                .next()
                                .unwrap();
                            let inserted_steps =
                                &mut production.steps[step_index..(step_index + p.steps.len())];
                            // Every inserted step inherits the alias of the
                            // step it replaces.
                            if let Some(alias) = removed_step.alias {
                                for inserted_step in inserted_steps.iter_mut() {
                                    inserted_step.alias = Some(alias.clone());
                                }
                            }
                            // The last inserted step inherits the removed
                            // step's precedence and associativity unless it
                            // already has its own.
                            if let Some(last_inserted_step) = inserted_steps.last_mut() {
                                if last_inserted_step.precedence == 0 {
                                    last_inserted_step.precedence = removed_step.precedence;
                                }
                                if last_inserted_step.associativity == None {
                                    last_inserted_step.associativity = removed_step.associativity;
                                }
                            }
                            production
                        }),
                    );
                    continue;
                }
            }
            i += 1;
        }

        // Store all the computed productions, reusing an existing index when
        // an identical production is already present.
        //
        // BUG FIX: this must be `unwrap_or_else`, not `unwrap_or`.
        // `unwrap_or` evaluates its argument eagerly, so the original code
        // pushed a duplicate copy of the production onto `self.productions`
        // even when `position` had already found an identical one.
        let result = productions_to_add
            .into_iter()
            .map(|production| {
                self.productions
                    .iter()
                    .position(|p| *p == production)
                    .unwrap_or_else(|| {
                        self.productions.push(production);
                        self.productions.len() - 1
                    })
            })
            .collect();

        // Cache these productions based on the original production step.
        self.production_indices_by_step_id
            .entry(step_id)
            .or_insert(result)
    }

    /// Resolves a step id to its production, looking in the grammar for
    /// original productions and in `self.productions` for inlined ones.
    fn production_for_id<'a>(
        &'a self,
        id: ProductionStepId,
        grammar: &'a SyntaxGrammar,
    ) -> &'a Production {
        if let Some(variable_index) = id.variable_index {
            &grammar.variables[variable_index].productions[id.production_index]
        } else {
            &self.productions[id.production_index]
        }
    }

    /// Resolves a step id to the step itself, or `None` if the id's step
    /// index is past the end of the production.
    fn production_step_for_id<'a>(
        &'a self,
        id: ProductionStepId,
        grammar: &'a SyntaxGrammar,
    ) -> Option<&'a ProductionStep> {
        self.production_for_id(id, grammar).steps.get(id.step_index)
    }
}
/// Precomputes, for every production step whose symbol is marked for
/// inlining, the productions that result from inlining it.
pub(super) fn process_inlines(grammar: &SyntaxGrammar) -> InlinedProductionMap {
    let builder = InlinedProductionMapBuilder {
        productions: Vec::new(),
        production_indices_by_step_id: HashMap::new(),
    };
    builder.build(grammar)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::grammars::{ProductionStep, SyntaxVariable, VariableType};
    use crate::rules::{Associativity, Symbol};

    #[test]
    fn test_basic_inlining() {
        // One inlinable variable with two productions: inlining its step
        // should yield two expanded productions.
        let grammar = SyntaxGrammar {
            expected_conflicts: Vec::new(),
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
            variables_to_inline: vec![Symbol::non_terminal(1)],
            variables: vec![
                SyntaxVariable {
                    name: "non-terminal-0".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(10)),
                            ProductionStep::new(Symbol::non_terminal(1)), // inlined
                            ProductionStep::new(Symbol::terminal(11)),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "non-terminal-1".to_string(),
                    kind: VariableType::Named,
                    productions: vec![
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![
                                ProductionStep::new(Symbol::terminal(12)),
                                ProductionStep::new(Symbol::terminal(13)),
                            ],
                        },
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![ProductionStep::new(Symbol::terminal(14))],
                        },
                    ],
                },
            ],
        };
        let inline_map = process_inlines(&grammar);
        // Nothing to inline at step 0.
        assert!(inline_map
            .inlined_productions(&grammar.variables[0].productions[0], 0)
            .is_none());
        // Inlining variable 1 yields two productions.
        assert_eq!(
            inline_map
                .inlined_productions(&grammar.variables[0].productions[0], 1)
                .unwrap()
                .cloned()
                .collect::<Vec<_>>(),
            vec![
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::terminal(10)),
                        ProductionStep::new(Symbol::terminal(12)),
                        ProductionStep::new(Symbol::terminal(13)),
                        ProductionStep::new(Symbol::terminal(11)),
                    ],
                },
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::terminal(10)),
                        ProductionStep::new(Symbol::terminal(14)),
                        ProductionStep::new(Symbol::terminal(11)),
                    ],
                },
            ]
        );
    }

    #[test]
    fn test_nested_inlining() {
        // Inlined variables whose productions themselves contain inlinable
        // symbols must be expanded recursively, and generated productions
        // must themselves be usable as lookup keys.
        let grammar = SyntaxGrammar {
            variables: vec![
                SyntaxVariable {
                    name: "non-terminal-0".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(10)),
                            ProductionStep::new(Symbol::non_terminal(1)), // inlined
                            ProductionStep::new(Symbol::terminal(11)),
                            ProductionStep::new(Symbol::non_terminal(2)), // inlined
                            ProductionStep::new(Symbol::terminal(12)),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "non-terminal-1".to_string(),
                    kind: VariableType::Named,
                    productions: vec![
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![ProductionStep::new(Symbol::terminal(13))],
                        },
                        Production {
                            dynamic_precedence: 0,
                            steps: vec![
                                ProductionStep::new(Symbol::non_terminal(3)), // inlined
                                ProductionStep::new(Symbol::terminal(14)),
                            ],
                        },
                    ],
                },
                SyntaxVariable {
                    name: "non-terminal-2".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![ProductionStep::new(Symbol::terminal(15))],
                    }],
                },
                SyntaxVariable {
                    name: "non-terminal-3".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![ProductionStep::new(Symbol::terminal(16))],
                    }],
                },
            ],
            variables_to_inline: vec![
                Symbol::non_terminal(1),
                Symbol::non_terminal(2),
                Symbol::non_terminal(3),
            ],
            expected_conflicts: Vec::new(),
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let inline_map = process_inlines(&grammar);
        let productions: Vec<&Production> = inline_map
            .inlined_productions(&grammar.variables[0].productions[0], 1)
            .unwrap()
            .collect();
        assert_eq!(
            productions.iter().cloned().cloned().collect::<Vec<_>>(),
            vec![
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::terminal(10)),
                        ProductionStep::new(Symbol::terminal(13)),
                        ProductionStep::new(Symbol::terminal(11)),
                        ProductionStep::new(Symbol::non_terminal(2)),
                        ProductionStep::new(Symbol::terminal(12)),
                    ],
                },
                Production {
                    dynamic_precedence: 0,
                    steps: vec![
                        ProductionStep::new(Symbol::terminal(10)),
                        ProductionStep::new(Symbol::terminal(16)),
                        ProductionStep::new(Symbol::terminal(14)),
                        ProductionStep::new(Symbol::terminal(11)),
                        ProductionStep::new(Symbol::non_terminal(2)),
                        ProductionStep::new(Symbol::terminal(12)),
                    ],
                },
            ]
        );
        assert_eq!(
            inline_map
                .inlined_productions(productions[0], 3)
                .unwrap()
                .cloned()
                .collect::<Vec<_>>(),
            vec![Production {
                dynamic_precedence: 0,
                steps: vec![
                    ProductionStep::new(Symbol::terminal(10)),
                    ProductionStep::new(Symbol::terminal(13)),
                    ProductionStep::new(Symbol::terminal(11)),
                    ProductionStep::new(Symbol::terminal(15)),
                    ProductionStep::new(Symbol::terminal(12)),
                ],
            },]
        );
    }

    #[test]
    fn test_inlining_with_precedence_and_alias() {
        // Verifies how precedence/associativity and aliases of an inlined
        // step are transferred onto the steps that replace it.
        let grammar = SyntaxGrammar {
            variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)],
            variables: vec![
                SyntaxVariable {
                    name: "non-terminal-0".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            // inlined
                            ProductionStep::new(Symbol::non_terminal(1))
                                .with_prec(1, Some(Associativity::Left)),
                            ProductionStep::new(Symbol::terminal(10)),
                            // inlined
                            ProductionStep::new(Symbol::non_terminal(2))
                                .with_alias("outer_alias", true),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "non-terminal-1".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![
                            ProductionStep::new(Symbol::terminal(11))
                                .with_prec(2, None)
                                .with_alias("inner_alias", true),
                            ProductionStep::new(Symbol::terminal(12)).with_prec(3, None),
                        ],
                    }],
                },
                SyntaxVariable {
                    name: "non-terminal-2".to_string(),
                    kind: VariableType::Named,
                    productions: vec![Production {
                        dynamic_precedence: 0,
                        steps: vec![ProductionStep::new(Symbol::terminal(13))],
                    }],
                },
            ],
            expected_conflicts: Vec::new(),
            extra_tokens: Vec::new(),
            external_tokens: Vec::new(),
            word_token: None,
        };
        let inline_map = process_inlines(&grammar);
        let productions: Vec<_> = inline_map
            .inlined_productions(&grammar.variables[0].productions[0], 0)
            .unwrap()
            .collect();
        assert_eq!(
            productions.iter().cloned().cloned().collect::<Vec<_>>(),
            vec![Production {
                dynamic_precedence: 0,
                steps: vec![
                    // The first step in the inlined production retains its precedence
                    // and alias.
                    ProductionStep::new(Symbol::terminal(11))
                        .with_prec(2, None)
                        .with_alias("inner_alias", true),
                    // The final step of the inlined production inherits the precedence of
                    // the inlined step.
                    ProductionStep::new(Symbol::terminal(12))
                        .with_prec(1, Some(Associativity::Left)),
                    ProductionStep::new(Symbol::terminal(10)),
                    ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true),
                ]
            }],
        );
        assert_eq!(
            inline_map
                .inlined_productions(productions[0], 3)
                .unwrap()
                .cloned()
                .collect::<Vec<_>>(),
            vec![Production {
                dynamic_precedence: 0,
                steps: vec![
                    ProductionStep::new(Symbol::terminal(11))
                        .with_prec(2, None)
                        .with_alias("inner_alias", true),
                    ProductionStep::new(Symbol::terminal(12))
                        .with_prec(1, Some(Associativity::Left)),
                    ProductionStep::new(Symbol::terminal(10)),
                    // All steps of the inlined production inherit their alias from the
                    // inlined step.
                    ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true),
                ]
            }],
        );
    }
}

1034
cli/src/render/mod.rs Normal file

File diff suppressed because it is too large Load diff

234
cli/src/rules.rs Normal file
View file

@ -0,0 +1,234 @@
use hashbrown::HashMap;
/// The category of a `Symbol`: which table its `index` refers into.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum SymbolType {
    External,
    End,
    Terminal,
    NonTerminal,
}

/// Operator associativity, used when resolving conflicts between actions
/// with equal precedence.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum Associativity {
    Left,
    Right,
}

/// A substitute name under which a node appears in the syntax tree.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Alias {
    pub value: String,
    // Whether the aliased node is presented as a named node (as opposed to
    // an anonymous one).
    pub is_named: bool,
}

/// Maps symbols to aliases that apply to them unconditionally.
pub(crate) type AliasMap = HashMap<Symbol, Alias>;

/// The set of annotations that can be attached to a rule via
/// `Rule::Metadata`. All fields default to "no annotation".
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct MetadataParams {
    pub precedence: Option<i32>,
    pub dynamic_precedence: i32,
    pub associativity: Option<Associativity>,
    pub is_token: bool,
    pub is_string: bool,
    pub is_active: bool,
    pub is_main_token: bool,
    pub alias: Option<Alias>,
}

/// A numeric reference to a grammar variable, terminal, or external token.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Symbol {
    pub kind: SymbolType,
    pub index: usize,
}
/// A grammar rule expression, as written in the grammar or as transformed by
/// the preparation pipeline.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum Rule {
    Blank,
    String(String),
    Pattern(String),
    // A reference to another rule by name; replaced by `Symbol` during
    // interning.
    NamedSymbol(String),
    Symbol(Symbol),
    Choice(Vec<Rule>),
    Metadata {
        params: MetadataParams,
        rule: Box<Rule>,
    },
    Repeat(Box<Rule>),
    Seq(Vec<Rule>),
}
impl Rule {
    /// Wraps `content` so that it appears in the tree under the given name.
    pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
        add_metadata(content, move |params| {
            params.alias = Some(Alias {
                is_named,
                value
            });
        })
    }

    /// Marks `content` as a single token.
    pub fn token(content: Rule) -> Self {
        add_metadata(content, |params| {
            params.is_token = true;
        })
    }

    /// Marks `content` as a token that must immediately follow the previous
    /// token (the "main token" flag).
    pub fn immediate_token(content: Rule) -> Self {
        add_metadata(content, |params| {
            params.is_token = true;
            params.is_main_token = true;
        })
    }

    /// Assigns a precedence with no associativity.
    pub fn prec(value: i32, content: Rule) -> Self {
        add_metadata(content, |params| {
            params.precedence = Some(value);
        })
    }

    /// Assigns a precedence with left associativity.
    pub fn prec_left(value: i32, content: Rule) -> Self {
        add_metadata(content, |params| {
            params.associativity = Some(Associativity::Left);
            params.precedence = Some(value);
        })
    }

    /// Assigns a precedence with right associativity.
    pub fn prec_right(value: i32, content: Rule) -> Self {
        add_metadata(content, |params| {
            params.associativity = Some(Associativity::Right);
            params.precedence = Some(value);
        })
    }

    /// Assigns a dynamic (runtime conflict-resolution) precedence.
    pub fn prec_dynamic(value: i32, content: Rule) -> Self {
        add_metadata(content, |params| {
            params.dynamic_precedence = value;
        })
    }

    pub fn repeat(rule: Rule) -> Self {
        Rule::Repeat(Box::new(rule))
    }

    /// Builds a choice, flattening nested choices and removing duplicate
    /// branches (see `choice_helper`).
    pub fn choice(rules: Vec<Rule>) -> Self {
        let mut elements = Vec::with_capacity(rules.len());
        for rule in rules {
            choice_helper(&mut elements, rule);
        }
        Rule::Choice(elements)
    }

    pub fn seq(rules: Vec<Rule>) -> Self {
        Rule::Seq(rules)
    }
}
// Convenience constructors that are only needed by unit tests.
#[cfg(test)]
impl Rule {
    pub fn terminal(index: usize) -> Self {
        Rule::Symbol(Symbol::terminal(index))
    }

    pub fn non_terminal(index: usize) -> Self {
        Rule::Symbol(Symbol::non_terminal(index))
    }

    pub fn external(index: usize) -> Self {
        Rule::Symbol(Symbol::external(index))
    }

    pub fn named(name: &'static str) -> Self {
        Rule::NamedSymbol(name.to_string())
    }

    pub fn string(value: &'static str) -> Self {
        Rule::String(value.to_string())
    }

    pub fn pattern(value: &'static str) -> Self {
        Rule::Pattern(value.to_string())
    }
}
impl Symbol {
    /// Whether this symbol refers into the lexical (terminal) table.
    pub fn is_terminal(&self) -> bool {
        match self.kind {
            SymbolType::Terminal => true,
            _ => false,
        }
    }

    /// Whether this symbol refers into the syntactic (non-terminal) table.
    pub fn is_non_terminal(&self) -> bool {
        match self.kind {
            SymbolType::NonTerminal => true,
            _ => false,
        }
    }

    /// Whether this symbol refers into the external-token table.
    pub fn is_external(&self) -> bool {
        match self.kind {
            SymbolType::External => true,
            _ => false,
        }
    }

    /// Whether this symbol is the end-of-input marker.
    pub fn is_eof(&self) -> bool {
        match self.kind {
            SymbolType::End => true,
            _ => false,
        }
    }

    /// A non-terminal symbol with the given index.
    pub fn non_terminal(index: usize) -> Self {
        Self {
            kind: SymbolType::NonTerminal,
            index,
        }
    }

    /// A terminal symbol with the given index.
    pub fn terminal(index: usize) -> Self {
        Self {
            kind: SymbolType::Terminal,
            index,
        }
    }

    /// An external-token symbol with the given index.
    pub fn external(index: usize) -> Self {
        Self {
            kind: SymbolType::External,
            index,
        }
    }

    /// The end-of-input symbol.
    pub fn end() -> Self {
        Self {
            kind: SymbolType::End,
            index: 0,
        }
    }
}
/// Allows a `Symbol` to be used directly wherever a `Rule` is expected.
impl From<Symbol> for Rule {
    fn from(symbol: Symbol) -> Self {
        Rule::Symbol(symbol)
    }
}
/// Applies `f` to the rule's metadata parameters. An existing `Metadata`
/// wrapper is updated in place; any other rule is first wrapped in a
/// `Metadata` node with default parameters.
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
    let (rule, mut params) = match input {
        Rule::Metadata { rule, params } => (rule, params),
        other => (Box::new(other), MetadataParams::default()),
    };
    f(&mut params);
    Rule::Metadata { rule, params }
}
/// Accumulates the branches of a choice into `result`, recursively flattening
/// nested choices and skipping branches that are already present.
fn choice_helper(result: &mut Vec<Rule>, rule: Rule) {
    if let Rule::Choice(elements) = rule {
        for element in elements {
            choice_helper(result, element);
        }
    } else if !result.contains(&rule) {
        result.push(rule);
    }
}

140
cli/src/tables.rs Normal file
View file

@ -0,0 +1,140 @@
use crate::nfa::CharacterSet;
use crate::rules::{Alias, Associativity, Symbol};
use hashbrown::HashMap;
// Index into `ParseTable::alias_sequences`.
pub(crate) type AliasSequenceId = usize;
// Index into `ParseTable::states`.
pub(crate) type ParseStateId = usize;
// Index into `LexTable::states`.
pub(crate) type LexStateId = usize;
/// A single action in a parse-table entry.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum ParseAction {
    Accept,
    Shift {
        state: ParseStateId,
        is_repetition: bool,
    },
    ShiftExtra,
    Recover,
    // Reduce carries everything needed to resolve conflicts (precedence,
    // dynamic precedence, associativity) and to rename children
    // (alias_sequence_id).
    Reduce {
        symbol: Symbol,
        child_count: usize,
        precedence: i32,
        dynamic_precedence: i32,
        associativity: Option<Associativity>,
        alias_sequence_id: AliasSequenceId,
    },
}
/// The list of actions associated with one lookahead symbol in a parse state.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
    pub actions: Vec<ParseAction>,
    // Whether a lookahead token matched in this state can be reused by a
    // later state during incremental parsing.
    pub reusable: bool,
}

/// One state of the parse table: terminal entries hold actions, nonterminal
/// entries hold goto states.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseState {
    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
    pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
    pub lex_state_id: usize,
    pub unfinished_item_signature: u64,
}

/// The complete LR parse table, plus the alias sequences referenced by its
/// reduce actions.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParseTable {
    pub states: Vec<ParseState>,
    pub symbols: Vec<Symbol>,
    pub alias_sequences: Vec<Vec<Option<Alias>>>,
    pub max_aliased_production_length: usize,
}

/// A lexer transition: `None` as the target state means the token ends here.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AdvanceAction {
    pub state: Option<LexStateId>,
    pub in_main_token: bool,
}

/// One state of the lex table: character-set transitions plus an optional
/// token to accept when no transition applies.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct LexState {
    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
    pub accept_action: Option<Symbol>,
}

/// The complete lexer state machine.
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexTable {
    pub states: Vec<LexState>,
}
impl ParseTableEntry {
    /// An entry with no actions, initially marked reusable.
    pub fn new() -> Self {
        Self {
            reusable: true,
            actions: Vec::new(),
        }
    }
}
impl Default for LexTable {
    // An empty lex table with no states.
    fn default() -> Self {
        LexTable { states: Vec::new() }
    }
}
impl ParseState {
    /// Iterates over the ids of all states reachable from this one, via both
    /// terminal shift actions and nonterminal (goto) entries.
    pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
        self.terminal_entries
            .iter()
            .flat_map(|(_, entry)| {
                entry.actions.iter().filter_map(|action| match action {
                    ParseAction::Shift { state, .. } => Some(*state),
                    _ => None,
                })
            })
            .chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
    }

    /// Rewrites every state id referenced by this state using `f`, which
    /// receives the current id and this state and returns the new id.
    pub fn update_referenced_states<F>(&mut self, mut f: F)
    where
        F: FnMut(usize, &ParseState) -> usize,
    {
        // Collect updates first: `f` borrows `self` immutably, so the entry
        // maps cannot be mutated while it is being called.
        let mut updates = Vec::new();
        for (symbol, entry) in &self.terminal_entries {
            for (i, action) in entry.actions.iter().enumerate() {
                if let ParseAction::Shift { state, .. } = action {
                    let result = f(*state, self);
                    if result != *state {
                        updates.push((*symbol, i, result));
                    }
                }
            }
        }
        for (symbol, other_state) in &self.nonterminal_entries {
            let result = f(*other_state, self);
            if result != *other_state {
                // Nonterminal entries hold a single state id, not an action
                // list; 0 is a placeholder index that is ignored below.
                updates.push((*symbol, 0, result));
            }
        }
        // Apply the collected updates.
        for (symbol, action_index, new_state) in updates {
            if symbol.is_non_terminal() {
                self.nonterminal_entries.insert(symbol, new_state);
            } else {
                let entry = self.terminal_entries.get_mut(&symbol).unwrap();
                if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
                    entry.actions[action_index] = ParseAction::Shift {
                        state: new_state,
                        is_repetition,
                    };
                }
            }
        }
    }
}
impl ParseAction {
    /// The precedence used when comparing this action against others during
    /// conflict resolution. Only `Reduce` actions carry a precedence; every
    /// other action reports zero.
    pub fn precedence(&self) -> i32 {
        match self {
            ParseAction::Reduce { precedence, .. } => *precedence,
            _ => 0,
        }
    }
}

1
externals/bandit vendored

@ -1 +0,0 @@
Subproject commit bfdb8a3322a2e54b11aea64d84f9788d83477e83

@ -1 +0,0 @@
Subproject commit c7e5c23ab04ecfb5465cbefbe17ba23d4cb3bc9d

1
externals/gyp vendored

@ -1 +0,0 @@
Subproject commit e0ee72ddc7fb97eb33d530cf684efcbe4d27ecb3

@ -1 +0,0 @@
Subproject commit 70533215eea575e40a0b91a34ae01a779641d73a

32
lib/Cargo.toml Normal file
View file

@ -0,0 +1,32 @@
[package]
name = "tree-sitter"
description = "Rust bindings to the Tree-sitter parsing library"
version = "0.3.5"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["api-bindings", "parsing", "text-editors"]
include = [
"/build.rs",
"/Cargo.toml",
"/LICENSE",
"/README.md",
"/src/*",
"/core/tree-sitter/externals/utf8proc/utf8proc*",
"/core/tree-sitter/include/*",
"/core/tree-sitter/src/runtime/*",
]
[dependencies]
regex = "1"
serde = "1.0"
serde_json = "1.0"
serde_derive = "1.0"
[build-dependencies]
cc = "1.0"
[lib]
path = "binding/lib.rs"

98
lib/README.md Normal file
View file

@ -0,0 +1,98 @@
Rust Tree-sitter
===========================
[![Build Status](https://travis-ci.org/tree-sitter/rust-tree-sitter.svg)](https://travis-ci.org/tree-sitter/rust-tree-sitter)
[![Build status](https://ci.appveyor.com/api/projects/status/d0f6vqq3rflxx3y6/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/rust-tree-sitter/branch/master)
[![Crates.io](https://img.shields.io/crates/v/tree-sitter.svg)](https://crates.io/crates/tree-sitter)
Rust bindings to the [Tree-sitter][] parsing library.
### Basic Usage
First, create a parser:
```rust
use tree_sitter::{Parser, Language};
// ...
let mut parser = Parser::new();
```
Then assign a language to the parser. Tree-sitter languages consist of generated C code. To use them from rust, you must declare them as `extern "C"` functions and invoke them with `unsafe`:
```rust
extern "C" { fn tree_sitter_c() -> Language; }
extern "C" { fn tree_sitter_rust() -> Language; }
extern "C" { fn tree_sitter_javascript() -> Language; }
let language = unsafe { tree_sitter_rust() };
parser.set_language(language).unwrap();
```
Now you can parse source code:
```rust
let source_code = "fn test() {}";
let tree = parser.parse_str(source_code, None);
let root_node = tree.root_node();
assert_eq!(root_node.kind(), "source_file");
assert_eq!(root_node.start_position().column, 0);
assert_eq!(root_node.end_position().column, 12);
```
### Editing
Once you have a syntax tree, you can update it when your source code changes. Passing in the previous edited tree makes `parse` run much more quickly:
```rust
let new_source_code = "fn test(a: u32) {}";
tree.edit(InputEdit {
start_byte: 8,
old_end_byte: 8,
new_end_byte: 14,
start_position: Point::new(0, 8),
old_end_position: Point::new(0, 8),
new_end_position: Point::new(0, 14),
});
let new_tree = parser.parse_str(new_source_code, Some(&tree));
```
### Text Input
The source code to parse can be provided either as a string or as a function that returns text encoded as either UTF8 or UTF16:
```rust
// Store some source code in an array of lines.
let lines = &[
"pub fn foo() {",
" 1",
"}",
];
// Parse the source code using a custom callback. The callback is called
// with both a byte offset and a row/column offset.
let tree = parser.parse_utf8(&mut |_byte: u32, position: Point| -> &[u8] {
let row = position.row as usize;
let column = position.column as usize;
if row < lines.len() {
if column < lines[row].as_bytes().len() {
&lines[row].as_bytes()[column..]
} else {
"\n".as_bytes()
}
} else {
&[]
}
}, None).unwrap();
assert_eq!(
tree.root_node().to_sexp(),
"(source_file (function_item (visibility_modifier) (identifier) (parameters) (block (number_literal))))"
);
```
[tree-sitter]: https://github.com/tree-sitter/tree-sitter

310
lib/binding/bindings.rs Normal file
View file

@ -0,0 +1,310 @@
/* automatically generated by rust-bindgen */
pub type FILE = [u64; 19usize];
pub type TSSymbol = u16;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSLanguage {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSParser {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSTree {
_unused: [u8; 0],
}
pub const TSInputEncoding_TSInputEncodingUTF8: TSInputEncoding = 0;
pub const TSInputEncoding_TSInputEncodingUTF16: TSInputEncoding = 1;
pub type TSInputEncoding = u32;
pub const TSSymbolType_TSSymbolTypeRegular: TSSymbolType = 0;
pub const TSSymbolType_TSSymbolTypeAnonymous: TSSymbolType = 1;
pub const TSSymbolType_TSSymbolTypeAuxiliary: TSSymbolType = 2;
pub type TSSymbolType = u32;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSPoint {
pub row: u32,
pub column: u32,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSRange {
pub start_point: TSPoint,
pub end_point: TSPoint,
pub start_byte: u32,
pub end_byte: u32,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSInput {
pub payload: *mut ::std::os::raw::c_void,
pub read: ::std::option::Option<
unsafe extern "C" fn(
payload: *mut ::std::os::raw::c_void,
byte_index: u32,
position: TSPoint,
bytes_read: *mut u32,
) -> *const ::std::os::raw::c_char,
>,
pub encoding: TSInputEncoding,
}
pub const TSLogType_TSLogTypeParse: TSLogType = 0;
pub const TSLogType_TSLogTypeLex: TSLogType = 1;
pub type TSLogType = u32;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSLogger {
pub payload: *mut ::std::os::raw::c_void,
pub log: ::std::option::Option<
unsafe extern "C" fn(
payload: *mut ::std::os::raw::c_void,
arg1: TSLogType,
arg2: *const ::std::os::raw::c_char,
),
>,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSInputEdit {
pub start_byte: u32,
pub old_end_byte: u32,
pub new_end_byte: u32,
pub start_point: TSPoint,
pub old_end_point: TSPoint,
pub new_end_point: TSPoint,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSNode {
pub context: [u32; 4usize],
pub id: *const ::std::os::raw::c_void,
pub tree: *const TSTree,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct TSTreeCursor {
pub context: [u32; 2usize],
pub id: *const ::std::os::raw::c_void,
pub tree: *const ::std::os::raw::c_void,
}
extern "C" {
pub fn ts_parser_new() -> *mut TSParser;
}
extern "C" {
pub fn ts_parser_delete(arg1: *mut TSParser);
}
extern "C" {
pub fn ts_parser_language(arg1: *const TSParser) -> *const TSLanguage;
}
extern "C" {
pub fn ts_parser_set_language(arg1: *mut TSParser, arg2: *const TSLanguage) -> bool;
}
extern "C" {
pub fn ts_parser_logger(arg1: *const TSParser) -> TSLogger;
}
extern "C" {
pub fn ts_parser_set_logger(arg1: *mut TSParser, arg2: TSLogger);
}
extern "C" {
pub fn ts_parser_print_dot_graphs(arg1: *mut TSParser, arg2: *mut FILE);
}
extern "C" {
pub fn ts_parser_halt_on_error(arg1: *mut TSParser, arg2: bool);
}
extern "C" {
pub fn ts_parser_parse(arg1: *mut TSParser, arg2: *const TSTree, arg3: TSInput) -> *mut TSTree;
}
extern "C" {
pub fn ts_parser_parse_string(
arg1: *mut TSParser,
arg2: *const TSTree,
arg3: *const ::std::os::raw::c_char,
arg4: u32,
) -> *mut TSTree;
}
extern "C" {
pub fn ts_parser_enabled(arg1: *const TSParser) -> bool;
}
extern "C" {
pub fn ts_parser_set_enabled(arg1: *mut TSParser, arg2: bool);
}
extern "C" {
pub fn ts_parser_operation_limit(arg1: *const TSParser) -> usize;
}
extern "C" {
pub fn ts_parser_set_operation_limit(arg1: *mut TSParser, arg2: usize);
}
extern "C" {
pub fn ts_parser_reset(arg1: *mut TSParser);
}
extern "C" {
pub fn ts_parser_set_included_ranges(arg1: *mut TSParser, arg2: *const TSRange, arg3: u32);
}
extern "C" {
pub fn ts_parser_included_ranges(arg1: *const TSParser, arg2: *mut u32) -> *const TSRange;
}
extern "C" {
pub fn ts_tree_copy(arg1: *const TSTree) -> *mut TSTree;
}
extern "C" {
pub fn ts_tree_delete(arg1: *mut TSTree);
}
extern "C" {
pub fn ts_tree_root_node(arg1: *const TSTree) -> TSNode;
}
extern "C" {
pub fn ts_tree_edit(arg1: *mut TSTree, arg2: *const TSInputEdit);
}
extern "C" {
pub fn ts_tree_get_changed_ranges(
arg1: *const TSTree,
arg2: *const TSTree,
arg3: *mut u32,
) -> *mut TSRange;
}
extern "C" {
pub fn ts_tree_print_dot_graph(arg1: *const TSTree, arg2: *mut FILE);
}
extern "C" {
pub fn ts_tree_language(arg1: *const TSTree) -> *const TSLanguage;
}
extern "C" {
pub fn ts_node_start_byte(arg1: TSNode) -> u32;
}
extern "C" {
pub fn ts_node_start_point(arg1: TSNode) -> TSPoint;
}
extern "C" {
pub fn ts_node_end_byte(arg1: TSNode) -> u32;
}
extern "C" {
pub fn ts_node_end_point(arg1: TSNode) -> TSPoint;
}
extern "C" {
pub fn ts_node_symbol(arg1: TSNode) -> TSSymbol;
}
extern "C" {
pub fn ts_node_type(arg1: TSNode) -> *const ::std::os::raw::c_char;
}
extern "C" {
pub fn ts_node_string(arg1: TSNode) -> *mut ::std::os::raw::c_char;
}
extern "C" {
pub fn ts_node_eq(arg1: TSNode, arg2: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_is_null(arg1: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_is_named(arg1: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_is_missing(arg1: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_has_changes(arg1: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_has_error(arg1: TSNode) -> bool;
}
extern "C" {
pub fn ts_node_parent(arg1: TSNode) -> TSNode;
}
extern "C" {
pub fn ts_node_child(arg1: TSNode, arg2: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_named_child(arg1: TSNode, arg2: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_child_count(arg1: TSNode) -> u32;
}
extern "C" {
pub fn ts_node_named_child_count(arg1: TSNode) -> u32;
}
extern "C" {
pub fn ts_node_next_sibling(arg1: TSNode) -> TSNode;
}
extern "C" {
pub fn ts_node_next_named_sibling(arg1: TSNode) -> TSNode;
}
extern "C" {
pub fn ts_node_prev_sibling(arg1: TSNode) -> TSNode;
}
extern "C" {
pub fn ts_node_prev_named_sibling(arg1: TSNode) -> TSNode;
}
extern "C" {
pub fn ts_node_first_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_first_named_child_for_byte(arg1: TSNode, arg2: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_named_descendant_for_byte_range(arg1: TSNode, arg2: u32, arg3: u32) -> TSNode;
}
extern "C" {
pub fn ts_node_descendant_for_point_range(arg1: TSNode, arg2: TSPoint, arg3: TSPoint)
-> TSNode;
}
extern "C" {
pub fn ts_node_named_descendant_for_point_range(
arg1: TSNode,
arg2: TSPoint,
arg3: TSPoint,
) -> TSNode;
}
extern "C" {
pub fn ts_node_edit(arg1: *mut TSNode, arg2: *const TSInputEdit);
}
extern "C" {
pub fn ts_tree_cursor_new(arg1: TSNode) -> TSTreeCursor;
}
extern "C" {
pub fn ts_tree_cursor_delete(arg1: *mut TSTreeCursor);
}
extern "C" {
pub fn ts_tree_cursor_goto_first_child(arg1: *mut TSTreeCursor) -> bool;
}
extern "C" {
pub fn ts_tree_cursor_goto_first_child_for_byte(arg1: *mut TSTreeCursor, arg2: u32) -> i64;
}
extern "C" {
pub fn ts_tree_cursor_goto_next_sibling(arg1: *mut TSTreeCursor) -> bool;
}
extern "C" {
pub fn ts_tree_cursor_goto_parent(arg1: *mut TSTreeCursor) -> bool;
}
extern "C" {
pub fn ts_tree_cursor_current_node(arg1: *const TSTreeCursor) -> TSNode;
}
extern "C" {
pub fn ts_language_symbol_count(arg1: *const TSLanguage) -> u32;
}
extern "C" {
pub fn ts_language_symbol_name(
arg1: *const TSLanguage,
arg2: TSSymbol,
) -> *const ::std::os::raw::c_char;
}
extern "C" {
pub fn ts_language_symbol_for_name(
arg1: *const TSLanguage,
arg2: *const ::std::os::raw::c_char,
) -> TSSymbol;
}
extern "C" {
pub fn ts_language_symbol_type(arg1: *const TSLanguage, arg2: TSSymbol) -> TSSymbolType;
}
extern "C" {
pub fn ts_language_version(arg1: *const TSLanguage) -> u32;
}
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 9;

4
lib/binding/ffi.rs Normal file
View file

@ -0,0 +1,4 @@
#![allow(dead_code)]
#![allow(non_upper_case_globals)]
include!("./bindings.rs");

1349
lib/binding/lib.rs Normal file

File diff suppressed because it is too large Load diff

26
lib/build.rs Normal file
View file

@ -0,0 +1,26 @@
extern crate cc;
use std::env;
use std::path::PathBuf;
/// Build script: compiles the bundled C runtime (and, when the
/// RUST_TREE_SITTER_TEST environment variable is set, the Rust grammar
/// fixture that the tests rely on) into a static library named
/// `tree-sitter-runtime`.
fn main() {
    let runtime_src = PathBuf::from("src");
    let mut build = cc::Build::new();
    build
        .define("UTF8PROC_STATIC", "")
        .flag_if_supported("-std=c99")
        .flag_if_supported("-Wno-unused-parameter")
        .include("include")
        .include("utf8proc")
        .file(runtime_src.join("runtime.c"));
    // The fixture grammar is only compiled in when running this crate's own
    // test suite.
    if env::var("RUST_TREE_SITTER_TEST").is_ok() {
        let parser_dir: PathBuf = ["fixtures", "tree-sitter-rust", "src"].iter().collect();
        build
            .file(parser_dir.join("parser.c"))
            .file(parser_dir.join("scanner.c"));
    }
    build.compile("tree-sitter-runtime");
}

View file

@ -1,187 +0,0 @@
{
'targets': [
{
'target_name': 'compiler',
'type': 'static_library',
'include_dirs': [
'include',
'src',
'externals/utf8proc',
'externals/json-parser',
],
'sources': [
'src/compiler/build_tables/lex_item.cc',
'src/compiler/build_tables/lex_item_transitions.cc',
'src/compiler/build_tables/lex_table_builder.cc',
'src/compiler/build_tables/lookahead_set.cc',
'src/compiler/build_tables/parse_item.cc',
'src/compiler/build_tables/parse_item_set_builder.cc',
'src/compiler/build_tables/parse_table_builder.cc',
'src/compiler/build_tables/property_table_builder.cc',
'src/compiler/build_tables/rule_can_be_blank.cc',
'src/compiler/compile.cc',
'src/compiler/generate_code/c_code.cc',
'src/compiler/generate_code/property_table_json.cc',
'src/compiler/lex_table.cc',
'src/compiler/log.cc',
'src/compiler/parse_json.cc',
'src/compiler/parse_table.cc',
'src/compiler/precedence_range.cc',
'src/compiler/prepare_grammar/expand_repeats.cc',
'src/compiler/prepare_grammar/expand_tokens.cc',
'src/compiler/prepare_grammar/extract_choices.cc',
'src/compiler/prepare_grammar/extract_simple_aliases.cc',
'src/compiler/prepare_grammar/extract_tokens.cc',
'src/compiler/prepare_grammar/flatten_grammar.cc',
'src/compiler/prepare_grammar/intern_symbols.cc',
'src/compiler/prepare_grammar/normalize_rules.cc',
'src/compiler/prepare_grammar/parse_regex.cc',
'src/compiler/prepare_grammar/prepare_grammar.cc',
'src/compiler/prepare_grammar/token_description.cc',
'src/compiler/rule.cc',
'src/compiler/syntax_grammar.cc',
'src/compiler/rules/character_set.cc',
'src/compiler/rules/choice.cc',
'src/compiler/rules/metadata.cc',
'src/compiler/rules/repeat.cc',
'src/compiler/rules/seq.cc',
'src/compiler/util/string_helpers.cc',
'externals/utf8proc/utf8proc.c',
'externals/json-parser/json.c',
],
'cflags_cc': [
'-std=c++14',
],
'xcode_settings': {
'CLANG_CXX_LANGUAGE_STANDARD': 'c++14',
'GCC_ENABLE_CPP_EXCEPTIONS': 'NO',
},
'direct_dependent_settings': {
'include_dirs': [
'include'
],
},
'conditions': [
# For 64-bit builds on appveyor, we need to explicitly tell gyp
# to generate an x64 target in the MSVS project file.
['"<!(echo %PLATFORM%)" == "x64"', {
'msvs_configuration_platform': 'x64',
}],
# Mac OS has an old version of libstdc++ that doesn't support c++11.
# libc++ is only present on 10.7 and later.
['OS == "mac"', {
'cflags_cc': [ '-stdlib=libc++' ],
'xcode_settings': {
'CLANG_CXX_LIBRARY': 'libc++',
'MACOSX_DEPLOYMENT_TARGET': '10.7',
},
'direct_dependent_settings': {
'cflags_cc': [ '-stdlib=libc++' ],
'xcode_settings': {
'CLANG_CXX_LIBRARY': 'libc++',
},
},
}]
],
},
{
'target_name': 'runtime',
'type': 'static_library',
'include_dirs': [
'include',
'src',
'externals/utf8proc',
],
'sources': [
'src/runtime/get_changed_ranges.c',
'src/runtime/language.c',
'src/runtime/lexer.c',
'src/runtime/node.c',
'src/runtime/stack.c',
'src/runtime/parser.c',
'src/runtime/subtree.c',
'src/runtime/tree.c',
'src/runtime/tree_cursor.c',
'src/runtime/utf16.c',
'externals/utf8proc/utf8proc.c',
],
'cflags_c': [
'-std=c99', '-U_FORTIFY_SOURCE', '-D_FORTIFY_SOURCE=1'
],
'ldflags': [
'-g',
],
'direct_dependent_settings': {
'include_dirs': [
'include'
],
},
'conditions': [
# For 64-bit builds on appveyor, we need to explicitly tell gyp
# to generate an x64 target in the MSVS project file.
['"<!(echo %PLATFORM%)" == "x64"', {
'msvs_configuration_platform': 'x64',
}],
],
},
],
'target_defaults': {
'default_configuration': 'Release',
'configurations': {
'Debug': {
'cflags': [ '-g' ],
'ldflags': [ '-g' ],
'xcode_settings': {
'ARCHS': ['x86_64'],
'OTHER_LDFLAGS': ['-g'],
'GCC_OPTIMIZATION_LEVEL': '0',
},
},
'Test': {
'defines': ['TREE_SITTER_TEST=true'],
'cflags': [ '-g' ],
'ldflags': [ '-g' ],
'xcode_settings': {
'ARCHS': ['x86_64'],
'OTHER_LDFLAGS': ['-g'],
'GCC_OPTIMIZATION_LEVEL': '0',
'OTHER_CPLUSPLUSFLAGS': ['-fsanitize=address'],
},
},
'Fuzz': {
'cflags': ['<!@(echo $CFLAGS)'],
'ldflags': ['<!@(echo $CFLAGS)'],
},
'Release': {
'cflags': [ '-O2', '-fno-strict-aliasing' ],
'cflags!': [ '-O3', '-fstrict-aliasing' ],
'xcode_settings': {
'ARCHS': ['x86_64'],
},
},
},
'cflags': [
'-Wall',
'-Wextra',
'-Wno-unused-parameter'
],
'defines': ['UTF8PROC_STATIC'],
'xcode_settings': {
'ALWAYS_SEARCH_USER_PATHS': 'NO',
'WARNING_CFLAGS': [
'-Wall',
'-Wextra',
'-Wno-unused-parameter'
],
}
}
}

17
script/bindgen.sh Executable file
View file

@ -0,0 +1,17 @@
#!/bin/bash

# Regenerate the Rust FFI declarations from the tree-sitter runtime header
# using rust-bindgen, then append the language-version constant (a C
# #define, which bindgen does not emit as a Rust constant).

output_path=src/bindings.rs
header_path='vendor/tree-sitter/include/tree_sitter/runtime.h'

bindgen \
  --no-layout-tests \
  --whitelist-type '^TS.*' \
  --whitelist-function '^ts_.*' \
  --opaque-type FILE \
  --distrust-clang-mangling \
  "$header_path" > "$output_path"

echo "" >> "$output_path"

# Extract the numeric value of TREE_SITTER_LANGUAGE_VERSION from the header
# and re-emit it as a Rust constant. (`grep -E` replaces the deprecated
# `egrep` spelling.)
version_constant='TREE_SITTER_LANGUAGE_VERSION'
version_number=$(grep -E "#define $version_constant (.*)" "$header_path" | cut -d' ' -f3)
echo "pub const $version_constant: usize = $version_number;" >> "$output_path"

16
script/fetch-test-fixtures.cmd Executable file
View file

@ -0,0 +1,16 @@
@Echo off
SETLOCAL
:: Clone (once) and update the tree-sitter-rust grammar that the test suite
:: uses as a fixture, pinned to the tip of origin/master.
Set grammar_dir=fixtures\tree-sitter-rust
Set grammar_url=https://github.com/tree-sitter/tree-sitter-rust
:: Shallow-clone only on the first run.
@IF NOT EXIST %grammar_dir% (
  git clone %grammar_url% %grammar_dir% --depth=1
)
:: Bring an existing checkout up to date with upstream master.
pushd %grammar_dir%
git fetch origin master --depth=1
git reset --hard origin/master
popd
ENDLOCAL

14
script/fetch-test-fixtures.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/bash

# Clone (once) and update the tree-sitter-rust grammar that the test suite
# uses as a fixture, pinned to the tip of origin/master.

grammar_dir='fixtures/tree-sitter-rust'
grammar_url='https://github.com/tree-sitter/tree-sitter-rust'

# Shallow-clone only on the first run. Expansions are quoted so the script
# stays correct even if the paths ever contain spaces.
if [ ! -d "$grammar_dir" ]; then
  git clone "$grammar_url" "$grammar_dir" --depth=1
fi

# Bring an existing checkout up to date with upstream master. The subshell
# keeps the cd from leaking into the caller's working directory.
(
  cd "$grammar_dir"
  git fetch origin master --depth=1
  git reset --hard origin/master
)

3
script/test.sh Executable file
View file

@ -0,0 +1,3 @@
#!/bin/bash

# Run the crate's test suite. RUST_TREE_SITTER_TEST makes build.rs compile
# the fixture grammar that the tests depend on. Any extra arguments are
# forwarded to `cargo test`; quoting "$@" preserves arguments that contain
# spaces (the original unquoted $@ would split them).
RUST_TREE_SITTER_TEST=1 cargo test "$@"

View file

@ -1,137 +0,0 @@
#include "compiler/build_tables/lex_item.h"
#include <unordered_set>
#include "compiler/build_tables/lex_item_transitions.h"
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rule.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::string;
using std::unordered_set;
using rules::CharacterSet;
using rules::Symbol;
using rules::Metadata;
// A lex item pairs the token being lexed (lhs) with the portion of that
// token's rule that remains to be matched.
LexItem::LexItem(const rules::Symbol &lhs, const rules::Rule &rule)
    : lhs(lhs), rule(rule) {}
// Two items are equal when they denote the same token and the same
// remaining rule.
bool LexItem::operator==(const LexItem &other) const {
  return lhs == other.lhs && rule == other.rule;
}
using CompletionStatus = LexItem::CompletionStatus;
// Determine whether the given rule can match the empty string (i.e. the lex
// item is "complete"), and if so, with what precedence range.
static CompletionStatus get_completion_status(const rules::Rule &rule) {
  return rule.match(
    // A choice is complete if any of its alternatives is complete.
    [](rules::Choice choice) {
      for (const auto &element : choice.elements) {
        auto status = get_completion_status(element);
        if (status.is_done) return status;
      }
      return CompletionStatus{false, PrecedenceRange()};
    },
    // Metadata wraps another rule: propagate the inner status, attaching the
    // wrapper's precedence when none has been recorded yet.
    [](rules::Metadata metadata) {
      CompletionStatus result = get_completion_status(*metadata.rule);
      if (result.is_done && result.precedence.empty && metadata.params.has_precedence) {
        result.precedence.add(metadata.params.precedence);
      }
      return result;
    },
    // A repetition's status is the status of its content.
    [](rules::Repeat repeat) {
      return get_completion_status(*repeat.rule);
    },
    // A sequence is complete only if both sides are complete; the right
    // side's status wins.
    [](rules::Seq sequence) {
      CompletionStatus left_status = get_completion_status(*sequence.left);
      if (left_status.is_done) {
        return get_completion_status(*sequence.right);
      } else {
        return CompletionStatus{false, PrecedenceRange()};
      }
    },
    // Blank trivially matches the empty string.
    [](rules::Blank blank) {
      return CompletionStatus{true, PrecedenceRange()};
    },
    // A character set still requires input.
    [](rules::CharacterSet) {
      return CompletionStatus{false, PrecedenceRange()};
    },
    // Any other rule kind is not complete.
    [](auto) {
      return CompletionStatus{false, PrecedenceRange()};
    }
  );
}
// Whether this item's remaining rule can match the empty string, and with
// what precedence it would do so.
LexItem::CompletionStatus LexItem::completion_status() const {
  return get_completion_status(rule);
}
LexItemSet::LexItemSet() {}
LexItemSet::LexItemSet(const unordered_set<LexItem> &entries)
    : entries(entries) {}
// Item sets compare by their (unordered) entries.
bool LexItemSet::operator==(const LexItemSet &other) const {
  return entries == other.entries;
}
// True when this item belongs to separator content (whitespace, comments)
// rather than to a main token.
bool LexItem::is_in_separators() const {
  return rule.is<Metadata>() &&
         !rule.get_unchecked<Metadata>().params.is_main_token;
}
// True if any item in this set is part of separator content.
bool LexItemSet::has_items_in_separators() const {
  for (const LexItem &item : entries) {
    if (item.is_in_separators()) return true;
  }
  return false;
}
// Compute this set's outgoing transitions, merging the transitions of every
// item it contains, keyed by character set.
LexItemSet::TransitionMap LexItemSet::transitions() const {
  TransitionMap result;
  for (const LexItem &item : entries) {
    lex_item_transitions(&result, item);
  }
  return result;
}
// Transitions compare by destination, precedence, and whether they occur
// inside a main token.
bool LexItemSet::Transition::operator==(const LexItemSet::Transition &other) const {
  return destination == other.destination && precedence == other.precedence &&
         in_main_token == other.in_main_token;
}
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::util::hash_combine;
using tree_sitter::util::symmetric_hash_combine;
using tree_sitter::build_tables::LexItem;
using tree_sitter::build_tables::LexItemSet;
// Hash of an item: combines the token's index with the hash of its rule.
size_t hash<LexItem>::operator()(const LexItem &item) const {
  size_t result = 0;
  hash_combine(&result, item.lhs.index);
  hash_combine(&result, item.rule);
  return result;
}
// Hash of an item set. The entries live in an unordered container, so an
// order-independent (symmetric) combine is used for them.
size_t hash<LexItemSet>::operator()(const LexItemSet &item_set) const {
  size_t result = 0;
  hash_combine(&result, item_set.entries.size());
  for (const auto &item : item_set.entries)
    symmetric_hash_combine(&result, item);
  return result;
}
} // namespace std

View file

@ -1,81 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_LEX_ITEM_H_
#define COMPILER_BUILD_TABLES_LEX_ITEM_H_
#include <unordered_set>
#include <map>
#include <utility>
#include <string>
#include "compiler/rule.h"
#include "compiler/precedence_range.h"
namespace tree_sitter {
namespace build_tables {
// An item in the lexer DFA construction: a token symbol (lhs) paired with
// the portion of that token's rule that remains to be matched.
class LexItem {
 public:
  LexItem(const rules::Symbol &, const rules::Rule &);
  // Whether the item's rule can match the empty string, and with what
  // precedence range.
  struct CompletionStatus {
    bool is_done;
    PrecedenceRange precedence;
  };
  bool operator==(const LexItem &other) const;
  CompletionStatus completion_status() const;
  // True when the item lies inside separator content rather than a main token.
  bool is_in_separators() const;
  rules::Symbol lhs;
  rules::Rule rule;
};
} // namespace build_tables
} // namespace tree_sitter
namespace std {
// Allow LexItem to be stored in unordered containers.
template <>
struct hash<tree_sitter::build_tables::LexItem> {
  size_t operator()(const tree_sitter::build_tables::LexItem &) const;
};
} // namespace std
namespace tree_sitter {
namespace build_tables {
// A set of lex items, corresponding to one state of the lexer DFA being
// constructed.
class LexItemSet {
 public:
  LexItemSet();
  explicit LexItemSet(const std::unordered_set<LexItem> &);
  struct Transition;
  // Maps each (disjoint) character set to the transition taken on it.
  typedef std::map<rules::CharacterSet, Transition> TransitionMap;
  bool operator==(const LexItemSet &) const;
  TransitionMap transitions() const;
  bool has_items_in_separators() const;
  std::unordered_set<LexItem> entries;
};
// One DFA transition: the destination item set, the precedence of the
// consumed characters, and whether consumption happens inside a main token.
struct LexItemSet::Transition {
  LexItemSet destination;
  PrecedenceRange precedence;
  bool in_main_token;
  bool operator==(const LexItemSet::Transition &) const;
};
} // namespace build_tables
} // namespace tree_sitter
namespace std {
// Allow LexItemSet to be stored in unordered containers.
template <>
struct hash<tree_sitter::build_tables::LexItemSet> {
  size_t operator()(const tree_sitter::build_tables::LexItemSet &) const;
};
} // namespace std
#endif // COMPILER_BUILD_TABLES_LEX_ITEM_H_

View file

@ -1,195 +0,0 @@
#include "compiler/build_tables/lex_item_transitions.h"
#include <map>
#include <vector>
#include <functional>
#include <utility>
#include "compiler/build_tables/rule_can_be_blank.h"
#include "compiler/rule.h"
#include "compiler/build_tables/lex_item.h"
namespace tree_sitter {
namespace build_tables {
using std::function;
using std::map;
using std::move;
using std::pair;
using std::vector;
using rules::CharacterSet;
using rules::Rule;
using Transition = LexItemSet::Transition;
using TransitionMap = LexItemSet::TransitionMap;
// Builds the transition map for a lex item by structurally walking its rule.
// Tracks the active precedence stack and whether the walk is currently
// inside a main token.
class TransitionBuilder {
  TransitionMap *transitions;
  const rules::Symbol &item_lhs;
  vector<int> *precedence_stack;
  bool in_main_token;
  // Return a copy of `transition` whose destination items have had their
  // rules rewritten by `callback`.
  inline Transition transform_transition(const Transition &transition,
                                         const function<Rule(const Rule &)> &callback) {
    LexItemSet destination;
    for (const LexItem &item : transition.destination.entries) {
      destination.entries.insert(LexItem(item.lhs, callback(item.rule)));
    }
    return Transition{destination, transition.precedence, transition.in_main_token};
  }
  // Insert a transition on `new_characters`, splitting any existing entries
  // whose character sets overlap so that the map's keys remain disjoint.
  // Overlapping regions get the union of both transitions' destinations and
  // precedences.
  void add_transition(TransitionMap *transitions, CharacterSet new_characters,
                      Transition new_transition) {
    vector<pair<CharacterSet, Transition>> new_entries;
    auto iter = transitions->begin();
    while (iter != transitions->end()) {
      CharacterSet existing_characters = iter->first;
      Transition &existing_transition = iter->second;
      // remove_set both computes the intersection and removes it from
      // existing_characters.
      CharacterSet intersecting_characters =
        existing_characters.remove_set(new_characters);
      if (intersecting_characters.is_empty()) {
        iter++;
        continue;
      }
      new_characters.remove_set(intersecting_characters);
      // Keep the non-overlapping remainder of the existing entry as-is.
      if (!existing_characters.is_empty())
        new_entries.push_back({
          existing_characters, existing_transition,
        });
      // For the overlap, merge the new transition into the existing one.
      existing_transition.destination.entries.insert(
        new_transition.destination.entries.begin(),
        new_transition.destination.entries.end());
      existing_transition.precedence.add(new_transition.precedence);
      existing_transition.in_main_token |= new_transition.in_main_token;
      new_entries.push_back({
        intersecting_characters, existing_transition,
      });
      transitions->erase(iter++);
    }
    transitions->insert(new_entries.begin(), new_entries.end());
    // Whatever part of the new character set overlapped nothing gets its
    // own fresh entry.
    if (!new_characters.is_empty())
      transitions->insert({ new_characters, new_transition });
  }
 public:
  // Add the transitions produced by `rule` to the builder's map.
  void apply(const Rule &rule) {
    rule.match(
      // Blank consumes nothing: no transitions.
      [](const rules::Blank &) {},
      // A character set produces one transition whose destination item has
      // finished its rule (Blank remains).
      [this](const rules::CharacterSet &character_set) {
        PrecedenceRange precedence;
        if (!precedence_stack->empty()) {
          precedence.add(precedence_stack->back());
        }
        add_transition(
          transitions,
          character_set,
          Transition{
            LexItemSet({ LexItem(item_lhs, rules::Blank{}) }),
            precedence,
            in_main_token,
          }
        );
      },
      // A choice's transitions are the union of its alternatives'.
      [this](const rules::Choice &choice) {
        for (const auto &element : choice.elements) {
          apply(element);
        }
      },
      // For a sequence, each left-side transition continues into the right
      // side; if the left side can be blank, the right side's transitions
      // also apply directly.
      [this](const rules::Seq &sequence) {
        TransitionMap left_transitions;
        TransitionBuilder(&left_transitions, this).apply(*sequence.left);
        for (const auto &pair : left_transitions) {
          add_transition(
            transitions,
            pair.first,
            transform_transition(pair.second, [&sequence](Rule rule) -> Rule {
              return Rule::seq({rule, *sequence.right});
            })
          );
        }
        if (rule_can_be_blank(*sequence.left)) {
          apply(*sequence.right);
        }
      },
      // A repetition's content transitions occur both ending the repetition
      // and looping back into it.
      [this](const rules::Repeat &repeat) {
        TransitionMap content_transitions;
        TransitionBuilder(&content_transitions, this).apply(*repeat.rule);
        for (const auto &pair : content_transitions) {
          add_transition(transitions, pair.first, pair.second);
          add_transition(
            transitions, pair.first,
            transform_transition(pair.second, [&repeat](Rule item_rule) {
              return Rule::seq({ item_rule, repeat });
            })
          );
        }
      },
      // Metadata: push its precedence while walking the inner rule, mark
      // entry into a main token, and re-wrap the resulting items so the
      // metadata params survive into the destination rules.
      [this](const rules::Metadata &metadata) {
        bool has_active_precedence = metadata.params.is_active;
        if (has_active_precedence)
          precedence_stack->push_back(metadata.params.precedence);
        if (metadata.params.is_main_token)
          in_main_token = true;
        auto params = metadata.params;
        if (params.has_precedence)
          params.is_active = true;
        TransitionMap content_transitions;
        TransitionBuilder(&content_transitions, this).apply(*metadata.rule);
        for (const auto &pair : content_transitions) {
          add_transition(
            transitions, pair.first,
            transform_transition(pair.second, [&params](Rule rule) {
              return rules::Metadata::merge(move(rule), params);
            })
          );
        }
        if (has_active_precedence) {
          precedence_stack->pop_back();
        }
      },
      // Other rule kinds produce no transitions.
      [](auto) {}
    );
  }
  TransitionBuilder(TransitionMap *transitions, const rules::Symbol &item_lhs,
                    vector<int> *precedence_stack, bool in_main_token)
    : transitions(transitions),
      item_lhs(item_lhs),
      precedence_stack(precedence_stack),
      in_main_token(in_main_token) {}
  // Child builder: shares the parent's state but writes to a different map.
  TransitionBuilder(TransitionMap *transitions, TransitionBuilder *other)
    : transitions(transitions),
      item_lhs(other->item_lhs),
      precedence_stack(other->precedence_stack),
      in_main_token(other->in_main_token) {}
};
// Entry point: merge the transitions of `item` into `transitions`, starting
// with an empty precedence stack, outside of any main token.
void lex_item_transitions(TransitionMap *transitions, const LexItem &item) {
  vector<int> precedence_stack;
  TransitionBuilder(transitions, item.lhs, &precedence_stack, false).apply(item.rule);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,14 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_
#define COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_
#include "compiler/build_tables/lex_item.h"
namespace tree_sitter {
namespace build_tables {
// Merge the transitions of the given lex item into `transitions`, keyed by
// disjoint character sets.
void lex_item_transitions(LexItemSet::TransitionMap *transitions, const LexItem &);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LEX_ITEM_TRANSITIONS_H_

View file

@ -1,687 +0,0 @@
#include "compiler/build_tables/lex_table_builder.h"
#include <climits>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <cwctype>
#include <vector>
#include "compiler/build_tables/lex_item.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/lexical_grammar.h"
#include "compiler/log.h"
#include "compiler/parse_table.h"
#include "compiler/rule.h"
#include "utf8proc.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::move;
using std::pair;
using std::set;
using std::string;
using std::vector;
using std::unordered_map;
using std::unordered_set;
using std::unique_ptr;
using std::iswalpha;
using rules::Rule;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;
// True if tokens `a` and `b` can be valid lookaheads in the same parse state.
bool CoincidentTokenIndex::contains(Symbol a, Symbol b) const {
  return a == b || !states_with(a, b).empty();
}
// The set of parse states in which both `a` and `b` are valid lookaheads.
// Entries are keyed with the smaller symbol index first, so the pair is
// normalized before lookup. Returns a shared empty set when there are none.
const unordered_set<ParseStateId> &CoincidentTokenIndex::states_with(Symbol a, Symbol b) const {
  static const unordered_set<ParseStateId> NO_STATES;
  if (a.index > b.index) std::swap(a, b);
  auto iter = entries.find({a.index, b.index});
  if (iter == entries.end()) {
    return NO_STATES;
  } else {
    return iter->second;
  }
}
class LexTableBuilderImpl : public LexTableBuilder {
enum ConflictStatus {
DoesNotMatch = 0,
MatchesShorterStringWithinSeparators = 1 << 0,
MatchesSameString = 1 << 1,
MatchesLongerString = 1 << 2,
MatchesLongerStringWithValidNextChar = 1 << 3,
};
LexTable main_lex_table;
LexTable keyword_lex_table;
const LexicalGrammar grammar;
vector<Rule> separator_rules;
unordered_map<LexItemSet, LexStateId> main_lex_state_ids;
unordered_map<LexItemSet, LexStateId> keyword_lex_state_ids;
CharacterSet separator_start_characters;
vector<CharacterSet> starting_characters_by_token;
vector<CharacterSet> following_characters_by_token;
const CoincidentTokenIndex &coincident_token_index;
ParseTable *parse_table;
vector<ConflictStatus> conflict_matrix;
bool conflict_detection_mode;
LookaheadSet keyword_symbols;
Symbol word_token;
char encoding_buffer[8];
public:
  // Precompute per-token character sets (starting characters, following
  // characters) and the pairwise conflict statuses that later guide lex
  // state construction.
  LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar,
                      const LexicalGrammar &lexical_grammar,
                      const unordered_map<Symbol, LookaheadSet> &following_tokens_by_token,
                      const CoincidentTokenIndex &coincident_token_index,
                      ParseTable *parse_table)
    : grammar(lexical_grammar),
      starting_characters_by_token(lexical_grammar.variables.size()),
      following_characters_by_token(lexical_grammar.variables.size()),
      coincident_token_index(coincident_token_index),
      parse_table(parse_table),
      conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch),
      conflict_detection_mode(false),
      word_token(syntax_grammar.word_token) {
    // Compute the possible separator rules and the set of separator characters that can occur
    // immediately after any token.
    for (const auto &rule : grammar.separators) {
      separator_rules.push_back(Repeat{rule});
      add_starting_characters(&separator_start_characters, rule);
    }
    separator_rules.push_back(Blank{});
    // Compute the set of characters that each token can start with and the set of non-separator
    // characters that can follow each token. Also identify all of the tokens that can be
    // considered 'keywords'.
    LOG("characterizing tokens");
    for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) {
      Symbol token = Symbol::terminal(i);
      add_starting_characters(&starting_characters_by_token[i], grammar.variables[i].rule);
      const auto &following_tokens = following_tokens_by_token.find(token);
      if (following_tokens != following_tokens_by_token.end()) {
        following_tokens->second.for_each([&](Symbol following_token) {
          add_starting_characters(
            &following_characters_by_token[i],
            grammar.variables[following_token.index].rule
          );
          return true;
        });
      }
    }
    // For each pair of tokens, generate a lex table for just those two tokens and record what
    // conflicts arise.
    LOG_START("detecting conflicts between tokens");
    conflict_detection_mode = true;
    for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
      for (Symbol::Index j = 0; j < i; j++) {
        // Only token pairs whose starting characters (or the separator start
        // characters) overlap can conflict during lexing.
        if (starting_characters_by_token[i].intersects(starting_characters_by_token[j]) ||
            starting_characters_by_token[i].intersects(separator_start_characters) ||
            starting_characters_by_token[j].intersects(separator_start_characters)) {
          clear();
          add_lex_state(main_lex_table, item_set_for_terminals(LookaheadSet({
            Symbol::terminal(i),
            Symbol::terminal(j)
          }), true));
        }
      }
    }
    LOG_END();
    // Keyword detection only applies when the grammar designates a word token.
    if (word_token != rules::NONE()) identify_keywords();
  }
// Determine which tokens can be treated as keywords: word-like tokens that
// match exactly the same strings as `word_token`, filtered so that routing
// them through the keyword lex table cannot change parsing behavior. The
// surviving set is stored in `keyword_symbols`.
void identify_keywords() {
  LookaheadSet homonyms;

  // Collect candidates: tokens that match the same strings as the word token.
  for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
    Symbol other_token = Symbol::terminal(j);

    // For now, only consider tokens as 'keywords' if they start with letters or underscores.
    bool starts_with_letter = !starting_characters_by_token[j].includes_all;
    for (auto character : starting_characters_by_token[j].included_chars) {
      if (!iswalpha(character) && character != '_') {
        starts_with_letter = false;
        break;
      }
    }
    if (!starts_with_letter) continue;

    if (get_conflict_status(word_token, other_token) == MatchesSameString) {
      homonyms.insert(other_token);
    }
  }

  // Drop candidates that also match the same string as another candidate:
  // the keyword table could not tell them apart.
  homonyms.for_each([&](Symbol homonym1) {
    homonyms.for_each([&](Symbol homonym2) {
      if (get_conflict_status(homonym1, homonym2) & MatchesSameString) {
        LOG(
          "conflict between homonyms %s %s",
          token_name(homonym1).c_str(),
          token_name(homonym2).c_str()
        );
        homonyms.remove(homonym1);
      }
      return false;
    });
    return true;
  });

  // Drop candidates whose shadowing relationship with some other token
  // differs from the word token's, unless every parse state containing both
  // already expects the word token (then substitution changes nothing).
  for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
    Symbol other_token = Symbol::terminal(j);
    if (other_token == word_token || homonyms.contains(other_token)) continue;
    bool word_rule_shadows_other = get_conflict_status(other_token, word_token);
    bool other_shadows_word_rule = get_conflict_status(word_token, other_token);
    if (word_rule_shadows_other || other_shadows_word_rule) {
      homonyms.for_each([&](Symbol homonym) {
        bool word_rule_was_already_present = true;
        for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
          if (!parse_table->states[state_id].has_terminal_entry(word_token)) {
            word_rule_was_already_present = false;
            break;
          }
        }
        if (word_rule_was_already_present) return true;

        bool homonym_shadows_other = get_conflict_status(other_token, homonym);
        bool other_shadows_homonym = get_conflict_status(homonym, other_token);
        if (word_rule_shadows_other != homonym_shadows_other) {
          homonyms.remove(homonym);
          LOG(
            "remove %s because word_token would shadow %s",
            token_name(homonym).c_str(),
            token_name(other_token).c_str()
          );
        } else if (other_shadows_word_rule != other_shadows_homonym) {
          homonyms.remove(homonym);
          LOG(
            "remove %s because %s would shadow word_token",
            token_name(homonym).c_str(),
            token_name(other_token).c_str()
          );
        }
        return true;
      });
    }
  }

  if (!homonyms.empty()) {
    LOG_START("found keywords:");
    homonyms.for_each([&](Symbol homonym) {
      LOG("%s", token_name(homonym).c_str());
      return true;
    });
    LOG_END();
    keyword_symbols = homonyms;
  }
}
// Build the final lex tables. Parse states whose expected-token sets can be
// merged without lexical ambiguity share a single lex state.
BuildResult build() {
  clear();
  conflict_detection_mode = false;

  // Group parse states by their token sets, merging compatible sets.
  // Keyword tokens are represented by the word token in these sets.
  vector<pair<LookaheadSet, vector<ParseState *>>> starting_token_sets;
  for (ParseState &parse_state : parse_table->states) {
    LookaheadSet token_set;
    for (auto &entry : parse_state.terminal_entries) {
      if (word_token.is_terminal() && keyword_symbols.contains(entry.first)) {
        token_set.insert(word_token);
      } else {
        token_set.insert(entry.first);
      }
    }

    bool did_merge = false;
    for (auto &pair : starting_token_sets) {
      if (merge_token_set(&pair.first, token_set)) {
        did_merge = true;
        pair.second.push_back(&parse_state);
        break;
      }
    }
    if (!did_merge) starting_token_sets.push_back({token_set, {&parse_state}});
  }

  // Generate one lex state per merged set and point its parse states at it.
  for (auto &pair : starting_token_sets) {
    LexStateId state_id = add_lex_state(main_lex_table, item_set_for_terminals(pair.first, true));
    for (ParseState *parse_state : pair.second) {
      parse_state->lex_state_id = state_id;
    }
  }

  // The keyword table lexes the keyword symbols without separators.
  add_lex_state(keyword_lex_table, item_set_for_terminals(keyword_symbols, false));
  mark_fragile_tokens();
  remove_duplicate_lex_states(main_lex_table);

  return {main_lex_table, keyword_lex_table, word_token};
}
// True when lexing `token` can consume characters that would otherwise have
// formed `shadowed_token`. Keyword/word-token pairs are exempt because the
// keyword lex table disambiguates them after the fact.
bool does_token_shadow_other(Symbol token, Symbol shadowed_token) const {
  bool token_is_wordlike = keyword_symbols.contains(token) || token == word_token;
  if (token_is_wordlike && keyword_symbols.contains(shadowed_token)) {
    return false;
  }
  ConflictStatus status = get_conflict_status(shadowed_token, token);
  return (status & (MatchesShorterStringWithinSeparators |
                    MatchesLongerStringWithValidNextChar)) != 0;
}
// True when `token` can match exactly the same string as `shadowed_token`.
// A keyword matching the word token's string is expected, not a conflict.
bool does_token_match_same_string_as_other(Symbol token, Symbol shadowed_token) const {
  bool keyword_vs_word = shadowed_token == word_token && keyword_symbols.contains(token);
  if (keyword_vs_word) return false;
  return (get_conflict_status(shadowed_token, token) & MatchesSameString) != 0;
}
private:
// Look up the recorded conflict flags for an ordered pair of tokens.
// Built-in or non-terminal symbols never participate in lexical conflicts.
ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const {
  bool comparable =
    shadowed_token.is_terminal() && other_token.is_terminal() &&
    !shadowed_token.is_built_in() && !other_token.is_built_in();
  if (!comparable) return DoesNotMatch;
  unsigned row_width = grammar.variables.size();
  return conflict_matrix[shadowed_token.index * row_width + other_token.index];
}
// Merge `status` into the conflict-matrix entry for this ordered pair.
// Returns true only when a flag was newly recorded (used to gate logging).
// No-op outside of conflict-detection mode.
bool record_conflict(Symbol shadowed_token, Symbol other_token, ConflictStatus status) {
  if (!conflict_detection_mode) return false;
  unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index;
  ConflictStatus previous = conflict_matrix[index];
  conflict_matrix[index] = static_cast<ConflictStatus>(previous | status);
  return (previous & status) == 0;
}
// Return the id of the lex state for `item_set` within `lex_table`, creating
// the state (with its accept and advance actions) on first sight.
LexStateId add_lex_state(LexTable &lex_table, const LexItemSet &item_set) {
  bool is_main_table = &lex_table == &main_lex_table;
  auto &lex_state_ids = is_main_table ? main_lex_state_ids : keyword_lex_state_ids;

  auto existing = lex_state_ids.find(item_set);
  if (existing != lex_state_ids.end()) {
    return existing->second;
  }

  LexStateId state_id = lex_table.states.size();
  lex_table.states.push_back(LexState());
  lex_state_ids[item_set] = state_id;
  add_accept_token_actions(lex_table, item_set, state_id);
  add_advance_actions(lex_table, item_set, state_id);
  return state_id;
}
// For each character set that transitions out of `item_set`, decide whether
// to advance over those characters or stop at an already-completed token,
// and (in conflict-detection mode) record any shadowing that decision causes.
void add_advance_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) {
  for (const auto &pair : item_set.transitions()) {
    const CharacterSet &characters = pair.first;
    const LexItemSet::Transition &transition = pair.second;
    AdvanceAction action(-1, transition.precedence, transition.in_main_token);

    AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
    if (accept_action.is_present()) {
      // Advancing wins ties; the completed token only wins with strictly
      // higher precedence.
      bool prefer_advancing = action.precedence_range.max >= accept_action.precedence;

      if (conflict_detection_mode) {
        bool next_item_set_can_yield_this_token = false;
        for (const LexItem &item : transition.destination.entries) {
          if (item.lhs == accept_action.symbol) {
            next_item_set_can_yield_this_token = true;
          } else if (!prefer_advancing && item_set.has_items_in_separators()) {
            // Stopping here cuts off a longer match begun inside a separator.
            record_conflict(item.lhs, accept_action.symbol, MatchesShorterStringWithinSeparators);
          }
        }

        if (prefer_advancing && !next_item_set_can_yield_this_token) {
          auto advance_symbol = transition.destination.entries.begin()->lhs;
          // The shadowing only matters if the advanced-over characters could
          // legitimately follow the accepted token (directly or via a
          // separator).
          auto &following_chars = following_characters_by_token[accept_action.symbol.index];
          CharacterSet conflicting_following_chars = characters.intersection(following_chars);
          if (conflicting_following_chars.is_empty()) {
            conflicting_following_chars = characters.intersection(separator_start_characters);
          }
          if (conflicting_following_chars.is_empty()) {
            record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString);
          } else {
            if (record_conflict(
              accept_action.symbol,
              advance_symbol,
              MatchesLongerStringWithValidNextChar
            )) {
              if (!conflicting_following_chars.included_chars.empty()) {
                LOG(
                  "%s shadows %s followed by '%s'",
                  token_name(advance_symbol).c_str(),
                  token_name(accept_action.symbol).c_str(),
                  log_char(*conflicting_following_chars.included_chars.begin())
                );
              }
            }
          }
        }
      }

      if (!prefer_advancing) continue;
    }

    action.state_index = add_lex_state(lex_table, transition.destination);
    lex_table.states[state_id].advance_actions[characters] = action;
  }
}
// For every completed item in `item_set`, install an accept action. When
// multiple tokens complete at once, keep the winner per
// should_replace_accept_action and record the same-string conflict.
void add_accept_token_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) {
  for (const LexItem &item : item_set.entries) {
    LexItem::CompletionStatus completion_status = item.completion_status();
    if (completion_status.is_done) {
      AcceptTokenAction action(item.lhs, completion_status.precedence.max);

      // String literals and immediate tokens get implicit precedence boosts.
      if (!item.lhs.is_built_in()) {
        const LexicalVariable &variable = grammar.variables[item.lhs.index];
        if (variable.is_string) action.implicit_precedence += 2;
        if (is_immediate_token(variable.rule)) action.implicit_precedence += 1;
      }

      AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
      if (existing_action.is_present()) {
        if (should_replace_accept_action(existing_action, action)) {
          if (record_conflict(existing_action.symbol, action.symbol, MatchesSameString)) {
            LOG(
              "%s shadows %s - same length",
              token_name(action.symbol).c_str(),
              token_name(existing_action.symbol).c_str()
            );
          }
        } else {
          if (record_conflict(action.symbol, existing_action.symbol, MatchesSameString)) {
            LOG(
              "%s shadows %s - same length",
              token_name(existing_action.symbol).c_str(),
              token_name(action.symbol).c_str()
            );
          }
          // Keep the existing action.
          continue;
        }
      }

      lex_table.states[state_id].accept_action = action;
    }
  }
}
void mark_fragile_tokens() {
for (ParseState &state : parse_table->states) {
for (auto &entry : state.terminal_entries) {
Symbol token = entry.first;
if (token.is_external() || token.is_built_in()) continue;
for (unsigned i = 0; i < grammar.variables.size(); i++) {
Symbol other_token = Symbol::terminal(i);
ConflictStatus status = get_conflict_status(token, other_token);
if (status != ConflictStatus::DoesNotMatch &&
state.terminal_entries.count(other_token)) {
entry.second.reusable = false;
break;
}
}
}
}
}
// Try to merge `right` into `left`. The merge succeeds only if, for every
// symbol present in one set but not the other, no symbol of the other set
// could be lexically confused with it and the two never appear as lookaheads
// in the same parse state. On success, `left` becomes the union.
bool merge_token_set(LookaheadSet *left, const LookaheadSet &right) const {
  // Conflict statuses that make two tokens indistinguishable to the lexer.
  auto CannotDistinguish = (
    MatchesShorterStringWithinSeparators |
    MatchesSameString |
    MatchesLongerStringWithValidNextChar
  );

  bool is_compatible = true;
  left->for_each_difference(right, [&](bool in_left, Symbol different_symbol) {
    if (!different_symbol.is_external() && !different_symbol.is_built_in()) {
      // Compare the differing symbol against the set that lacks it.
      const LookaheadSet &existing_set = in_left ? right : *left;
      existing_set.for_each([&](Symbol existing_symbol) {
        if ((get_conflict_status(existing_symbol, different_symbol) & CannotDistinguish) ||
            !coincident_token_index.contains(different_symbol, existing_symbol)) {
          is_compatible = false;
          return false;  // stop scanning the existing set
        }
        return true;
      });
      if (!is_compatible) return false;  // stop scanning the differences
    }
    return true;
  });

  if (is_compatible) left->insert_all(right);
  return is_compatible;
}
// Collapse identical lex states, iterating until a fixed point is reached,
// then retarget advance actions and the parse states' lex_state_id fields.
void remove_duplicate_lex_states(LexTable &lex_table) {
  // Precedence values only matter during construction; zero them so that
  // otherwise-identical states compare equal.
  for (LexState &state : lex_table.states) {
    state.accept_action.precedence = 0;
    state.accept_action.implicit_precedence = 0;
  }

  // Cumulative old-id -> final-id mapping across all rounds.
  map<LexStateId, LexStateId> replacements;

  while (true) {
    // Map each duplicate state to the first earlier state equal to it.
    map<LexStateId, LexStateId> duplicates;
    for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
      for (LexStateId j = 0; j < i; j++) {
        if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
          duplicates.insert({ i, j });
          break;
        }
      }
    }

    if (duplicates.empty()) break;

    // Compute each state's new index once the duplicates are erased.
    map<LexStateId, LexStateId> new_replacements;
    for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
      LexStateId new_state_index = i;
      auto duplicate = duplicates.find(i);
      if (duplicate != duplicates.end()) {
        new_state_index = duplicate->second;
      }

      // Shift down by the number of duplicates removed before this index.
      size_t prior_removed = 0;
      for (const auto &duplicate : duplicates) {
        if (duplicate.first >= new_state_index) break;
        prior_removed++;
      }

      new_state_index -= prior_removed;
      new_replacements.insert({i, new_state_index});
      replacements.insert({ i, new_state_index });
      // Keep earlier rounds' replacements pointing at final indices.
      for (auto &replacement : replacements) {
        if (replacement.second == i) {
          replacement.second = new_state_index;
        }
      }
    }

    // Retarget every advance action at the renumbered states.
    for (auto &state : lex_table.states) {
      for (auto &entry : state.advance_actions) {
        auto new_replacement = new_replacements.find(entry.second.state_index);
        if (new_replacement != new_replacements.end()) {
          entry.second.state_index = new_replacement->second;
        }
      }
    }

    // Erase back-to-front so earlier indices stay valid while erasing.
    for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
      lex_table.states.erase(lex_table.states.begin() + i->first);
    }
  }

  for (ParseState &parse_state : parse_table->states) {
    auto replacement = replacements.find(parse_state.lex_state_id);
    if (replacement != replacements.end()) {
      parse_state.lex_state_id = replacement->second;
    }
  }
}
// True when the rule is marked as a "main token" (lexed with no preceding
// separator). Only a top-level Metadata wrapper carries that flag.
bool is_immediate_token(const Rule &rule) const {
  return rule.match(
    [](const Metadata &wrapper) {
      return wrapper.params.is_main_token;
    },
    // Every other rule shape is not immediate.
    [](auto) {
      return false;
    }
  );
}
// Construct the lex item set for the given terminals. When `with_separators`
// is true, every non-immediate token is paired with each possible separator
// rule (including the empty separator).
LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) {
  LexItemSet result;
  terminals.for_each([&](Symbol symbol) {
    if (symbol.is_terminal()) {
      for (auto &&rule : rules_for_symbol(symbol)) {
        if (with_separators && !is_immediate_token(rule)) {
          for (const auto &separator_rule : separator_rules) {
            // Copy the rule for each separator variant. Moving `rule` here
            // (as the code previously did) left a moved-from rule for every
            // separator after the first — a use-after-move.
            Rule rule_copy = rule;
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Rule::seq({
                  separator_rule,
                  Metadata::main_token(move(rule_copy))
                })
              )
            ));
          }
        } else {
          // Single use of `rule` on this path, so moving is safe.
          result.entries.insert(LexItem(symbol, Metadata::main_token(move(rule))));
        }
      }
    }
    return true;
  });
  return result;
}
// Accumulate into `characters` every character that can appear first in a
// match of `rule`, by recursing into the rule's left-most positions.
static void add_starting_characters(CharacterSet *characters, const Rule &rule) {
  rule.match(
    [characters](const Seq &sequence) {
      // Only the left side is inspected (assumes the left of a Seq cannot
      // match the empty string — TODO confirm).
      add_starting_characters(characters, *sequence.left);
    },
    [characters](const rules::Choice &rule) {
      // Any alternative can start the match.
      for (const auto &element : rule.elements) {
        add_starting_characters(characters, element);
      }
    },
    [characters](const rules::Repeat &rule) {
      add_starting_characters(characters, *rule.rule);
    },
    [characters](const rules::Metadata &rule) {
      add_starting_characters(characters, *rule.rule);
    },
    [characters](const rules::CharacterSet &rule) {
      characters->add_set(rule);
    },
    [](auto) {}  // other rule shapes contribute no characters
  );
}
// Return the list of alternative rules for a terminal symbol. END_OF_INPUT
// is represented as a rule matching the single character 0.
vector<Rule> rules_for_symbol(const rules::Symbol &symbol) {
  if (symbol == rules::END_OF_INPUT()) {
    return { CharacterSet().include(0) };
  }

  return grammar.variables[symbol.index].rule.match(
    [](const Choice &choice) {
      // A top-level choice contributes each alternative separately.
      return choice.elements;
    },
    [](auto rule) {
      return vector<Rule>{ rule };
    }
  );
}
// Tie-breaking order between two tokens that complete on the same string:
// explicit precedence first, then implicit precedence, then the token with
// the lower index wins.
bool should_replace_accept_action(const AcceptTokenAction &old_action,
                                  const AcceptTokenAction &new_action) {
  if (new_action.precedence != old_action.precedence) {
    return new_action.precedence > old_action.precedence;
  }
  if (new_action.implicit_precedence != old_action.implicit_precedence) {
    return new_action.implicit_precedence > old_action.implicit_precedence;
  }
  return new_action.symbol.index < old_action.symbol.index;
}
// Reset the main lex table between builds. The keyword table is left alone:
// it is only populated once, at the end of build().
void clear() {
  main_lex_state_ids.clear();
  main_lex_table.states.clear();
}
// Human-readable name for a token, quoting anonymous (literal) tokens.
string token_name(const rules::Symbol &symbol) {
  const LexicalVariable &variable = grammar.variables[symbol.index];
  bool is_named = variable.type == VariableTypeNamed;
  return is_named ? variable.name : "'" + variable.name + "'";
}
// Encode `character` as UTF-8 into the shared scratch buffer, for logging.
// Returns a pointer into `encoding_buffer`, which is overwritten on each
// call — use the result before calling again.
const char *log_char(int32_t character) {
  uint32_t count = utf8proc_encode_char(
    character,
    reinterpret_cast<utf8proc_uint8_t *>(encoding_buffer)
  );
  encoding_buffer[count] = 0;  // NUL-terminate the encoded sequence
  return encoding_buffer;
}
};
// Factory for the public interface type. The concrete builder class is
// private to this file, so callers only ever see LexTableBuilder.
unique_ptr<LexTableBuilder> LexTableBuilder::create(const SyntaxGrammar &syntax_grammar,
                                                    const LexicalGrammar &lexical_grammar,
                                                    const unordered_map<Symbol, LookaheadSet> &following_tokens,
                                                    const CoincidentTokenIndex &coincident_tokens,
                                                    ParseTable *parse_table) {
  auto *impl = new LexTableBuilderImpl(
    syntax_grammar,
    lexical_grammar,
    following_tokens,
    coincident_tokens,
    parse_table
  );
  return unique_ptr<LexTableBuilder>(impl);
}
// Forward to the implementation. The downcast is safe because the only way
// to obtain a LexTableBuilder is create(), which constructs a
// LexTableBuilderImpl.
LexTableBuilder::BuildResult LexTableBuilder::build() {
  return static_cast<LexTableBuilderImpl *>(this)->build();
}
// Forward to the implementation (see the downcast note on build()).
bool LexTableBuilder::does_token_shadow_other(Symbol a, Symbol b) const {
  return static_cast<const LexTableBuilderImpl *>(this)->does_token_shadow_other(a, b);
}
// Forward to the implementation (see the downcast note on build()).
bool LexTableBuilder::does_token_match_same_string_as_other(Symbol a, Symbol b) const {
  return static_cast<const LexTableBuilderImpl *>(this)->does_token_match_same_string_as_other(a, b);
}
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::rules::Symbol;

// Hash for ordered symbol-index pairs (declared in the header for
// CoincidentTokenIndex). The hashes are combined asymmetrically in the
// boost::hash_combine style so that (a, b) and (b, a) do not always collide,
// as they did with the previous plain XOR.
size_t hash<pair<Symbol::Index, Symbol::Index>>::operator()(
  const pair<Symbol::Index, Symbol::Index> &p
) const {
  hash<Symbol::Index> hasher;
  size_t result = hasher(p.first);
  result ^= hasher(p.second) + 0x9e3779b9 + (result << 6) + (result >> 2);
  return result;
}
} // namespace std

View file

@ -1,70 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#include <memory>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/lex_table.h"
namespace std {
using tree_sitter::rules::Symbol;

// Hash support so ordered pairs of symbol indices can key the unordered
// containers in CoincidentTokenIndex. Defined out of line.
template <>
struct hash<pair<Symbol::Index, Symbol::Index>> {
  size_t operator()(const pair<Symbol::Index, Symbol::Index> &) const;
};
} // namespace std
namespace tree_sitter {
struct ParseTable;
struct SyntaxGrammar;
struct LexicalGrammar;
namespace build_tables {
class LookaheadSet;
// Records, for pairs of terminal symbols, the parse states in which both
// symbols are valid lookaheads.
struct CoincidentTokenIndex {
  std::unordered_map<
    std::pair<rules::Symbol::Index, rules::Symbol::Index>,
    std::unordered_set<ParseStateId>
  > entries;

  // True if the two symbols appear together in at least one parse state.
  bool contains(rules::Symbol, rules::Symbol) const;
  // The parse states in which both symbols are valid lookaheads.
  const std::unordered_set<ParseStateId> &states_with(rules::Symbol, rules::Symbol) const;
};
// Builds the lexer state machines for a grammar: a main lex table used while
// parsing, plus a keyword lex table that re-checks words captured by the
// keyword capture token.
class LexTableBuilder {
public:
  static std::unique_ptr<LexTableBuilder> create(
    const SyntaxGrammar &,
    const LexicalGrammar &,
    const std::unordered_map<rules::Symbol, LookaheadSet> &,
    const CoincidentTokenIndex &,
    ParseTable *
  );

  // The generated tables plus the token whose matches are re-lexed through
  // the keyword table.
  struct BuildResult {
    LexTable main_table;
    LexTable keyword_table;
    rules::Symbol keyword_capture_token;
  };

  BuildResult build();
  bool does_token_shadow_other(rules::Symbol, rules::Symbol) const;
  bool does_token_match_same_string_as_other(rules::Symbol, rules::Symbol) const;

protected:
  // Instances are created only through create().
  LexTableBuilder() = default;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_

View file

@ -1,147 +0,0 @@
#include "compiler/build_tables/lookahead_set.h"
#include <set>
#include <memory>
#include "compiler/rule.h"
namespace tree_sitter {
namespace build_tables {
using std::vector;
using rules::Symbol;
// Default-construct an empty set: no terminals, no externals, no EOF.
LookaheadSet::LookaheadSet() {}
// Build a set containing exactly the given symbols.
LookaheadSet::LookaheadSet(const vector<Symbol> &symbols) {
  for (const Symbol &symbol : symbols) {
    insert(symbol);
  }
}
// True when the set contains no symbol at all (including END_OF_INPUT).
bool LookaheadSet::empty() const {
  if (eof) return false;
  return terminal_bits.empty() && external_bits.empty();
}
// Element-wise equality across all three stores.
bool LookaheadSet::operator==(const LookaheadSet &other) const {
  return eof == other.eof &&
         terminal_bits == other.terminal_bits &&
         external_bits == other.external_bits;
}
bool LookaheadSet::contains(const Symbol &symbol) const {
if (symbol == rules::END_OF_INPUT()) return eof;
auto &bits = symbol.is_external() ? external_bits : terminal_bits;
return bits.size() > static_cast<size_t>(symbol.index) && bits[symbol.index];
}
// True when the two sets share at least one symbol. Stops scanning at the
// first common element.
bool LookaheadSet::intersects(const LookaheadSet &other) const {
  bool found = false;
  for_each([&](Symbol symbol) {
    if (!other.contains(symbol)) return true;  // keep scanning
    found = true;
    return false;  // common symbol found; stop
  });
  return found;
}
// Number of symbols in the set (set bits plus the EOF flag).
size_t LookaheadSet::size() const {
  size_t count = eof ? 1 : 0;
  for (bool bit : terminal_bits) {
    if (bit) count++;
  }
  for (bool bit : external_bits) {
    if (bit) count++;
  }
  return count;
}
// Union `other` into this set. Returns true if any new symbol was added.
bool LookaheadSet::insert_all(const LookaheadSet &other) {
  bool result = false;

  if (other.eof) {
    if (!eof) {
      eof = true;
      result = true;
    }
  }

  // Grow our bit-vector first so the paired iteration below stays in range.
  if (other.external_bits.size() > external_bits.size()) {
    external_bits.resize(other.external_bits.size());
  }
  auto iter = external_bits.begin();
  auto other_iter = other.external_bits.begin();
  auto other_end = other.external_bits.end();
  while (other_iter != other_end) {
    if (*other_iter && !*iter) {
      result = true;
      *iter = true;
    }
    ++iter;
    ++other_iter;
  }

  // Same procedure for the terminal bits.
  if (other.terminal_bits.size() > terminal_bits.size()) {
    terminal_bits.resize(other.terminal_bits.size());
  }
  iter = terminal_bits.begin();
  other_iter = other.terminal_bits.begin();
  other_end = other.terminal_bits.end();
  while (other_iter != other_end) {
    if (*other_iter && !*iter) {
      result = true;
      *iter = true;
    }
    ++iter;
    ++other_iter;
  }

  return result;
}
bool LookaheadSet::insert(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT()) {
if (!eof) {
eof = true;
return true;
}
return false;
}
auto &bits = symbol.is_external() ? external_bits : terminal_bits;
if (bits.size() <= static_cast<size_t>(symbol.index)) {
bits.resize(symbol.index + 1);
}
if (!bits[symbol.index]) {
bits[symbol.index] = true;
return true;
}
return false;
}
bool LookaheadSet::remove(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT()) {
if (eof) {
eof = false;
return true;
}
return false;
}
auto &bits = symbol.is_external() ? external_bits : terminal_bits;
if (bits.size() > static_cast<size_t>(symbol.index)) {
if (bits[symbol.index]) {
bits[symbol.index] = false;
return true;
}
}
return false;
}
// Empty the set entirely.
void LookaheadSet::clear() {
  terminal_bits.clear();
  external_bits.clear();
  eof = false;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,115 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_
#define COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_
#include <vector>
#include "compiler/rule.h"
namespace tree_sitter {
namespace build_tables {
// A set of lookahead symbols, stored as bit-vectors indexed by symbol index.
// Terminals and external tokens live in separate bit-vectors; the built-in
// END_OF_INPUT symbol is tracked with a dedicated flag.
class LookaheadSet {
  std::vector<bool> terminal_bits;
  std::vector<bool> external_bits;
  bool eof = false;

public:
  LookaheadSet();
  explicit LookaheadSet(const std::vector<rules::Symbol> &);
  bool empty() const;
  size_t size() const;
  bool operator==(const LookaheadSet &) const;
  bool contains(const rules::Symbol &) const;
  bool insert_all(const LookaheadSet &);
  bool insert(const rules::Symbol &);
  bool remove(const rules::Symbol &);
  void clear();
  bool intersects(const LookaheadSet &) const;

  // Invoke `callback` once per symbol in the set, in a fixed order:
  // externals first, then END_OF_INPUT, then terminals. The callback returns
  // false to stop the iteration early.
  template <typename Callback>
  void for_each(const Callback &callback) const {
    for (auto begin = external_bits.begin(),
         end = external_bits.end(),
         iter = begin;
         iter != end;
         ++iter) {
      if (*iter) {
        if (!callback(rules::Symbol::external(iter - begin))) return;
      }
    }
    if (eof) {
      if (!callback(rules::END_OF_INPUT())) return;
    }
    for (auto begin = terminal_bits.begin(),
         end = terminal_bits.end(),
         iter = begin;
         iter != end;
         ++iter) {
      if (*iter) {
        if (!callback(rules::Symbol::terminal(iter - begin))) return;
      }
    }
  }

  // Invoke `callback(in_this, symbol)` for each symbol present in exactly
  // one of the two sets; `in_this` is true when the symbol is only in *this.
  // The callback returns false to stop early. The two bit-vectors may have
  // different lengths, so each walk covers the common prefix first, then the
  // tail of whichever vector is longer.
  template <typename Callback>
  void for_each_difference(const LookaheadSet &other, const Callback &callback) const {
    auto end = external_bits.end();
    auto begin = external_bits.begin();
    auto other_end = other.external_bits.end();
    auto other_begin = other.external_bits.begin();
    auto common_end = other.external_bits.size() < external_bits.size() ?
      begin + other.external_bits.size() :
      end;
    auto iter = begin;
    auto other_iter = other_begin;
    for (; iter != common_end; ++iter, ++other_iter) {
      if (*iter) {
        if (!*other_iter && !callback(true, rules::Symbol::external(iter - begin))) return;
      } else if (*other_iter) {
        if (!callback(false, rules::Symbol::external(iter - begin))) return;
      }
    }
    // Tail of whichever external bit-vector is longer.
    for (; iter < end; ++iter) {
      if (*iter && !callback(true, rules::Symbol::external(iter - begin))) return;
    }
    for (; other_iter < other_end; ++other_iter) {
      if (*other_iter && !callback(false, rules::Symbol::external(other_iter - other_begin))) return;
    }

    if (eof) {
      if (!other.eof && !callback(true, rules::END_OF_INPUT())) return;
    } else if (other.eof) {
      if (!callback(false, rules::END_OF_INPUT())) return;
    }

    // Repeat the same walk for the terminal bits.
    end = terminal_bits.end();
    begin = terminal_bits.begin();
    other_end = other.terminal_bits.end();
    other_begin = other.terminal_bits.begin();
    common_end = other.terminal_bits.size() < terminal_bits.size() ?
      begin + other.terminal_bits.size() :
      end;
    iter = begin;
    other_iter = other_begin;
    for (; iter != common_end; ++iter, ++other_iter) {
      if (*iter) {
        if (!*other_iter && !callback(true, rules::Symbol::terminal(iter - begin))) return;
      } else if (*other_iter) {
        if (!callback(false, rules::Symbol::terminal(iter - begin))) return;
      }
    }
    for (; iter < end; ++iter) {
      if (*iter && !callback(true, rules::Symbol::terminal(iter - begin))) return;
    }
    for (; other_iter < other_end; ++other_iter) {
      if (*other_iter && !callback(false, rules::Symbol::terminal(other_iter - other_begin))) return;
    }
  }
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LOOKAHEAD_SET_H_

View file

@ -1,196 +0,0 @@
#include "compiler/build_tables/parse_item.h"
#include <string>
#include "compiler/syntax_grammar.h"
#include "compiler/rule.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::pair;
using std::string;
using std::to_string;
using rules::Symbol;
using rules::Associativity;
using util::hash_combine;
// A null item: no variable, no production. Used as a sentinel (e.g. the
// initial `previous_item` in ParseItemSet::unfinished_item_signature).
ParseItem::ParseItem() : variable_index(-1), production(nullptr), step_index(0) {}
// An item for `production` of the variable `lhs`, with the cursor placed
// before step `step_index`. Stores a pointer into the grammar's production.
ParseItem::ParseItem(const Symbol &lhs, const Production &production,
                     unsigned int step_index)
  : variable_index(lhs.index),
    production(&production),
    step_index(step_index) {}
// Two items are equal when they are interchangeable for table generation:
// same variable, same cursor, same aliases on the consumed steps, and the
// same remaining steps — or, for completed items, the same trailing
// precedence and associativity.
bool ParseItem::operator==(const ParseItem &other) const {
  if (step_index != other.step_index) return false;
  if (variable_index != other.variable_index) return false;
  if (production->size() != other.production->size()) return false;

  // Consumed steps only matter for their aliases.
  for (size_t i = 0; i < step_index; i++) {
    if (production->at(i).alias != other.production->at(i).alias) return false;
  }

  if (is_done()) {
    // Completed items: only trailing precedence/associativity matter.
    if (!production->empty()) {
      if (production->back().precedence != other.production->back().precedence) return false;
      if (production->back().associativity != other.production->back().associativity) return false;
    }
  } else {
    // Unfinished items: the remaining steps must match exactly.
    for (size_t i = step_index, n = production->size(); i < n; i++) {
      if (production->at(i) != other.production->at(i)) return false;
    }
  }
  return true;
}
// Strict weak ordering over the same fields operator== compares, in the same
// lexicographic order: cursor, variable, production size, consumed-step
// aliases, then either the trailing precedence/associativity (done items) or
// the remaining steps (unfinished items).
bool ParseItem::operator<(const ParseItem &other) const {
  if (step_index < other.step_index) return true;
  if (other.step_index < step_index) return false;
  if (variable_index < other.variable_index) return true;
  if (other.variable_index < variable_index) return false;
  if (production->size() < other.production->size()) return true;
  if (other.production->size() < production->size()) return false;

  for (size_t i = 0; i < step_index; i++) {
    if (production->at(i).alias < other.production->at(i).alias) return true;
    if (other.production->at(i).alias < production->at(i).alias) return false;
  }

  if (is_done()) {
    if (!production->empty()) {
      if (production->back().precedence < other.production->back().precedence) return true;
      if (other.production->back().precedence < production->back().precedence) return false;
      if (production->back().associativity < other.production->back().associativity) return true;
      if (other.production->back().associativity < production->back().associativity) return false;
    }
  } else {
    for (size_t i = step_index, n = production->size(); i < n; i++) {
      if (production->at(i) < other.production->at(i)) return true;
      if (other.production->at(i) < production->at(i)) return false;
    }
  }
  return false;
}
// The variable (non-terminal) on the left-hand side of this item's
// production.
Symbol ParseItem::lhs() const {
  return Symbol{variable_index, Symbol::NonTerminal};
}
// An item is complete once its cursor has passed every production step.
bool ParseItem::is_done() const {
  return !(step_index < production->size());
}
// Precedence at the cursor: the next step's precedence for unfinished items,
// the last step's for completed items, and 0 for an empty production.
int ParseItem::precedence() const {
  if (!is_done()) {
    return production->at(step_index).precedence;
  }
  return production->empty() ? 0 : production->back().precedence;
}
// Dynamic precedence is a property of the whole production, independent of
// the cursor position.
int ParseItem::dynamic_precedence() const {
  return production->dynamic_precedence;
}
// Associativity at the cursor, mirroring precedence(): next step when
// unfinished, last step when done, AssociativityNone for an empty production.
rules::Associativity ParseItem::associativity() const {
  if (!is_done()) {
    return production->at(step_index).associativity;
  }
  return production->empty()
    ? rules::AssociativityNone
    : production->back().associativity;
}
// The symbol immediately after the cursor, or NONE for a completed item.
Symbol ParseItem::next_symbol() const {
  if (step_index < production->size()) {
    return production->at(step_index).symbol;
  }
  return rules::NONE();
}
// Item sets compare by their full item -> lookahead-set maps.
bool ParseItemSet::operator==(const ParseItemSet &other) const {
  return entries == other.entries;
}
// Hash of the (variable, step) pairs of all unfinished items. Since
// `entries` is an ordered map, items with the same pair are adjacent, so
// comparing against the previous item suffices to skip duplicates.
size_t ParseItemSet::unfinished_item_signature() const {
  size_t result = 0;
  ParseItem previous_item;  // default item acts as a sentinel
  for (auto &pair : entries) {
    const ParseItem &item = pair.first;
    if (item.step_index < item.production->size() &&
        (item.variable_index != previous_item.variable_index ||
         item.step_index != previous_item.step_index)) {
      hash_combine(&result, item.variable_index);
      hash_combine(&result, item.step_index);
      previous_item = item;
    }
  }
  return result;
}
// Union another item set into this one, merging lookahead sets of items
// present in both.
void ParseItemSet::add(const ParseItemSet &other) {
  for (const auto &entry : other.entries) {
    entries[entry.first].insert_all(entry.second);
  }
}
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::build_tables::ParseItem;
using tree_sitter::build_tables::ParseItemSet;
using tree_sitter::util::hash_combine;
// Hash for ParseItem, mirroring the fields ParseItem::operator== compares:
// consumed steps contribute only their aliases; completed items contribute
// the trailing precedence/associativity; unfinished items contribute the
// remaining steps.
// NOTE(review): dynamic_precedence is hashed here but NOT compared by
// operator==; items equal under == from productions differing only in
// dynamic precedence would hash differently — confirm this is intended.
template <>
struct hash<ParseItem> {
  size_t operator()(const ParseItem &item) const {
    size_t result = 0;
    hash_combine(&result, item.variable_index);
    hash_combine(&result, item.step_index);
    hash_combine(&result, item.production->dynamic_precedence);
    hash_combine(&result, item.production->size());

    for (size_t i = 0; i < item.step_index; i++) {
      hash_combine(&result, item.production->at(i).alias.value);
      hash_combine(&result, item.production->at(i).alias.is_named);
    }

    if (item.is_done()) {
      if (!item.production->empty()) {
        hash_combine(&result, item.production->back().precedence);
        hash_combine<unsigned>(&result, item.production->back().associativity);
      }
    } else {
      for (size_t i = item.step_index, n = item.production->size(); i < n; i++) {
        auto &step = item.production->at(i);
        hash_combine(&result, step.symbol);
        hash_combine(&result, step.precedence);
        hash_combine<unsigned>(&result, step.associativity);
      }
    }
    return result;
  }
};
// Hash for ParseItemSet: combines each item with the size and contents of
// its lookahead set. `entries` is an ordered map, so iteration order — and
// therefore the hash — is deterministic.
size_t hash<ParseItemSet>::operator()(const ParseItemSet &item_set) const {
  size_t result = 0;
  hash_combine(&result, item_set.entries.size());
  for (auto &pair : item_set.entries) {
    const ParseItem &item = pair.first;
    const auto &lookahead_set = pair.second;
    hash_combine(&result, item);
    hash_combine(&result, lookahead_set.size());
    lookahead_set.for_each([&result](Symbol symbol) {
      hash_combine(&result, symbol);
      return true;
    });
  }
  return result;
}
} // namespace std

View file

@ -1,60 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_H_
#define COMPILER_BUILD_TABLES_PARSE_ITEM_H_
#include <map>
#include <utility>
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/rule.h"
#include "compiler/syntax_grammar.h"
#include "compiler/precedence_range.h"
namespace tree_sitter {
namespace build_tables {
// A position within one production of a grammar variable: `variable_index`
// names the variable, `production` the chosen production, and `step_index`
// the cursor (how many steps have been consumed).
struct ParseItem {
  ParseItem();
  ParseItem(const rules::Symbol &, const Production &, unsigned int);

  // Completion information: whether the production is fully consumed, plus
  // its trailing precedence and associativity.
  struct CompletionStatus {
    bool is_done;
    int precedence;
    rules::Associativity associativity;
  };

  bool operator==(const ParseItem &other) const;
  bool operator<(const ParseItem &other) const;
  rules::Symbol lhs() const;
  rules::Symbol next_symbol() const;
  int precedence() const;
  int dynamic_precedence() const;
  rules::Associativity associativity() const;
  bool is_done() const;

  int variable_index;
  const Production *production;  // not owned; points into the grammar
  unsigned int step_index;
};
// A set of parse items, each paired with its lookahead symbols. Backed by an
// ordered map, so iteration order is deterministic.
struct ParseItemSet {
  bool operator==(const ParseItemSet &) const;
  // Union another item set into this one (merging lookaheads).
  void add(const ParseItemSet &);
  // Hash over the (variable, step) pairs of the unfinished items.
  size_t unfinished_item_signature() const;
  std::map<ParseItem, LookaheadSet> entries;
};
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::build_tables::ParseItemSet;
// Hash support so ParseItemSet can key unordered containers; defined out of
// line.
template <>
struct hash<tree_sitter::build_tables::ParseItemSet> {
  size_t operator()(const ParseItemSet &item_set) const;
};
} // namespace std
#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_H_

View file

@ -1,302 +0,0 @@
#include "compiler/build_tables/parse_item_set_builder.h"
#include <algorithm>
#include <cassert>
#include <set>
#include <unordered_map>
#include <vector>
#include <utility>
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace build_tables {
using std::find;
using std::get;
using std::move;
using std::pair;
using std::set;
using std::unordered_map;
using std::vector;
using rules::Symbol;
// Accumulated follow-set data for one non-terminal: the concrete lookahead
// tokens gathered so far, plus whether the non-terminal additionally inherits
// ("propagates") the lookaheads of the item that expanded it.
struct FollowSetInfo {
LookaheadSet lookaheads;
bool propagates_lookaheads;
};
// Work-queue entry for the follow-set fixed-point computation in the
// ParseItemSetBuilder constructor: a non-terminal to process together with
// the lookaheads flowing into it.
struct NonTerminalQueueEntry {
Symbol::Index non_terminal;
LookaheadSet lookaheads;
bool propagates_lookaheads;
};
// Two components are equal when their item, lookahead set, and propagation
// flag all match.
bool ParseItemSetBuilder::ParseItemSetComponent::operator==(
  const ParseItemSetBuilder::ParseItemSetComponent &other) const {
  if (!(item == other.item)) return false;
  if (!(lookaheads == other.lookaheads)) return false;
  return propagates_lookaheads == other.propagates_lookaheads;
}
// Append `item` to `items` unless an equal element is already present,
// preserving first-insertion order. (Renamed the parameter so it no longer
// shadows std::vector.)
template <typename T>
inline void find_or_push(std::vector<T> &items, const T &item) {
  auto items_end = items.end();
  if (std::find(items.begin(), items_end, item) == items_end) {
    items.push_back(item);
  }
}
// Precompute everything apply_transitive_closure() needs:
//   1. FIRST and LAST sets for every terminal, external token, and variable.
//   2. For each non-terminal, the cached list of ParseItemSetComponents
//      (item + lookaheads + propagation flag) produced when it is expanded.
// Only a reference to `grammar` is retained; `lexical_grammar` is used here
// and not stored.
ParseItemSetBuilder::ParseItemSetBuilder(
const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar
) : grammar{grammar} {
// Populate the FIRST and LAST set of each terminal, which just contains the terminal itself.
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
Symbol symbol = Symbol::terminal(i);
first_sets.insert({symbol, LookaheadSet({symbol})});
last_sets.insert({symbol, LookaheadSet({symbol})});
}
// External tokens behave like terminals for FIRST/LAST purposes.
for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) {
Symbol symbol = Symbol::external(i);
first_sets.insert({symbol, LookaheadSet({symbol})});
last_sets.insert({symbol, LookaheadSet({symbol})});
}
// Populate the FIRST and LAST set of each non-terminal by recursively expanding non-terminals.
// This is a depth-first walk using an explicit stack (`symbols_to_process`);
// `processed_non_terminals` guards against cycles in the grammar.
vector<Symbol> symbols_to_process;
set<Symbol::Index> processed_non_terminals;
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol = Symbol::non_terminal(i);
LookaheadSet &first_set = first_sets[symbol];
LookaheadSet &last_set = last_sets[symbol];
// FIRST: follow the *first* step of every production.
processed_non_terminals.clear();
symbols_to_process.assign({symbol});
while (!symbols_to_process.empty()) {
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (!current_symbol.is_non_terminal()) {
first_set.insert(current_symbol);
} else if (processed_non_terminals.insert(current_symbol.index).second) {
for (const Production &production : grammar.variables[current_symbol.index].productions) {
if (!production.empty()) {
symbols_to_process.push_back(production[0].symbol);
}
}
}
}
// LAST: same walk, but following the *last* step of every production.
processed_non_terminals.clear();
symbols_to_process.assign({symbol});
while (!symbols_to_process.empty()) {
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (!current_symbol.is_non_terminal()) {
last_set.insert(current_symbol);
} else if (processed_non_terminals.insert(current_symbol.index).second) {
for (const Production &production : grammar.variables[current_symbol.index].productions) {
if (!production.empty()) {
symbols_to_process.push_back(production.back().symbol);
}
}
}
}
}
// Populate a cache of which ParseItems will be created when a given non-terminal is expanded.
vector<NonTerminalQueueEntry> non_terminal_queue;
for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
// Compute the follow set of each *other* non-terminal that the current non-terminal can
// start with.
unordered_map<Symbol::Index, FollowSetInfo> follow_set_info_by_non_terminal;
// Seed the worklist with `i` itself; the root propagates its caller's lookaheads.
non_terminal_queue.assign({{i, LookaheadSet(), true}});
while (!non_terminal_queue.empty()) {
NonTerminalQueueEntry queue_entry = non_terminal_queue.back();
non_terminal_queue.pop_back();
// Only re-expand a non-terminal when this entry adds new information,
// so the fixed-point loop terminates.
bool queue_entry_is_new;
auto &follow_set_info = follow_set_info_by_non_terminal[queue_entry.non_terminal];
if (queue_entry.propagates_lookaheads) {
queue_entry_is_new = !follow_set_info.propagates_lookaheads;
follow_set_info.propagates_lookaheads = true;
} else {
queue_entry_is_new = follow_set_info.lookaheads.insert_all(queue_entry.lookaheads);
}
if (queue_entry_is_new) {
for (const Production &production : grammar.variables[queue_entry.non_terminal].productions) {
if (production.empty()) continue;
Symbol next_symbol = production.at(0).symbol;
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue;
LookaheadSet next_lookaheads;
bool propagates_lookaheads;
if (production.size() == 1) {
// The leading non-terminal is the whole production, so it inherits
// this entry's lookaheads (or keeps propagating).
next_lookaheads = queue_entry.lookaheads;
propagates_lookaheads = queue_entry.propagates_lookaheads;
} else {
// Otherwise its follow set is the FIRST set of the symbol after it.
Symbol symbol_after_next = production.at(1).symbol;
next_lookaheads = first_sets.find(symbol_after_next)->second;
propagates_lookaheads = false;
}
non_terminal_queue.push_back({
next_symbol.index,
next_lookaheads,
propagates_lookaheads
});
}
}
}
// Use these follow sets to populate the cache of ParseItems for non-terminal `i`.
for (auto &pair : follow_set_info_by_non_terminal) {
Symbol non_terminal = Symbol::non_terminal(pair.first);
for (const Production &production : grammar.variables[non_terminal.index].productions) {
ParseItem item(non_terminal, production, 0);
// Productions that start with an inlined variable are replaced by their
// inlined expansions; inlined variables themselves are never cached.
if (grammar.variables_to_inline.count(item.next_symbol())) {
for (const Production &inlined_production : inline_production(item)) {
find_or_push(transitive_closure_component_cache[i], {
ParseItem(non_terminal, inlined_production, 0),
pair.second.lookaheads,
pair.second.propagates_lookaheads
});
}
} else if (!grammar.variables_to_inline.count(non_terminal)) {
find_or_push(transitive_closure_component_cache[i], {
item,
pair.second.lookaheads,
pair.second.propagates_lookaheads
});
}
}
}
}
}
// Return (and memoize) the productions obtained by inlining the variable at
// `item`'s current step into `item`'s production. The result is cached per
// ParseItem; references into the std::map cache stay valid across the
// recursive calls below because map insertion does not invalidate references.
const vector<Production> &ParseItemSetBuilder::inline_production(const ParseItem &item) {
vector<Production> &result = inlined_productions_by_original_production[item];
// Memoization: a previously-computed (non-empty) result is returned as-is.
if (!result.empty()) return result;
auto &inlined_step = item.production->at(item.step_index);
// Start from every production of the variable being inlined.
vector<const Production *> productions_to_insert;
for (auto &production : grammar.variables[inlined_step.symbol.index].productions) {
productions_to_insert.push_back(&production);
}
// Recursively expand productions that themselves begin with an inlined
// variable, splicing the expansions in place of the original production.
for (auto iter = productions_to_insert.begin(); iter != productions_to_insert.end();) {
const Production *production = *iter;
if (!production->empty() && grammar.variables_to_inline.count(production->steps.front().symbol)) {
iter = productions_to_insert.erase(iter);
for (auto &inlined_production : inline_production(ParseItem(inlined_step.symbol, *production, 0))) {
iter = productions_to_insert.insert(iter, &inlined_production);
}
} else {
++iter;
}
}
// Build one new production per expansion: the steps before the dot, then the
// inlined steps, then the steps after the dot.
for (const Production *production_to_insert : productions_to_insert) {
auto begin = item.production->steps.begin();
auto end = item.production->steps.end();
auto step = begin + item.step_index;
Production production({begin, step}, item.production->dynamic_precedence);
for (auto &step : *production_to_insert) {
production.steps.push_back(step);
// An alias on the inlined symbol is applied to each spliced-in step.
if (!inlined_step.alias.value.empty()) {
production.steps.back().alias = inlined_step.alias;
}
}
// The inlined symbol's precedence/associativity carry over to the last
// spliced-in step unless that step already specifies its own.
if (!production.back().precedence) {
production.back().precedence = inlined_step.precedence;
}
if (!production.back().associativity) {
production.back().associativity = inlined_step.associativity;
}
production.steps.insert(
production.steps.end(),
step + 1,
end
);
// Deduplicate identical expansions.
if (find(result.begin(), result.end(), production) == result.end()) {
result.push_back(move(production));
}
}
return result;
}
// Expand an item set's kernel into its full LR(1) closure, using the component
// cache built in the constructor, and replace items involving inlined
// variables with their inlined expansions.
//
// The loop inserts into and erases from `item_set->entries` while iterating.
// This is safe for std::map: insertion invalidates no iterators (including the
// cached `end`), and `erase` returns the next valid iterator.
void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) {
const ParseItem &item = iter->first;
const LookaheadSet &lookaheads = iter->second;
// Items whose `step_index` is 0 are not part of the item set's "kernel"; they have been
// added in previous iterations of this loop, and they don't need to be further processed.
if (item.lhs() == rules::START() || item.step_index > 0) {
// Kernel items whose next symbol is a non-terminal are expanded using the pre-computed
// parse item cache.
const Symbol &next_symbol = item.next_symbol();
if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
// The lookaheads that flow into the expansion: this item's own
// lookaheads if the non-terminal is the final step, otherwise the
// FIRST set of the following symbol.
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
if (next_step == item.production->size()) {
next_lookaheads = lookaheads;
} else {
Symbol symbol_after_next = item.production->at(next_step).symbol;
next_lookaheads = first_sets.find(symbol_after_next)->second;
}
for (const auto &component : transitive_closure_component_cache[next_symbol.index]) {
LookaheadSet &current_lookaheads = item_set->entries[component.item];
current_lookaheads.insert_all(component.lookaheads);
if (component.propagates_lookaheads) {
current_lookaheads.insert_all(next_lookaheads);
}
}
// If the next symbol is an inlined variable, replace this item with
// its inlined expansions and drop the original.
if (grammar.variables_to_inline.count(next_symbol)) {
for (const Production &inlined_production : inline_production(item)) {
item_set->entries.insert({
ParseItem(item.lhs(), inlined_production, item.step_index),
lookaheads
});
}
iter = item_set->entries.erase(iter);
continue;
}
}
}
// Items for inlined variables themselves never appear in the final set.
if (grammar.variables_to_inline.count(item.lhs())) {
iter = item_set->entries.erase(iter);
continue;
}
++iter;
}
}
// Return the FIRST set precomputed for `symbol` in the constructor.
// The symbol is assumed to exist in the map.
LookaheadSet ParseItemSetBuilder::get_first_set(const rules::Symbol &symbol) const {
  auto entry = first_sets.find(symbol);
  return entry->second;
}
// Return the LAST set precomputed for `symbol` in the constructor.
// The symbol is assumed to exist in the map.
LookaheadSet ParseItemSetBuilder::get_last_set(const rules::Symbol &symbol) const {
  auto entry = last_sets.find(symbol);
  return entry->second;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,41 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_
#define COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_
#include "compiler/build_tables/parse_item.h"
#include "compiler/rule.h"
#include <map>
#include <vector>
namespace tree_sitter {
struct SyntaxGrammar;
struct LexicalGrammar;
namespace build_tables {
// Builds the LR(1) closure of parse item sets. On construction it precomputes
// FIRST/LAST sets and a per-non-terminal expansion cache, so that
// apply_transitive_closure() can expand item-set kernels quickly.
class ParseItemSetBuilder {
// One cached expansion entry: an item plus the lookaheads it receives, and
// whether it also inherits the expanding item's lookaheads.
struct ParseItemSetComponent {
ParseItem item;
LookaheadSet lookaheads;
bool propagates_lookaheads;
bool operator==(const ParseItemSetComponent &) const;
};
// Non-owning reference; the grammar must outlive this builder.
const SyntaxGrammar &grammar;
std::map<rules::Symbol, LookaheadSet> first_sets;
std::map<rules::Symbol, LookaheadSet> last_sets;
// For each non-terminal, the components produced when it is expanded.
std::map<rules::Symbol::Index, std::vector<ParseItemSetComponent>> transitive_closure_component_cache;
// Memoization cache for inline_production(); std::map keeps references to
// stored vectors stable across inserts, which inline_production relies on.
std::map<ParseItem, std::vector<Production>> inlined_productions_by_original_production;
const std::vector<Production> &inline_production(const ParseItem &);
public:
ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &);
// Expand the given kernel item set into its full closure, in place.
void apply_transitive_closure(ParseItemSet *);
LookaheadSet get_first_set(const rules::Symbol &) const;
LookaheadSet get_last_set(const rules::Symbol &) const;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_

View file

@ -1,960 +0,0 @@
#include "compiler/build_tables/parse_table_builder.h"
#include <algorithm>
#include <map>
#include <set>
#include <deque>
#include <cassert>
#include <string>
#include <unordered_map>
#include <utility>
#include "compiler/log.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rule.h"
#include "compiler/build_tables/lex_table_builder.h"
namespace tree_sitter {
namespace build_tables {
using std::deque;
using std::find;
using std::vector;
using std::set;
using std::tuple;
using std::make_tuple;
using std::map;
using std::move;
using std::string;
using std::to_string;
using std::unique_ptr;
using std::unordered_map;
using rules::Associativity;
using rules::Symbol;
using rules::END_OF_INPUT;
using SymbolSequence = vector<Symbol>;
// When there are conflicts involving auxiliary nodes (repeats),
// this structure is used to find the non-auxiliary node(s) that
// had the auxiliary node as a child.
struct AuxiliaryNodeInfo {
Symbol auxiliary_node;
// The non-auxiliary, non-built-in variables observed using this auxiliary
// node in some item set (see add_actions).
vector<Symbol> parents;
};
// One unit of pending work for the parse-state queue: a newly created state,
// the symbol sequence that led to it (used in conflict messages), and the
// auxiliary-node bookkeeping accumulated along the way.
struct ParseStateQueueEntry {
SymbolSequence preceding_symbols;
vector<AuxiliaryNodeInfo> auxiliary_node_info_list;
ParseItemSet item_set;
ParseStateId state_id;
};
class ParseTableBuilderImpl : public ParseTableBuilder {
const SyntaxGrammar grammar;
const LexicalGrammar lexical_grammar;
const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases;
unordered_map<ParseItemSet, ParseStateId> state_ids_by_item_set;
vector<const ParseItemSet *> item_sets_by_state_id;
deque<ParseStateQueueEntry> parse_state_queue;
ParseTable parse_table;
ParseItemSetBuilder item_set_builder;
unique_ptr<LexTableBuilder> lex_table_builder;
unordered_map<Symbol, LookaheadSet> following_tokens_by_token;
CoincidentTokenIndex coincident_token_index;
set<std::pair<Symbol, Symbol>> logged_conflict_tokens;
public:
// Copies both grammars, but stores `simple_aliases` by reference — the caller
// must keep that map alive for the builder's lifetime.
ParseTableBuilderImpl(
const SyntaxGrammar &syntax_grammar,
const LexicalGrammar &lexical_grammar,
const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases
) : grammar(syntax_grammar),
lexical_grammar(lexical_grammar),
simple_aliases(simple_aliases),
item_set_builder(syntax_grammar, lexical_grammar) {}
// Build the parse and lex tables. Pipeline: seed the error and start states,
// drain the parse-state queue (closure + actions), then run the optimization
// passes and build the lex tables. On a parse conflict, returns early with
// empty lex tables and the error.
BuildResult build() {
// Ensure that the empty rename sequence has index 0.
parse_table.alias_sequences.push_back({});
// Ensure that the error state has index 0.
ParseStateId error_state_id = add_parse_state({}, {}, ParseItemSet{});
// Add the starting state.
Symbol start_symbol = Symbol::non_terminal(0);
Production start_production({{start_symbol, 0, rules::AssociativityNone, rules::Alias{}}}, 0);
add_parse_state({}, {}, ParseItemSet{{
{
ParseItem(rules::START(), start_production, 0),
LookaheadSet({END_OF_INPUT()}),
},
}});
// Drain the queue; this also populates coincident_token_index and
// following_tokens_by_token, which the lex table builder consumes below.
CompileError error = process_part_state_queue();
if (error) return {
parse_table,
LexTable(),
LexTable(),
rules::NONE(),
error,
};
lex_table_builder = LexTableBuilder::create(
grammar,
lexical_grammar,
following_tokens_by_token,
coincident_token_index,
&parse_table
);
// The error-recovery state needs the lex table builder's shadowing info,
// so it is filled in only now.
build_error_parse_state(error_state_id);
// Optimization passes; order matters (precedences must be cleared before
// duplicate states can compare equal).
remove_precedence_values();
remove_duplicate_parse_states();
eliminate_unit_reductions();
populate_used_terminals();
auto lex_table_result = lex_table_builder->build();
return {
parse_table,
lex_table_result.main_table,
lex_table_result.keyword_table,
lex_table_result.keyword_capture_token,
CompileError::none()
};
}
private:
// Drain the parse-state queue, computing the closure and parse actions for
// each queued state. add_actions() may enqueue further states, so this loop
// runs until the table reaches a fixed point. Returns the first parse
// conflict encountered, or CompileError::none().
//
// NOTE(review): the name looks like a typo for `process_parse_state_queue`;
// it is kept because build() calls it by this name.
CompileError process_part_state_queue() {
  while (!parse_state_queue.empty()) {
    // Move the entry out of the deque instead of copying it; the previous
    // `auto entry = front()` copied the whole ParseItemSet each iteration.
    auto entry = move(parse_state_queue.front());
    parse_state_queue.pop_front();
    item_set_builder.apply_transitive_closure(&entry.item_set);
    string conflict = add_actions(
      move(entry.preceding_symbols),
      move(entry.auxiliary_node_info_list),
      move(entry.item_set),
      entry.state_id
    );
    if (!conflict.empty()) {
      return CompileError(TSCompileErrorTypeParseConflict, conflict);
    }
  }
  return CompileError::none();
}
// Populate the error-recovery state: the set of tokens at which the parser
// may resynchronize after an error. A token is included if it is conflict-free
// (shadows no coincident token) or at least does not conflict with any
// conflict-free token. Also removed an unused local (`LookaheadSet tokens;`)
// left over from an earlier revision.
void build_error_parse_state(ParseStateId state_id) {
  parse_table.states[state_id].terminal_entries.clear();
  // First, identify the conflict-free tokens.
  LookaheadSet conflict_free_tokens;
  for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
    Symbol token = Symbol::terminal(i);
    bool conflicts_with_other_tokens = false;
    for (unsigned j = 0; j < lexical_grammar.variables.size(); j++) {
      Symbol other_token = Symbol::terminal(j);
      // Shadowing only matters for tokens that never co-occur in a state.
      if (!coincident_token_index.contains(token, other_token) &&
          lex_table_builder->does_token_shadow_other(token, other_token)) {
        conflicts_with_other_tokens = true;
        break;
      }
    }
    if (!conflicts_with_other_tokens) conflict_free_tokens.insert(token);
  }
  // Include in the error recover state all of the tokens that are either
  // conflict-free themselves, or have no conflicts with any conflict-free
  // tokens.
  LOG_START("finding non-conflicting tokens for error recovery");
  for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
    Symbol token = Symbol::terminal(i);
    if (conflict_free_tokens.contains(token)) {
      LOG("include %s", symbol_name(token).c_str());
      parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
    } else {
      bool conflicts_with_other_tokens = false;
      // for_each stops early when the callback returns false.
      conflict_free_tokens.for_each([&](Symbol other_token) {
        if (!coincident_token_index.contains(token, other_token) &&
            lex_table_builder->does_token_shadow_other(token, other_token)) {
          LOG(
            "exclude %s: conflicts with %s",
            symbol_name(token).c_str(),
            symbol_name(other_token).c_str()
          );
          conflicts_with_other_tokens = true;
          return false;
        }
        return true;
      });
      if (!conflicts_with_other_tokens) {
        LOG("include %s", symbol_name(token).c_str());
        parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
      }
    }
  }
  LOG_END();
  // External tokens with no internal counterpart always allow recovery.
  for (size_t i = 0; i < grammar.external_tokens.size(); i++) {
    if (grammar.external_tokens[i].corresponding_internal_token == rules::NONE()) {
      parse_table.states[state_id].terminal_entries[Symbol::external(i)].actions.push_back(ParseAction::Recover());
    }
  }
  parse_table.add_terminal_action(state_id, END_OF_INPUT(), ParseAction::Recover());
}
// Return the id of the parse state corresponding to `item_set`, creating a
// new state — and queueing it for action computation — if this item set has
// not been seen before.
//
// `item_set` is taken by value: the previous signature was `const&` and
// called `move` on it, which silently copied in every case
// (clang-tidy: performance-move-const-arg). By value, temporaries passed by
// build() are moved, and lvalue callers pay the same single copy as before.
ParseStateId add_parse_state(
  SymbolSequence &&preceding_symbols,
  const vector<AuxiliaryNodeInfo> &auxiliary_node_info_list,
  ParseItemSet item_set
) {
  // Tentative id: the next slot in the state table.
  ParseStateId new_state_id = parse_table.states.size();
  auto insertion = state_ids_by_item_set.insert({move(item_set), new_state_id});
  if (insertion.second) {
    // New item set: remember its canonical copy (map keys are reference-
    // stable, so storing a pointer is safe) and enqueue it for processing.
    item_sets_by_state_id.push_back(&insertion.first->first);
    parse_table.states.push_back(ParseState());
    parse_state_queue.push_back({
      move(preceding_symbols),
      auxiliary_node_info_list,
      insertion.first->first,
      new_state_id
    });
    return new_state_id;
  } else {
    // Already known: reuse the existing state's id.
    return insertion.first->second;
  }
}
// Compute the parse actions for state `state_id` from its (already closed)
// item set: Reduce/Accept actions for finished items, Shift actions for each
// successor state, ShiftExtra for extra tokens, plus conflict detection and
// coincident-token bookkeeping. Returns a conflict description on an
// unresolvable conflict, or "" on success.
string add_actions(
SymbolSequence &&sequence,
vector<AuxiliaryNodeInfo> &&auxiliary_node_info_list,
ParseItemSet &&item_set,
ParseStateId state_id
) {
// Successor kernels keyed by the symbol that is consumed to reach them.
map<Symbol, ParseItemSet> terminal_successors;
map<Symbol::Index, ParseItemSet> nonterminal_successors;
set<Symbol> lookaheads_with_conflicts;
for (const auto &pair : item_set.entries) {
const ParseItem &item = pair.first;
const LookaheadSet &lookahead_symbols = pair.second;
// If the item is finished, immediately add a Reduce or Accept action to
// the parse table for each of its lookahead terminals.
if (item.is_done()) {
ParseAction action = item.lhs() == rules::START() ?
ParseAction::Accept() :
ParseAction::Reduce(
item.lhs(),
item.step_index,
item.precedence(),
item.production->dynamic_precedence,
item.associativity(),
get_alias_sequence_id(*item.production)
);
lookahead_symbols.for_each([&](Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
// Only add the highest-precedence Reduce actions to the parse table.
// If other lower-precedence actions are possible, ignore them.
if (entry.actions.empty()) {
entry.actions.push_back(action);
} else {
ParseAction &existing_action = entry.actions[0];
if (existing_action.type == ParseActionTypeAccept) {
entry.actions.push_back(action);
} else {
if (action.precedence > existing_action.precedence) {
// Higher precedence wins outright; any earlier conflict on this
// lookahead is resolved.
entry.actions.assign({action});
lookaheads_with_conflicts.erase(lookahead);
} else if (action.precedence == existing_action.precedence) {
// Equal precedence: a genuine reduce/reduce conflict.
entry.actions.push_back(action);
lookaheads_with_conflicts.insert(lookahead);
}
}
}
return true;
});
// If the item is unfinished, create a new item by advancing one symbol.
// Add that new item to a successor item set.
} else {
Symbol symbol = item.production->at(item.step_index).symbol;
ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
if (symbol.is_non_terminal()) {
// For auxiliary (repeat) nodes, record which non-auxiliary variables
// use them, so conflict messages can name the real parents.
if (grammar.variables[symbol.index].type == VariableTypeAuxiliary) {
vector<Symbol> parents;
// NOTE: this inner `item` intentionally shadows the outer `item`.
for (auto &item : item_set.entries) {
Symbol parent_symbol = item.first.lhs();
if (
item.first.next_symbol() == symbol &&
grammar.variables[parent_symbol.index].type != VariableTypeAuxiliary &&
!parent_symbol.is_built_in()
) {
parents.push_back(parent_symbol);
}
}
auxiliary_node_info_list.push_back({symbol, parents});
}
nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
} else {
terminal_successors[symbol].entries[new_item] = lookahead_symbols;
}
}
}
// Add a Shift action for each possible successor state. Shift actions for
// terminal lookaheads can conflict with Reduce actions added previously.
for (auto &pair : terminal_successors) {
Symbol lookahead = pair.first;
ParseItemSet &next_item_set = pair.second;
ParseStateId next_state_id = add_parse_state(
append_symbol(sequence, lookahead),
auxiliary_node_info_list,
next_item_set
);
// A pre-existing entry means a Reduce was already added: shift/reduce conflict.
if (!parse_table.states[state_id].terminal_entries[lookahead].actions.empty()) {
lookaheads_with_conflicts.insert(lookahead);
}
parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id));
}
// Add a Shift action for each non-terminal transition.
for (auto &pair : nonterminal_successors) {
Symbol lookahead = Symbol::non_terminal(pair.first);
ParseItemSet &next_item_set = pair.second;
ParseStateId next_state_id = add_parse_state(
append_symbol(sequence, lookahead),
auxiliary_node_info_list,
next_item_set
);
parse_table.set_nonterminal_action(state_id, lookahead.index, next_state_id);
}
// Try to resolve each conflict via precedence/associativity; a non-empty
// string is an unresolvable conflict and aborts the build.
for (Symbol lookahead : lookaheads_with_conflicts) {
string conflict = handle_conflict(lookahead, item_set, sequence, auxiliary_node_info_list, state_id);
if (!conflict.empty()) return conflict;
}
// Extra tokens (e.g. whitespace/comments) may be shifted-as-extra wherever
// they are not already valid lookaheads, or wherever the state shifts.
ParseAction shift_extra = ParseAction::ShiftExtra();
ParseState &state = parse_table.states[state_id];
for (const Symbol &extra_symbol : grammar.extra_tokens) {
if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action()) {
parse_table.add_terminal_action(state_id, extra_symbol, shift_extra);
}
}
// Record every unordered pair of (non-built-in, non-external) tokens that
// are valid together in this state; the lex table builder uses this index.
auto &terminals = state.terminal_entries;
for (auto iter = terminals.begin(), end = terminals.end(); iter != end; ++iter) {
if (iter->first.is_built_in() || iter->first.is_external()) continue;
for (auto other_iter = terminals.begin(); other_iter != iter; ++other_iter) {
if (other_iter->first.is_built_in() || other_iter->first.is_external()) continue;
coincident_token_index.entries[{
other_iter->first.index,
iter->first.index
}].insert(state_id);
}
}
return "";
}
// Clear the precedence and associativity values from every action (they are
// only needed while resolving conflicts), then drop any actions that became
// duplicates as a result — so that equivalent states can later be merged.
void remove_precedence_values() {
  for (ParseState &state : parse_table.states) {
    for (auto &entry : state.terminal_entries) {
      auto &actions = entry.second.actions;
      for (ParseAction &action : actions) {
        action.precedence = 0;
        action.associativity = rules::AssociativityNone;
      }
      // Deduplicate, keeping the first occurrence of each action. The
      // previous code called `actions.erase(i)` and then kept using `i`,
      // which is undefined behavior; using the iterator returned by
      // `erase` is both correct and simpler.
      for (auto i = actions.begin(); i != actions.end();) {
        if (find(actions.begin(), i, *i) != i) {
          i = actions.erase(i);
        } else {
          ++i;
        }
      }
    }
  }
}
// Merge parse states that behave identically. States are bucketed by the
// signature of their unfinished items; within each bucket, pairs are merged
// with merge_parse_state() until a fixed point is reached, after which all
// references to merged-away states are redirected and the states deleted.
void remove_duplicate_parse_states() {
LOG_START("removing duplicate parse states");
unordered_map<size_t, set<ParseStateId>> state_indices_by_signature;
for (auto &pair : state_ids_by_item_set) {
const ParseItemSet &item_set = pair.first;
ParseStateId state_id = pair.second;
state_indices_by_signature[item_set.unfinished_item_signature()].insert(state_id);
}
set<ParseStateId> deleted_states;
// Repeat until one whole pass produces no merges: a merge can make two
// previously-distinct states identical.
while (true) {
map<ParseStateId, ParseStateId> state_replacements;
for (auto &pair : state_indices_by_signature) {
auto &state_indices = pair.second;
for (auto i = state_indices.begin(), end = state_indices.end(); i != end;) {
// Try to merge state *i into each state j that precedes it in the
// (ordered) bucket. The inner loop always breaks: either a merge
// succeeds (and *i is erased), or j reaches *i (and i is advanced).
// set::erase invalidates only the erased element's iterators, so
// both loops and the cached `end` remain valid.
for (ParseStateId j : state_indices) {
if (j == *i) {
++i;
break;
}
if (!state_replacements.count(j) && merge_parse_state(j, *i)) {
state_replacements.insert({*i, j});
deleted_states.insert(*i);
i = state_indices.erase(i);
break;
}
}
}
}
if (state_replacements.empty()) break;
// Redirect every reference to a merged-away state at its replacement.
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
if (!state_replacements.count(i)) {
ParseState &state = parse_table.states[i];
state.each_referenced_state([&state_replacements](ParseStateId *state_index) {
auto replacement = state_replacements.find(*state_index);
if (replacement != state_replacements.end()) {
*state_index = replacement->second;
}
});
}
}
}
delete_parse_states(deleted_states);
}
// Remove "unit reduction states": states whose only job is to reduce a single
// hidden symbol, which the parser can skip entirely by shifting directly to
// the state the reduction would lead to.
void eliminate_unit_reductions() {
// Symbols that appear with an alias anywhere in the grammar cannot have
// their unit reductions eliminated (the alias would be lost).
set<Symbol> aliased_symbols;
for (auto &variable : grammar.variables) {
for (auto &production : variable.productions) {
for (auto &step : production) {
if (!step.alias.value.empty()) {
aliased_symbols.insert(step.symbol);
}
}
}
}
// Find all the "unit reduction states" - states whose only actions are unit reductions,
// all of which reduce by the same symbol. Store the symbols along with the state indices.
unordered_map<ParseStateId, Symbol::Index> unit_reduction_states;
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
ParseState &state = parse_table.states[i];
bool only_unit_reductions = true;
Symbol::Index unit_reduction_symbol = -1;
if (!state.nonterminal_entries.empty()) continue;
for (auto &entry : state.terminal_entries) {
for (ParseAction &action : entry.second.actions) {
if (action.extra) continue;
// A qualifying unit reduction: consumes one symbol, no aliases, and a
// hidden (non-named) variable, consistent across all actions.
if (action.type == ParseActionTypeReduce &&
action.consumed_symbol_count == 1 &&
action.alias_sequence_id == 0 &&
!simple_aliases.count(action.symbol) &&
!aliased_symbols.count(action.symbol) &&
grammar.variables[action.symbol.index].type != VariableTypeNamed &&
(unit_reduction_symbol == -1 || unit_reduction_symbol == action.symbol.index)
) {
unit_reduction_symbol = action.symbol.index;
} else {
only_unit_reductions = false;
break;
}
}
if (!only_unit_reductions) break;
}
if (only_unit_reductions) unit_reduction_states[i] = unit_reduction_symbol;
}
// Update each parse state so that the parser never enters these "unit reduction states".
// If a shift action points to a unit reduction state, update it to point directly at
// the same state as the shift action that's associated with the unit reduction's
// non-terminal.
for (ParseState &state : parse_table.states) {
bool done = false;
// Loop until stable: a redirect may point at another unit reduction state.
while (!done) {
done = true;
state.each_referenced_state([&](ParseStateId *state_id) {
const auto &unit_reduction_entry = unit_reduction_states.find(*state_id);
if (unit_reduction_entry != unit_reduction_states.end()) {
// NOTE(review): this `find` is assumed to succeed — a state that
// shifts into a unit reduction state presumably has a nonterminal
// entry for the reduced symbol; dereferencing `end()` here would
// be UB. TODO confirm the invariant.
auto entry_for_reduced_symbol = state.nonterminal_entries.find(unit_reduction_entry->second);
*state_id = entry_for_reduced_symbol->second;
done = false;
}
});
}
}
// Remove the unit reduction states from the parse table.
// State 1 (the start state) is never deleted.
set<ParseStateId> states_to_delete;
for (auto &entry : unit_reduction_states) {
if (entry.first != 1) states_to_delete.insert(entry.first);
}
delete_parse_states(states_to_delete);
}
// Record, in the parse table's symbol set, every terminal that appears as a
// lookahead in any state.
void populate_used_terminals() {
  for (const ParseState &parse_state : parse_table.states) {
    for (const auto &terminal_entry : parse_state.terminal_entries) {
      parse_table.symbols.insert(terminal_entry.first);
    }
  }
}
// Does this parse state already have the given set of actions, for some lookahead token?
static bool has_actions(const ParseState &state, const ParseTableEntry &entry) {
  const auto &terminal_entries = state.terminal_entries;
  return std::any_of(
    terminal_entries.begin(),
    terminal_entries.end(),
    [&entry](const auto &existing) {
      return existing.second.actions == entry.actions;
    }
  );
}
// Can we add the given entry into the given parse state without affecting
// the behavior of the parser for valid inputs?
bool can_add_entry_to_state(const ParseState &state, Symbol new_token, const ParseTableEntry &entry) {
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if (entry.actions.back().type != ParseActionTypeReduce) return false;
if (!has_actions(state, entry)) return false;
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if (new_token.is_external()) return false;
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
for (const ExternalToken &external_token : grammar.external_tokens) {
if (external_token.corresponding_internal_token == new_token) return false;
}
// Do not add a token if it conflicts with an existing token.
if (!new_token.is_built_in()) {
// NOTE: this loop's `entry` intentionally shadows the parameter `entry`.
for (const auto &entry : state.terminal_entries) {
if (lex_table_builder->does_token_shadow_other(new_token, entry.first) ||
lex_table_builder->does_token_match_same_string_as_other(new_token, entry.first)) {
// Log each conflicting token pair only once per build.
LOG_IF(
logged_conflict_tokens.insert({entry.first, new_token}).second,
"cannot merge parse states due to token conflict: %s and %s",
symbol_name(entry.first).c_str(),
symbol_name(new_token).c_str()
);
return false;
}
}
}
return true;
}
// If the parse states at the given indices are mergeable, merge the second one
// into the first one.
// Returns false without modifying anything if the states cannot be merged;
// otherwise copies the right state's extra lookahead entries into the left
// state and returns true.
bool merge_parse_state(size_t left_index, size_t right_index) {
ParseState &left_state = parse_table.states[left_index];
ParseState &right_state = parse_table.states[right_index];
// Non-terminal transitions must match exactly.
if (left_state.nonterminal_entries != right_state.nonterminal_entries) return false;
// Every lookahead in the left state must either match the right state's
// actions exactly, or be safely addable to the right state.
for (auto &left_entry : left_state.terminal_entries) {
Symbol lookahead = left_entry.first;
const auto &right_entry = right_state.terminal_entries.find(lookahead);
if (right_entry == right_state.terminal_entries.end()) {
if (!can_add_entry_to_state(right_state, lookahead, left_entry.second)) return false;
} else {
if (right_entry->second.actions != left_entry.second.actions) return false;
}
}
// Symmetrically, collect the right state's extra lookaheads that can be
// added to the left state; bail before mutating anything if one cannot.
set<Symbol> symbols_to_merge;
for (auto &right_entry : right_state.terminal_entries) {
Symbol lookahead = right_entry.first;
const auto &left_entry = left_state.terminal_entries.find(lookahead);
if (left_entry == left_state.terminal_entries.end()) {
if (!can_add_entry_to_state(left_state, lookahead, right_entry.second)) return false;
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol &lookahead : symbols_to_merge) {
left_state.terminal_entries[lookahead] = right_state.terminal_entries[lookahead];
}
return true;
}
string handle_conflict(
Symbol lookahead,
const ParseItemSet &item_set,
const SymbolSequence &preceding_symbols,
const vector<AuxiliaryNodeInfo> &auxiliary_node_info_list,
ParseStateId state_id
) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
bool considered_associativity = false;
int reduction_precedence = entry.actions.front().precedence;
PrecedenceRange shift_precedence;
set<ParseItem> conflicting_items;
for (auto &pair : item_set.entries) {
const ParseItem &item = pair.first;
if (item.is_done()) {
if (pair.second.contains(lookahead)) {
conflicting_items.insert(item);
}
} else if (item.step_index > 0) {
LookaheadSet first_set = item_set_builder.get_first_set(item.next_symbol());
if (first_set.contains(lookahead)) {
shift_precedence.add(item.production->at(item.step_index - 1).precedence);
conflicting_items.insert(item);
}
}
}
if (entry.actions.back().type == ParseActionTypeShift) {
Symbol symbol = conflicting_items.begin()->lhs();
if (symbol.is_non_terminal() && grammar.variables[symbol.index].type == VariableTypeAuxiliary) {
bool all_symbols_match = true;
for (const ParseItem &conflicting_item : conflicting_items) {
if (conflicting_item.lhs() != symbol) {
all_symbols_match = false;
break;
}
}
if (all_symbols_match) {
entry.actions.back().repetition = true;
return "";
}
}
// If the shift action has higher precedence, prefer it over any of the
// reduce actions.
if (shift_precedence.min > reduction_precedence ||
(shift_precedence.min == reduction_precedence &&
shift_precedence.max > reduction_precedence)) {
entry.actions.assign({entry.actions.back()});
}
// If the shift action has lower precedence, prefer the reduce actions.
else if (shift_precedence.max < reduction_precedence ||
(shift_precedence.max == reduction_precedence &&
shift_precedence.min < reduction_precedence)) {
entry.actions.pop_back();
for (auto item_iter = conflicting_items.begin(); item_iter != conflicting_items.end();) {
if (item_iter->is_done()) {
++item_iter;
} else {
item_iter = conflicting_items.erase(item_iter);
}
}
}
// If the shift action has the same precedence as the reduce actions,
// consider the reduce actions' associativity. If they are all left
// associative, prefer the reduce actions. If they are all right
// associative, prefer the shift.
else if (shift_precedence.min == reduction_precedence &&
shift_precedence.max == reduction_precedence) {
considered_associativity = true;
bool has_non_associative_reductions = false;
bool has_left_associative_reductions = false;
bool has_right_associative_reductions = false;
for (const ParseAction &action : entry.actions) {
if (action.type != ParseActionTypeReduce) break;
switch (action.associativity) {
case rules::AssociativityLeft:
has_left_associative_reductions = true;
break;
case rules::AssociativityRight:
has_right_associative_reductions = true;
break;
default:
has_non_associative_reductions = true;
break;
}
}
if (!has_non_associative_reductions) {
if (has_right_associative_reductions && !has_left_associative_reductions) {
entry.actions.assign({entry.actions.back()});
} else if (has_left_associative_reductions && !has_right_associative_reductions) {
entry.actions.pop_back();
}
}
}
}
if (entry.actions.size() == 1) return "";
set<Symbol> actual_conflict;
for (const ParseItem &item : conflicting_items) {
Symbol symbol = item.lhs();
if (grammar.variables[symbol.index].type == VariableTypeAuxiliary) {
bool found_auxiliary_node_info = false;
for (
auto iter = auxiliary_node_info_list.rbegin(),
end = auxiliary_node_info_list.rend();
iter != end;
++iter
) {
if (iter->auxiliary_node == symbol) {
found_auxiliary_node_info = true;
actual_conflict.insert(iter->parents.begin(), iter->parents.end());
break;
}
}
assert(found_auxiliary_node_info);
} else {
actual_conflict.insert(symbol);
}
}
for (const auto &expected_conflict : grammar.expected_conflicts) {
if (expected_conflict == actual_conflict) return "";
}
string description = "Unresolved conflict for symbol sequence:\n\n";
for (auto &symbol : preceding_symbols) {
description += " " + symbol_name(symbol);
}
const string dot = "\xE2\x80\xA2";
const string ellipsis = "\xE2\x80\xA6";
description += " " + dot + " " + symbol_name(lookahead) + " " + ellipsis;
description += "\n\n";
description += "Possible interpretations:\n\n";
size_t interpretation_count = 1;
for (const ParseItem &item : conflicting_items) {
description += " " + to_string(interpretation_count++) + ":";
for (size_t i = 0; i < preceding_symbols.size() - item.step_index; i++) {
description += " " + symbol_name(preceding_symbols[i]);
}
description += " (" + symbol_name(item.lhs());
for (size_t i = 0; i < item.production->size(); i++) {
if (i == item.step_index) {
description += " " + dot;
}
description += " " + symbol_name(item.production->at(i).symbol);
}
description += ")";
if (item.is_done()) {
description += " " + dot + " " + symbol_name(lookahead) + " " + ellipsis;
}
description += "\n";
}
description += "\nPossible resolutions:\n\n";
size_t resolution_count = 1;
if (actual_conflict.size() > 1) {
if (entry.actions.back().type == ParseActionTypeShift) {
description += " " + to_string(resolution_count++) + ": ";
description += "Specify a higher precedence in";
bool is_first = true;
for (Symbol conflict_symbol : actual_conflict) {
for (const ParseItem &parse_item : conflicting_items) {
if (parse_item.lhs() == conflict_symbol && !parse_item.is_done()) {
if (!is_first) description += " and";
description += " `" + symbol_name(conflict_symbol) + "`";
is_first = false;
break;
}
}
}
description += " than in the other rules.\n";
}
for (const ParseAction &action : entry.actions) {
if (action.type == ParseActionTypeReduce) {
description += " " + to_string(resolution_count++) + ": ";
description += "Specify a higher precedence in `";
description += symbol_name(action.symbol);
description += "` than in the other rules.\n";
}
}
}
if (considered_associativity) {
description += " " + to_string(resolution_count++) + ": ";
description += "Specify a left or right associativity in";
bool is_first = true;
for (const ParseAction &action : entry.actions) {
if (action.type == ParseActionTypeReduce) {
if (!is_first) description += " and";
description += " `" + symbol_name(action.symbol) + "`";
is_first = false;
}
}
description += "\n";
}
description += " " + to_string(resolution_count++) + ": ";
description += "Add a conflict for these rules:";
for (Symbol conflict_symbol : actual_conflict) {
description += " `" + symbol_name(conflict_symbol) + "`";
}
description += "\n";
return description;
}
// Remove the given states from the parse table, renumbering every state
// reference held by the surviving states so that they point at the
// states' new positions.
//
// `deleted_states` is now taken by const reference; the previous
// signature copied the entire set on every call.
void delete_parse_states(const set<ParseStateId> &deleted_states) {
  // Compute, for each existing state id, the id it will have once the
  // deleted states are removed. `deleted_states` is an ordered set, so a
  // single forward scan with a running count of deletions suffices.
  vector<ParseStateId> new_state_ids(parse_table.states.size());
  size_t deleted_state_count = 0;
  auto deleted_state_iter = deleted_states.begin();
  for (ParseStateId i = 0; i < new_state_ids.size(); i++) {
    while (deleted_state_iter != deleted_states.end() && *deleted_state_iter < i) {
      deleted_state_count++;
      deleted_state_iter++;
    }
    new_state_ids[i] = i - deleted_state_count;
  }

  // Erase the deleted states, and rewrite every state reference in the
  // remaining states using the mapping computed above.
  ParseStateId original_state_index = 0;
  auto iter = parse_table.states.begin();
  while (iter != parse_table.states.end()) {
    if (deleted_states.count(original_state_index)) {
      iter = parse_table.states.erase(iter);
    } else {
      ParseState &state = *iter;
      state.each_referenced_state([&new_state_ids](ParseStateId *state_index) {
        *state_index = new_state_ids[*state_index];
      });
      ++iter;
    }
    original_state_index++;
  }
}
// Return a human-readable name for the given symbol, as used in
// conflict descriptions. Anonymous lexical tokens are wrapped in quotes;
// built-in symbols other than END_OF_INPUT get an empty name.
string symbol_name(const rules::Symbol &symbol) const {
  if (symbol.is_built_in()) {
    return symbol == END_OF_INPUT() ? "END_OF_INPUT" : "";
  }

  if (symbol.type == Symbol::Terminal) {
    const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
    if (variable.type == VariableTypeNamed) return variable.name;
    return "'" + variable.name + "'";
  }

  if (symbol.type == Symbol::NonTerminal) {
    return grammar.variables[symbol.index].name;
  }

  // External symbols (and any other symbol type) are named by the
  // grammar's external token list.
  return grammar.external_tokens[symbol.index].name;
}
unsigned get_alias_sequence_id(const Production &production) {
bool has_alias = false;
AliasSequence alias_sequence;
for (unsigned i = 0, n = production.size(); i < n; i++) {
auto &step = production.at(i);
if (!step.alias.value.empty()) {
has_alias = true;
alias_sequence.resize(i + 1);
alias_sequence[i] = step.alias;
}
}
if (has_alias && production.size() > parse_table.max_alias_sequence_length) {
parse_table.max_alias_sequence_length = production.size();
}
auto begin = parse_table.alias_sequences.begin();
auto end = parse_table.alias_sequences.end();
auto iter = find(begin, end, alias_sequence);
if (iter != end) {
return iter - begin;
} else {
parse_table.alias_sequences.push_back(move(alias_sequence));
return parse_table.alias_sequences.size() - 1;
}
}
// Return a copy of `sequence` with `symbol` appended.
//
// As a side effect, record terminal adjacency: every non-built-in
// terminal that can end `sequence` is recorded in
// `following_tokens_by_token` as possibly followed by every non-built-in
// terminal that can begin `symbol`.
SymbolSequence append_symbol(const SymbolSequence &sequence, const Symbol &symbol) {
  if (!sequence.empty()) {
    const LookaheadSet &left_tokens = item_set_builder.get_last_set(sequence.back());
    const LookaheadSet &right_tokens = item_set_builder.get_first_set(symbol);
    if (!left_tokens.empty() && !right_tokens.empty()) {
      left_tokens.for_each([&](Symbol left_symbol) {
        if (left_symbol.is_terminal() && !left_symbol.is_built_in()) {
          right_tokens.for_each([&](Symbol right_symbol) {
            if (right_symbol.is_terminal() && !right_symbol.is_built_in()) {
              following_tokens_by_token[left_symbol].insert(right_symbol);
            }
            return true;
          });
        }
        return true;
      });
    }
  }

  // Previously this default-constructed `sequence.size() + 1` elements and
  // immediately discarded them via `assign`; reserving capacity avoids
  // that wasted work while still performing a single allocation.
  SymbolSequence result;
  result.reserve(sequence.size() + 1);
  result.assign(sequence.begin(), sequence.end());
  result.push_back(symbol);
  return result;
}
};
// Construct a parse-table builder for the given grammars.
//
// The returned object is always a ParseTableBuilderImpl;
// ParseTableBuilder::build relies on that by downcasting, so this factory
// is the only supported way to obtain a builder.
unique_ptr<ParseTableBuilder> ParseTableBuilder::create(
  const SyntaxGrammar &syntax_grammar,
  const LexicalGrammar &lexical_grammar,
  const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases
) {
  return unique_ptr<ParseTableBuilder>(new ParseTableBuilderImpl(
    syntax_grammar,
    lexical_grammar,
    simple_aliases
  ));
}
// Delegate to the implementation class. The static_cast is safe only
// because `create` is the sole way to construct a ParseTableBuilder and
// it always allocates a ParseTableBuilderImpl.
ParseTableBuilder::BuildResult ParseTableBuilder::build() {
  return static_cast<ParseTableBuilderImpl *>(this)->build();
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,43 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_
#define COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_
#include <memory>
#include <unordered_map>
#include "compiler/parse_table.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
struct ParseTable;
struct LexTable;
struct SyntaxGrammar;
struct LexicalGrammar;
namespace build_tables {
// Builds the parse table (and associated lex tables) for a grammar.
// Obtain an instance via `create`, then call `build` once.
class ParseTableBuilder {
 public:
  static std::unique_ptr<ParseTableBuilder> create(
    const SyntaxGrammar &,
    const LexicalGrammar &,
    const std::unordered_map<rules::Symbol, rules::Alias> &
  );

  // The complete output of table construction. `error` describes why
  // construction failed, if it did.
  struct BuildResult {
    ParseTable parse_table;
    LexTable main_lex_table;
    LexTable keyword_lex_table;
    rules::Symbol keyword_capture_token;
    CompileError error;
  };

  BuildResult build();

 protected:
  // Not directly constructible; `create` returns the implementation type,
  // which `build` downcasts to.
  ParseTableBuilder() = default;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_

View file

@ -1,447 +0,0 @@
#include <algorithm>
#include <deque>
#include <map>
#include <set>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "compiler/property_sheet.h"
#include "compiler/property_table.h"
#include "compiler/build_tables/property_table_builder.h"
#include "compiler/util/hash_combine.h"
using std::deque;
using std::vector;
using std::pair;
using std::unordered_map;
using std::set;
using std::move;
using std::map;
namespace tree_sitter {
namespace build_tables {
// A position within a selector for a particular rule set.
// For example, in a selector like `a > b`, this might
// describe the state of having descended into an `a`,
// but not a `b`.
struct PropertyItem {
  unsigned rule_id;      // index of the rule within the property sheet
  unsigned selector_id;  // index of the selector within the rule
  unsigned step_id;      // number of selector steps already matched

  bool operator==(const PropertyItem &other) const {
    return
      rule_id == other.rule_id &&
      selector_id == other.selector_id &&
      step_id == other.step_id;
  }

  // Lexicographic ordering by (rule_id, selector_id, step_id), so items
  // can live in ordered sets. std::tie replaces the previous hand-written
  // comparison chain, which is easy to get wrong when fields change.
  bool operator<(const PropertyItem &other) const {
    return
      std::tie(rule_id, selector_id, step_id) <
      std::tie(other.rule_id, other.selector_id, other.step_id);
  }
};
// A set of possible positions within different selectors.
// This directly represents a state of the property-matching
// state machine.
struct PropertyItemSet {
  // Ordered so that two equal item sets compare (and hash) identically
  // regardless of insertion order.
  set<PropertyItem> entries;

  bool operator==(const PropertyItemSet &other) const {
    return entries == other.entries;
  }
};
// A set of properties that matched via a certain selector.
// These are ordered according to the usual CSS rules:
// specificity, falling back to the order in the original sheet.
struct PropertySelectorMatch {
  unsigned specificity;             // computed specificity of the matching selector
  unsigned rule_id;                 // position of the rule in the cascade
  unsigned selector_id;             // position of the selector within the rule
  const PropertySet *property_set;  // the matched rule's properties (not owned)

  // Ascending order: less specific and earlier-in-sheet matches sort
  // first, so later iteration lets later/more-specific matches override
  // them. std::tie replaces the previous hand-written comparison chain.
  bool operator<(const PropertySelectorMatch &other) const {
    return
      std::tie(specificity, rule_id, selector_id) <
      std::tie(other.specificity, other.rule_id, other.selector_id);
  }
};
// A transition of the property state machine, paired with bookkeeping
// that is only needed while sorting a state's transitions.
struct PropertyTransitionEntry {
  PropertyTransition transition;
  // The highest rule id whose selector is fully matched by taking this
  // transition; used as the final tie-breaker in `operator<`.
  unsigned latest_matching_rule_id;

  // Number of narrowing constraints carried by this transition: an
  // index constraint and/or a text pattern each contribute one.
  unsigned specificity() const {
    return
      (transition.index == -1 ? 0 : 1) +
      (transition.text_pattern.empty() ? 0 : 1);
  }

  // When using the final state machine, the runtime library computes
  // a node's property by descending from the root of the syntax
  // tree to that node. For each ancestor node on the way, it should
  // update its state using the *first* matching entry of the
  // `transitions` list. Therefore, the order of the transitions
  // must match the normal tie-breaking rules of CSS.
  bool operator<(const PropertyTransitionEntry &other) const {
    // If two transitions match different node types, they can't
    // both match a given node, so their order is arbitrary.
    if (transition.type < other.transition.type) return true;
    if (transition.type > other.transition.type) return false;
    if (transition.named && !other.transition.named) return true;
    if (!transition.named && other.transition.named) return false;

    // More specific transitions should be considered before less
    // specific ones.
    if (specificity() > other.specificity()) return true;
    if (specificity() < other.specificity()) return false;

    // If there are two transitions with a specificity tie (e.g. one
    // with an `:nth-child` pseudo-class and a one with a `:text`
    // pseudo-class), then the one whose matching properties appeared
    // later in the cascade should be considered first.
    return latest_matching_rule_id > other.latest_matching_rule_id;
  }
};
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::util::hash_combine;
// PropertyItemSets must be hashed because, while building the table, a
// map from existing property item sets to state ids is maintained.
template <>
struct hash<tree_sitter::build_tables::PropertyItemSet> {
  size_t operator()(const tree_sitter::build_tables::PropertyItemSet &item_set) const {
    // Fold the set's size plus every field of every item into one value.
    size_t seed = 0;
    hash_combine(&seed, item_set.entries.size());
    for (const tree_sitter::build_tables::PropertyItem &item : item_set.entries) {
      hash_combine(&seed, item.rule_id);
      hash_combine(&seed, item.selector_id);
      hash_combine(&seed, item.step_id);
    }
    return seed;
  }
};
// PropertyTransitions must be hashed because a state's outgoing edges
// are represented as a map from PropertyTransitions to successor
// PropertyItemSets.
template <>
struct hash<tree_sitter::PropertyTransition> {
  size_t operator()(const tree_sitter::PropertyTransition &transition) const {
    // Combine every field of the transition.
    size_t seed = 0;
    hash_combine(&seed, transition.type);
    hash_combine(&seed, transition.named);
    hash_combine(&seed, transition.index);
    hash_combine(&seed, transition.text_pattern);
    hash_combine(&seed, transition.state_id);
    return seed;
  }
};
// PropertySets must be hashed so that we can use a map to dedup them.
template <>
struct hash<tree_sitter::PropertySet> {
size_t operator()(const tree_sitter::PropertySet &set) const {
size_t result = 0;
hash_combine(&result, set.size());
for (const auto &pair : set) {
hash_combine(&result, pair.first);
hash_combine(&result, pair.second);
}
return result;
}
};
} // namespace std
namespace tree_sitter {
namespace build_tables {
// Index of a state in the property table.
typedef unsigned StateId;
// Index of a deduplicated property set in the table.
typedef unsigned PropertySetId;
// Builds the property-matching state machine for a property sheet.
//
// This is a subset construction: each output state corresponds to a set
// of PropertyItems (positions within the sheet's selectors). States are
// explored breadth-first starting from the set that contains the start
// of every selector.
struct PropertyTableBuilder {
  // The input sheet (copied); selectors are addressed by
  // (rule_id, selector_id, step_id) throughout.
  PropertySheet sheet;

  // The table being built.
  PropertyTable result;

  // Maps each item set that already has a state to its id, so equivalent
  // states are shared during construction.
  unordered_map<PropertyItemSet, StateId> ids_by_item_set;

  // Dedup map for the property sets referenced by states.
  unordered_map<PropertySet, PropertySetId> ids_by_property_set;

  // Work queue of states whose transitions still need to be computed.
  deque<pair<PropertyItemSet, StateId>> item_set_queue;

  // NOTE(review): single-argument constructor permits implicit conversion
  // from PropertySheet; consider `explicit`.
  PropertyTableBuilder(const PropertySheet &sheet) : sheet(sheet) {}

  // Run the construction: seed the start state, process the queue until
  // no new item sets appear, then merge duplicate states.
  PropertyTable build() {
    // The start state contains the beginning of every selector of every
    // rule in the sheet.
    PropertyItemSet start_item_set;
    for (unsigned i = 0; i < sheet.size(); i++) {
      PropertyRule &rule = sheet[i];
      for (unsigned j = 0; j < rule.selectors.size(); j++) {
        start_item_set.entries.insert(PropertyItem {i, j, 0});
      }
    }
    add_state(start_item_set);

    // Breadth-first exploration: populate_state may enqueue new states
    // via add_state.
    while (!item_set_queue.empty()) {
      auto entry = item_set_queue.front();
      PropertyItemSet item_set = move(entry.first);
      StateId state_id = entry.second;
      item_set_queue.pop_front();
      populate_state(item_set, state_id);
    }
    remove_duplicate_states();
    return result;
  }

  // Different item sets can actually produce the same state, so the
  // states need to be explicitly deduped as a post-processing step.
  //
  // Each pass finds states identical to an earlier state, redirects all
  // references to the earlier copy, renumbers for the removals, and
  // erases the duplicates; passes repeat until a fixed point, because
  // merging states can make further states identical.
  void remove_duplicate_states() {
    // NOTE(review): `replacements` is inserted into and rewritten below
    // but never consulted when updating transitions (only
    // `new_replacements` is); it appears vestigial — confirm.
    map<StateId, StateId> replacements;

    while (true) {
      // Map each duplicate state to the lowest-numbered identical state.
      map<StateId, StateId> duplicates;
      for (StateId i = 0, size = result.states.size(); i < size; i++) {
        for (StateId j = 0; j < i; j++) {
          if (!duplicates.count(j) && result.states[j] == result.states[i]) {
            duplicates.insert({ i, j });
            break;
          }
        }
      }

      if (duplicates.empty()) break;

      // Compute each state's post-erasure id: its surviving target,
      // shifted down by the number of duplicates removed before it.
      map<StateId, StateId> new_replacements;
      for (StateId i = 0, size = result.states.size(); i < size; i++) {
        StateId new_state_index = i;

        auto duplicate = duplicates.find(i);
        if (duplicate != duplicates.end()) {
          new_state_index = duplicate->second;
        }

        size_t prior_removed = 0;
        for (const auto &duplicate : duplicates) {
          if (duplicate.first >= new_state_index) break;
          prior_removed++;
        }

        new_state_index -= prior_removed;
        new_replacements.insert({i, new_state_index});
        replacements.insert({ i, new_state_index });

        // Keep earlier replacement targets pointing at current indices.
        for (auto &replacement : replacements) {
          if (replacement.second == i) {
            replacement.second = new_state_index;
          }
        }
      }

      // Rewrite all state references (transitions and default successors)
      // using the new numbering.
      for (auto &state : result.states) {
        for (auto &transition : state.transitions) {
          auto new_replacement = new_replacements.find(transition.state_id);
          if (new_replacement != new_replacements.end()) {
            transition.state_id = new_replacement->second;
          }
        }

        auto new_replacement = new_replacements.find(state.default_next_state_id);
        if (new_replacement != new_replacements.end()) {
          state.default_next_state_id = new_replacement->second;
        }
      }

      // Erase in reverse so earlier indices stay valid.
      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
        result.states.erase(result.states.begin() + i->first);
      }
    }
  }

  // Get the next part of the selector that needs to be matched for a given item.
  // This returns null if the item has consumed its entire selector.
  const PropertySelectorStep *next_step_for_item(const PropertyItem &item) {
    const PropertySelector &selector = sheet[item.rule_id].selectors[item.selector_id];
    if (item.step_id < selector.size()) {
      return &selector[item.step_id];
    } else {
      return nullptr;
    }
  }

  // Get the previous part of the selector that was matched for a given item.
  // This returns null if the item has not consumed anything.
  //
  // NOTE(review): this indexes the selector with `item.step_id`, not
  // `item.step_id - 1`, so it returns the step the item is currently
  // *waiting on* — the same step `next_step_for_item` yields. Callers
  // only read `is_immediate` from the result, which is consistent with
  // `is_immediate` living on the *following* step, but the name, the
  // doc comment, and the `step_id > 0` guard all suggest `step_id - 1`
  // was intended — confirm. Also, for items that have consumed their
  // whole selector, `selector[item.step_id]` forms a one-past-the-end
  // reference (never dereferenced by current callers, but suspect).
  const PropertySelectorStep *prev_step_for_item(const PropertyItem &item) {
    if (item.step_id > 0) {
      return &sheet[item.rule_id].selectors[item.selector_id][item.step_id];
    } else {
      return nullptr;
    }
  }

  // CSS-style specificity: one point per step, plus one point for each
  // index constraint and each text pattern on a step.
  unsigned specificity_for_selector(const PropertySelector &selector) {
    unsigned result = selector.size();
    for (const PropertySelectorStep &step : selector) {
      if (step.index != -1) result++;
      if (!step.text_pattern.empty()) result++;
    }
    return result;
  }

  // Check if the given state transition matches the given part of a selector.
  // A step's index of -1 and an empty text pattern act as wildcards.
  bool step_matches_transition(const PropertySelectorStep &step, const PropertyTransition &transition) {
    return
      step.type == transition.type &&
      step.named == transition.named &&
      (step.index == transition.index || step.index == -1) &&
      (step.text_pattern == transition.text_pattern || step.text_pattern.empty());
  }

  // Compute the outgoing transitions, default successor, and property
  // set for the state identified by `state_id` / `item_set`.
  void populate_state(const PropertyItemSet &item_set, StateId state_id) {
    unordered_map<PropertyTransition, PropertyItemSet> transitions;
    vector<PropertySelectorMatch> selector_matches;

    for (const PropertyItem &item : item_set.entries) {
      const PropertySelectorStep *next_step = next_step_for_item(item);

      // If this item has more elements to match for its selector, then
      // there's a state transition for elements that match the next
      // part of the selector.
      if (next_step) {
        transitions[PropertyTransition{
          next_step->type,
          next_step->named,
          next_step->index,
          next_step->text_pattern,
          0
        }] = PropertyItemSet();
      }

      // If the item has matched its entire selector, then the property set
      // for the item's rule applies in this state.
      else {
        const PropertyRule &rule = sheet[item.rule_id];
        selector_matches.push_back(PropertySelectorMatch {
          specificity_for_selector(rule.selectors[item.selector_id]),
          item.rule_id,
          item.selector_id,
          &rule.properties,
        });
      }
    }

    // For each element that follows an item in this set,
    // compute the next item set after descending through that element.
    vector<PropertyTransitionEntry> transition_list;
    for (auto &pair : transitions) {
      PropertyTransition transition = pair.first;
      PropertyItemSet &next_item_set = pair.second;
      unsigned latest_matching_rule_id = 0;

      for (const PropertyItem &item : item_set.entries) {
        const PropertySelectorStep *next_step = next_step_for_item(item);
        const PropertySelectorStep *prev_step = prev_step_for_item(item);
        if (next_step) {
          // If the element matches the next part of the item, advance the
          // item to the next part of its selector.
          if (step_matches_transition(*next_step, transition)) {
            PropertyItem next_item = item;
            next_item.step_id++;
            next_item_set.entries.insert(next_item);

            // If the item is at the end of its selector, record its rule id
            // so that it can be used when sorting the transitions.
            if (!next_step_for_item(next_item) && next_item.rule_id > latest_matching_rule_id) {
              latest_matching_rule_id = item.rule_id;
            }
          }

          // If the element does not match, and the item is in the middle
          // of an immediate child selector, then remove it from the
          // next item set. Otherwise, keep it unchanged.
          // (Note the un-advanced item is kept even when it also matched
          // above — descendant selectors can match at multiple depths.)
          if (!prev_step || !prev_step->is_immediate) {
            next_item_set.entries.insert(item);
          }
        }
      }

      transition.state_id = add_state(next_item_set);
      transition_list.push_back(PropertyTransitionEntry {transition, latest_matching_rule_id});
    }

    // Transitions are consulted first-match-wins at runtime, so sort
    // them per the CSS tie-breaking order (see PropertyTransitionEntry).
    std::sort(transition_list.begin(), transition_list.end());
    for (auto &entry : transition_list) {
      result.states[state_id].transitions.push_back(entry.transition);
    }

    // Compute the default successor item set - the item set that
    // we should advance to if the next element doesn't match any
    // of the next elements in the item set's selectors.
    PropertyItemSet default_next_item_set;
    for (const PropertyItem &item : item_set.entries) {
      const PropertySelectorStep *next_step = next_step_for_item(item);
      const PropertySelectorStep *prev_step = prev_step_for_item(item);
      if (next_step && (!prev_step || !prev_step->is_immediate)) {
        default_next_item_set.entries.insert(item);
      }
    }
    StateId default_next_state_id = add_state(default_next_item_set);
    result.states[state_id].default_next_state_id = default_next_state_id;

    // Sort the matching property sets by ascending specificity and by
    // their order in the sheet. This way, more specific selectors and later
    // rules will override less specific selectors and earlier rules.
    PropertySet properties;
    std::sort(selector_matches.begin(), selector_matches.end());
    for (auto &match : selector_matches) {
      for (auto &pair : *match.property_set) {
        properties[pair.first] = pair.second;
      }
    }

    // Add the final property set to the deduped list.
    result.states[state_id].property_set_id = add_property_set(properties);
  }

  // Return the state id for the given item set, creating a new (empty)
  // state and enqueueing the item set for population if it is new.
  StateId add_state(const PropertyItemSet &item_set) {
    auto entry = ids_by_item_set.find(item_set);
    if (entry == ids_by_item_set.end()) {
      StateId id = result.states.size();
      ids_by_item_set[item_set] = id;
      result.states.push_back(PropertyState {});
      item_set_queue.push_back({item_set, id});
      return id;
    } else {
      return entry->second;
    }
  }

  // Return the id of the given property set in the deduplicated list,
  // appending it if it has not been seen before.
  PropertySetId add_property_set(const PropertySet &property_set) {
    auto entry = ids_by_property_set.find(property_set);
    if (entry == ids_by_property_set.end()) {
      PropertySetId id = result.property_sets.size();
      ids_by_property_set[property_set] = id;
      result.property_sets.push_back(property_set);
      return id;
    } else {
      return entry->second;
    }
  }
};
// Construct the property-matching state machine for the given sheet.
PropertyTable build_property_table(const PropertySheet &sheet) {
  PropertyTableBuilder builder(sheet);
  return builder.build();
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,15 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_
#define COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_
#include <memory>
#include "compiler/property_table.h"
namespace tree_sitter {
namespace build_tables {
// Build the property-matching state machine for the given property sheet.
PropertyTable build_property_table(const PropertySheet &);
}  // namespace build_tables
}  // namespace tree_sitter
#endif  // COMPILER_BUILD_TABLES_PROPERTY_TABLE_BUILDER_H_

Some files were not shown because too many files have changed in this diff Show more