Compare commits

..

1 commit

Author: Amaan Qureshi
SHA1: 1a99bfd9ff
Message: wow
Date: 2024-09-26 17:51:46 -04:00
532 changed files with 25306 additions and 51449 deletions


@@ -10,9 +10,6 @@ insert_final_newline = true
 [*.rs]
 indent_size = 4
-[*.{zig,zon}]
-indent_size = 4
 [Makefile]
 indent_style = tab
 indent_size = 8

.envrc

@@ -1 +0,0 @@
-use flake

.gitattributes vendored

@@ -3,4 +3,5 @@
 /lib/src/unicode/*.h linguist-vendored
 /lib/src/unicode/LICENSE linguist-vendored
+/cli/src/generate/prepare_grammar/*.json -diff
 Cargo.lock -diff

.github/FUNDING.yml vendored

@@ -1,15 +0,0 @@
# These are supported funding model platforms
github: tree-sitter
patreon: # Replace with a single Patreon username
open_collective: tree-sitter # Replace with a single Open Collective username
ko_fi: amaanq
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
thanks_dev: # Replace with a single thanks.dev username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']


@@ -1,6 +1,6 @@
 name: Bug Report
 description: Report a problem
-type: Bug
+labels: [bug]
 body:
 - type: textarea
 attributes:
@@ -13,11 +13,9 @@ body:
 attributes:
 label: "Steps to reproduce"
 placeholder: |
-```sh
 git clone --depth=1 https://github.com/tree-sitter/tree-sitter-ruby
 cd tree-sitter-ruby
 tree-sitter generate
-```
 validations:
 required: true


@@ -1,6 +1,6 @@
 name: Feature request
 description: Request an enhancement
-type: Feature
+labels: [enhancement]
 body:
 - type: markdown
 attributes:


@@ -1,25 +1,24 @@
-name: Cache
+name: 'Cache'
-description: "This action caches fixtures"
+description: This action caches fixtures
 outputs:
 cache-hit:
-description: Cache hit
+description: 'Cache hit'
-value: ${{ steps.cache.outputs.cache-hit }}
+value: ${{ steps.cache_output.outputs.cache-hit }}
 runs:
-using: composite
+using: "composite"
 steps:
 - uses: actions/cache@v4
-id: cache
+id: cache_fixtures
 with:
 path: |
 test/fixtures/grammars
 target/release/tree-sitter-*.wasm
 key: fixtures-${{ join(matrix.*, '_') }}-${{ hashFiles(
-'crates/generate/src/**',
+'cli/src/generate/**',
-'lib/src/parser.h',
+'script/generate-fixtures*',
-'lib/src/array.h',
-'lib/src/alloc.h',
 'test/fixtures/grammars/*/**/src/*.c',
 '.github/actions/cache/action.yml') }}
+- run: echo "cache-hit=${{ steps.cache_fixtures.outputs.cache-hit }}" >> $GITHUB_OUTPUT
+shell: bash
+id: cache_output


@@ -4,8 +4,6 @@ updates:
 directory: "/"
 schedule:
 interval: "weekly"
-cooldown:
-default-days: 3
 commit-message:
 prefix: "build(deps)"
 labels:
@@ -14,16 +12,10 @@ updates:
 groups:
 cargo:
 patterns: ["*"]
-ignore:
-- dependency-name: "*"
-update-types: ["version-update:semver-major", "version-update:semver-minor"]
 - package-ecosystem: "github-actions"
 directory: "/"
 schedule:
 interval: "weekly"
-cooldown:
-default-days: 3
 commit-message:
 prefix: "ci"
 labels:
@@ -32,22 +24,3 @@ updates:
 groups:
 actions:
 patterns: ["*"]
-- package-ecosystem: "npm"
-versioning-strategy: increase
-directories:
-- "/crates/npm"
-- "/crates/eslint"
-- "/lib/binding_web"
-schedule:
-interval: "weekly"
-cooldown:
-default-days: 3
-commit-message:
-prefix: "build(deps)"
-labels:
-- "dependencies"
-- "npm"
-groups:
-npm:
-patterns: ["*"]


@@ -1,29 +0,0 @@
module.exports = async ({ github, context }) => {
let target = context.payload.issue;
if (target) {
await github.rest.issues.update({
...context.repo,
issue_number: target.number,
state: "closed",
state_reason: "not_planned",
title: "[spam]",
body: "",
type: null,
});
} else {
target = context.payload.pull_request;
await github.rest.pulls.update({
...context.repo,
pull_number: target.number,
state: "closed",
title: "[spam]",
body: "",
});
}
await github.rest.issues.lock({
...context.repo,
issue_number: target.number,
lock_reason: "spam",
});
};

.github/scripts/cross.sh vendored Executable file

@@ -0,0 +1,17 @@
#!/bin/bash
# set -x
set -e
if [ "$BUILD_CMD" != "cross" ]; then
echo "cross.sh - is a helper to assist only in cross compiling environments" >&2
echo "To use this tool set the BUILD_CMD env var to the \"cross\" value" >&2
exit 111
fi
if [ -z "$CROSS_IMAGE" ]; then
echo "The CROSS_IMAGE env var should be provided" >&2
exit 111
fi
docker run --rm -v /home/runner:/home/runner -w "$PWD" "$CROSS_IMAGE" "$@"

.github/scripts/make.sh vendored Executable file

@@ -0,0 +1,19 @@
#!/bin/bash
# set -x
set -e
if [ "$BUILD_CMD" == "cross" ]; then
if [ -z "$CC" ]; then
echo "make.sh: CC is not set" >&2
exit 111
fi
if [ -z "$AR" ]; then
echo "make.sh: AR is not set" >&2
exit 111
fi
cross.sh make CC=$CC AR=$AR "$@"
else
make "$@"
fi

.github/scripts/tree-sitter.sh vendored Executable file

@@ -0,0 +1,28 @@
#!/bin/bash
# set -x
set -e
if [ -z "$ROOT" ]; then
echo "The ROOT env var should be set to absolute path of a repo root folder" >&2
exit 111
fi
if [ -z "$TARGET" ]; then
echo "The TARGET env var should be equal to a \`cargo build --target <TARGET>\` command value" >&2
exit 111
fi
tree_sitter="$ROOT"/target/"$TARGET"/release/tree-sitter
if [ "$BUILD_CMD" == "cross" ]; then
if [ -z "$CROSS_RUNNER" ]; then
echo "The CROSS_RUNNER env var should be set to a CARGO_TARGET_*_RUNNER env var value" >&2
echo "that is available in a docker image used by the cross tool under the hood" >&2
exit 111
fi
cross.sh $CROSS_RUNNER "$tree_sitter" "$@"
else
"$tree_sitter" "$@"
fi


@@ -1,25 +0,0 @@
module.exports = async ({ github, context, core }) => {
if (context.eventName !== 'pull_request') return;
const prNumber = context.payload.pull_request.number;
const owner = context.repo.owner;
const repo = context.repo.repo;
const { data: files } = await github.rest.pulls.listFiles({
owner,
repo,
pull_number: prNumber
});
const changedFiles = files.map(file => file.filename);
const wasmStdLibSrc = 'crates/language/wasm/';
const dirChanged = changedFiles.some(file => file.startsWith(wasmStdLibSrc));
if (!dirChanged) return;
const wasmStdLibHeader = 'lib/src/wasm/wasm-stdlib.h';
const requiredChanged = changedFiles.includes(wasmStdLibHeader);
if (!requiredChanged) core.setFailed(`Changes detected in ${wasmStdLibSrc} but ${wasmStdLibHeader} was not modified.`);
};


@@ -1,30 +1,27 @@
-name: Backport Pull Request
+name: backport
 on:
 pull_request_target:
 types: [closed, labeled]
+permissions:
+contents: write
+pull-requests: write
 jobs:
 backport:
-permissions:
-contents: write
-pull-requests: write
+name: Backport Pull Request
 if: github.event.pull_request.merged
 runs-on: ubuntu-latest
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-- name: Create app token
-uses: actions/create-github-app-token@v2
+- uses: actions/create-github-app-token@v1
 id: app-token
 with:
 app-id: ${{ vars.BACKPORT_APP }}
 private-key: ${{ secrets.BACKPORT_KEY }}
 - name: Create backport PR
+id: backport
-uses: korthout/backport-action@v4
+uses: korthout/backport-action@v3
 with:
 pull_title: "${pull_title}"
 label_pattern: "^ci:backport ([^ ]+)$"


@@ -1,30 +0,0 @@
name: Check Bindgen Output
on:
pull_request:
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
push:
branches: [master]
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
jobs:
check-bindgen:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up stable Rust toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
- name: Generate bindings
run: cargo xtask generate-bindings
- name: Check if the bindgen output changed
run: git diff --exit-code lib/binding_rust/bindings.rs


@@ -1,9 +1,14 @@
 name: Build & Test
+env:
+CARGO_TERM_COLOR: always
+RUSTFLAGS: "-D warnings"
+CROSS_DEBUG: 1
 on:
 workflow_call:
 inputs:
-run-test:
+run_test:
 default: true
 type: boolean
@@ -16,296 +21,168 @@ jobs:
 fail-fast: false
 matrix:
 platform:
-- linux-arm64
+- linux-arm64 #
-- linux-arm
+- linux-arm #
-- linux-x64
+- linux-x64 #
-- linux-x86
+- linux-x86 #
-- linux-powerpc64
+- linux-powerpc64 #
-- windows-arm64
+- windows-arm64 #
-- windows-x64
+- windows-x64 # <-- No C library build - requires an additional adapted Makefile for `cl.exe` compiler
-- windows-x86
+- windows-x86 # -- // --
-- macos-arm64
+- macos-arm64 #
-- macos-x64
+- macos-x64 #
-- wasm32
 include:
 # When adding a new `target`:
 # 1. Define a new platform alias above
-# 2. Add a new record to the matrix map in `crates/cli/npm/install.js`
+# 2. Add a new record to a matrix map in `cli/npm/install.js`
-- { platform: linux-arm64 , target: aarch64-unknown-linux-gnu , os: ubuntu-24.04-arm }
+- { platform: linux-arm64 , target: aarch64-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
-- { platform: linux-arm , target: armv7-unknown-linux-gnueabihf , os: ubuntu-24.04-arm }
+- { platform: linux-arm , target: arm-unknown-linux-gnueabi , os: ubuntu-latest , use-cross: true }
-- { platform: linux-x64 , target: x86_64-unknown-linux-gnu , os: ubuntu-24.04 }
+- { platform: linux-x64 , target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 , cli_features: wasm } #2272
-- { platform: linux-x86 , target: i686-unknown-linux-gnu , os: ubuntu-24.04 }
+- { platform: linux-x86 , target: i686-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
-- { platform: linux-powerpc64 , target: powerpc64-unknown-linux-gnu , os: ubuntu-24.04 }
+- { platform: linux-powerpc64 , target: powerpc64-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
-- { platform: windows-arm64 , target: aarch64-pc-windows-msvc , os: windows-11-arm }
+- { platform: windows-arm64 , target: aarch64-pc-windows-msvc , os: windows-latest }
-- { platform: windows-x64 , target: x86_64-pc-windows-msvc , os: windows-2025 }
+- { platform: windows-x64 , target: x86_64-pc-windows-msvc , os: windows-latest , cli_features: wasm }
-- { platform: windows-x86 , target: i686-pc-windows-msvc , os: windows-2025 }
+- { platform: windows-x86 , target: i686-pc-windows-msvc , os: windows-latest }
-- { platform: macos-arm64 , target: aarch64-apple-darwin , os: macos-15 }
+- { platform: macos-arm64 , target: aarch64-apple-darwin , os: macos-14 , cli_features: wasm }
-- { platform: macos-x64 , target: x86_64-apple-darwin , os: macos-15-intel }
+- { platform: macos-x64 , target: x86_64-apple-darwin , os: macos-12 , cli_features: wasm }
-- { platform: wasm32 , target: wasm32-unknown-unknown , os: ubuntu-24.04 }
-# Extra features
+# Cross compilers for C library
-- { platform: linux-arm64 , features: wasm }
+- { platform: linux-arm64 , cc: aarch64-linux-gnu-gcc , ar: aarch64-linux-gnu-ar }
-- { platform: linux-x64 , features: wasm }
+- { platform: linux-arm , cc: arm-linux-gnueabi-gcc , ar: arm-linux-gnueabi-ar }
-- { platform: macos-arm64 , features: wasm }
+- { platform: linux-x86 , cc: i686-linux-gnu-gcc , ar: i686-linux-gnu-ar }
-- { platform: macos-x64 , features: wasm }
+- { platform: linux-powerpc64 , cc: powerpc64-linux-gnu-gcc , ar: powerpc64-linux-gnu-ar }
-# Cross-compilation
+# See #2041 tree-sitter issue
-- { platform: linux-arm , cross: true }
+- { platform: windows-x64 , rust-test-threads: 1 }
-- { platform: linux-x86 , cross: true }
+- { platform: windows-x86 , rust-test-threads: 1 }
-- { platform: linux-powerpc64 , cross: true }
-# Compile-only
+# CLI only build
-- { platform: wasm32 , no-run: true }
+- { platform: windows-arm64 , cli-only: true }
 env:
-CARGO_TERM_COLOR: always
+BUILD_CMD: cargo
-RUSTFLAGS: -D warnings
+EXE: ${{ contains(matrix.target, 'windows') && '.exe' || '' }}
 defaults:
 run:
 shell: bash
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-- name: Set up cross-compilation
-if: matrix.cross
-run: |
-for target in armv7-unknown-linux-gnueabihf i686-unknown-linux-gnu powerpc64-unknown-linux-gnu; do
-camel_target=${target//-/_}; target_cc=${target/-unknown/}
-printf 'CC_%s=%s\n' "$camel_target" "${target_cc/v7/}-gcc"
-printf 'AR_%s=%s\n' "$camel_target" "${target_cc/v7/}-ar"
-printf 'CARGO_TARGET_%s_LINKER=%s\n' "${camel_target^^}" "${target_cc/v7/}-gcc"
-done >> $GITHUB_ENV
-{
-printf 'CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm -L /usr/arm-linux-gnueabihf\n'
-printf 'CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64 -L /usr/powerpc64-linux-gnu\n'
-} >> $GITHUB_ENV
-- name: Get emscripten version
-if: contains(matrix.features, 'wasm')
-run: printf 'EMSCRIPTEN_VERSION=%s\n' "$(<crates/loader/emscripten-version)" >> $GITHUB_ENV
+- name: Read Emscripten version
+run: echo "EMSCRIPTEN_VERSION=$(cat cli/loader/emscripten-version)" >> $GITHUB_ENV
 - name: Install Emscripten
-if: contains(matrix.features, 'wasm')
+if: ${{ !matrix.cli-only && !matrix.use-cross }}
 uses: mymindstorm/setup-emsdk@v14
 with:
 version: ${{ env.EMSCRIPTEN_VERSION }}
-- name: Set up Rust
-uses: actions-rust-lang/setup-rust-toolchain@v1
-with:
-target: ${{ matrix.target }}
+- run: rustup toolchain install stable --profile minimal
+- run: rustup target add ${{ matrix.target }}
+- uses: Swatinem/rust-cache@v2
+- name: Install cross
+if: ${{ matrix.use-cross }}
+uses: taiki-e/install-action@v2
+with:
+tool: cross
-- name: Install cross-compilation toolchain
-if: matrix.cross
-run: |
-sudo apt-get update -qy
-if [[ $PLATFORM == linux-arm ]]; then
-sudo apt-get install -qy {binutils,gcc}-arm-linux-gnueabihf qemu-user
-elif [[ $PLATFORM == linux-x86 ]]; then
-sudo apt-get install -qy {binutils,gcc}-i686-linux-gnu
-elif [[ $PLATFORM == linux-powerpc64 ]]; then
-sudo apt-get install -qy {binutils,gcc}-powerpc64-linux-gnu qemu-user
-fi
-env:
-PLATFORM: ${{ matrix.platform }}
+- name: Build custom cross image
+if: ${{ matrix.use-cross && matrix.os == 'ubuntu-latest' }}
+run: |
+target="${{ matrix.target }}"
+image=ghcr.io/cross-rs/$target:custom
+echo "CROSS_IMAGE=$image" >> $GITHUB_ENV
+echo "[target.$target]" >> Cross.toml
+echo "image = \"$image\"" >> Cross.toml
+echo "CROSS_CONFIG=$PWD/Cross.toml" >> $GITHUB_ENV
+echo "FROM ghcr.io/cross-rs/$target:edge" >> Dockerfile
+echo "RUN curl -fsSL https://deb.nodesource.com/setup_16.x | bash -" >> Dockerfile
+echo "RUN apt-get update && apt-get -y install nodejs" >> Dockerfile
+docker build -t $image .
+- name: Setup env extras
+env:
+RUST_TEST_THREADS: ${{ matrix.rust-test-threads }}
+USE_CROSS: ${{ matrix.use-cross }}
+TARGET: ${{ matrix.target }}
+CC: ${{ matrix.cc }}
+AR: ${{ matrix.ar }}
+run: |
+PATH="$PWD/.github/scripts:$PATH"
+echo "$PWD/.github/scripts" >> $GITHUB_PATH
+echo "TREE_SITTER=tree-sitter.sh" >> $GITHUB_ENV
+echo "TARGET=$TARGET" >> $GITHUB_ENV
+echo "ROOT=$PWD" >> $GITHUB_ENV
+[ -n "$RUST_TEST_THREADS" ] && \
+echo "RUST_TEST_THREADS=$RUST_TEST_THREADS" >> $GITHUB_ENV
+[ -n "$CC" ] && echo "CC=$CC" >> $GITHUB_ENV
+[ -n "$AR" ] && echo "AR=$AR" >> $GITHUB_ENV
+if [ "$USE_CROSS" == "true" ]; then
+echo "BUILD_CMD=cross" >> $GITHUB_ENV
+runner=$(BUILD_CMD=cross cross.sh bash -c "env | sed -nr '/^CARGO_TARGET_.*_RUNNER=/s///p'")
+[ -n "$runner" ] && echo "CROSS_RUNNER=$runner" >> $GITHUB_ENV
+fi
-- name: Install MinGW and Clang (Windows x64 MSYS2)
-if: matrix.platform == 'windows-x64'
-uses: msys2/setup-msys2@v2
-with:
-update: true
-install: |
-mingw-w64-x86_64-toolchain
-mingw-w64-x86_64-clang
-mingw-w64-x86_64-make
-mingw-w64-x86_64-cmake
+- name: Build C library
+if: ${{ !contains(matrix.os, 'windows') }} # Requires an additional adapted Makefile for `cl.exe` compiler
+run: make.sh -j CFLAGS="-Werror"
-# TODO: Remove RUSTFLAGS="--cap-lints allow" once we use a wasmtime release that addresses
-# the `mismatched-lifetime-syntaxes` lint
-- name: Build wasmtime library (Windows x64 MSYS2)
-if: contains(matrix.features, 'wasm') && matrix.platform == 'windows-x64'
-run: |
-mkdir -p target
-WASMTIME_VERSION=$(cargo metadata --format-version=1 --locked --features wasm | \
-jq -r '.packages[] | select(.name == "wasmtime-c-api-impl") | .version')
-curl -LSs "$WASMTIME_REPO/archive/refs/tags/v${WASMTIME_VERSION}.tar.gz" | tar xzf - -C target
-cd target/wasmtime-${WASMTIME_VERSION}
-cmake -S crates/c-api -B target/c-api \
--DCMAKE_INSTALL_PREFIX="$PWD/artifacts" \
--DWASMTIME_DISABLE_ALL_FEATURES=ON \
--DWASMTIME_FEATURE_CRANELIFT=ON \
--DWASMTIME_TARGET='x86_64-pc-windows-gnu'
-cmake --build target/c-api && cmake --install target/c-api
-printf 'CMAKE_PREFIX_PATH=%s\n' "$PWD/artifacts" >> $GITHUB_ENV
-env:
-WASMTIME_REPO: https://github.com/bytecodealliance/wasmtime
-RUSTFLAGS: ${{ env.RUSTFLAGS }} --cap-lints allow
+- name: Build wasm library
+if: ${{ !matrix.cli-only && !matrix.use-cross }} # No sense to build on the same Github runner hosts many times
+run: script/build-wasm
-- name: Build C library (Windows x64 MSYS2 CMake)
-if: matrix.platform == 'windows-x64'
-shell: msys2 {0}
-run: |
-cmake -G Ninja -S . -B build/static \
--DBUILD_SHARED_LIBS=OFF \
--DCMAKE_BUILD_TYPE=Debug \
--DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
--DTREE_SITTER_FEATURE_WASM=$WASM \
--DCMAKE_C_COMPILER=clang
-cmake --build build/static
-cmake -G Ninja -S . -B build/shared \
--DBUILD_SHARED_LIBS=ON \
--DCMAKE_BUILD_TYPE=Debug \
--DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
--DTREE_SITTER_FEATURE_WASM=$WASM \
--DCMAKE_C_COMPILER=clang
-cmake --build build/shared
-rm -rf \
-build/{static,shared} \
-"${CMAKE_PREFIX_PATH}/artifacts" \
-target/wasmtime-${WASMTIME_VERSION}
-env:
-WASM: ${{ contains(matrix.features, 'wasm') && 'ON' || 'OFF' }}
+- run: $BUILD_CMD build --release --target=${{ matrix.target }} --features=${{ matrix.cli_features }}
+- run: script/fetch-fixtures
-# TODO: Remove RUSTFLAGS="--cap-lints allow" once we use a wasmtime release that addresses
-# the `mismatched-lifetime-syntaxes` lint
-- name: Build wasmtime library
-if: contains(matrix.features, 'wasm')
-run: |
-mkdir -p target
-WASMTIME_VERSION=$(cargo metadata --format-version=1 --locked --features wasm | \
-jq -r '.packages[] | select(.name == "wasmtime-c-api-impl") | .version')
-curl -LSs "$WASMTIME_REPO/archive/refs/tags/v${WASMTIME_VERSION}.tar.gz" | tar xzf - -C target
-cd target/wasmtime-${WASMTIME_VERSION}
-cmake -S crates/c-api -B target/c-api \
--DCMAKE_INSTALL_PREFIX="$PWD/artifacts" \
--DWASMTIME_DISABLE_ALL_FEATURES=ON \
--DWASMTIME_FEATURE_CRANELIFT=ON \
--DWASMTIME_TARGET='${{ matrix.target }}'
-cmake --build target/c-api && cmake --install target/c-api
-printf 'CMAKE_PREFIX_PATH=%s\n' "$PWD/artifacts" >> $GITHUB_ENV
-env:
-WASMTIME_REPO: https://github.com/bytecodealliance/wasmtime
-RUSTFLAGS: ${{ env.RUSTFLAGS }} --cap-lints allow
-- name: Build C library (make)
-if: runner.os != 'Windows'
-run: |
-if [[ $PLATFORM == linux-arm ]]; then
-CC=arm-linux-gnueabihf-gcc; AR=arm-linux-gnueabihf-ar
-elif [[ $PLATFORM == linux-x86 ]]; then
-CC=i686-linux-gnu-gcc; AR=i686-linux-gnu-ar
-elif [[ $PLATFORM == linux-powerpc64 ]]; then
-CC=powerpc64-linux-gnu-gcc; AR=powerpc64-linux-gnu-ar
-else
-CC=gcc; AR=ar
-fi
-make -j CFLAGS="$CFLAGS" CC=$CC AR=$AR
-env:
-PLATFORM: ${{ matrix.platform }}
-CFLAGS: -g -Werror -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types
-- name: Build C library (CMake)
-if: "!matrix.cross"
-run: |
-cmake -S . -B build/static \
--DBUILD_SHARED_LIBS=OFF \
--DCMAKE_BUILD_TYPE=Debug \
--DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
--DTREE_SITTER_FEATURE_WASM=$WASM
-cmake --build build/static --verbose
-cmake -S . -B build/shared \
--DBUILD_SHARED_LIBS=ON \
--DCMAKE_BUILD_TYPE=Debug \
--DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
--DTREE_SITTER_FEATURE_WASM=$WASM
-cmake --build build/shared --verbose
-env:
-CC: ${{ contains(matrix.platform, 'linux') && 'clang' || '' }}
-WASM: ${{ contains(matrix.features, 'wasm') && 'ON' || 'OFF' }}
-- name: Build Wasm library
-if: contains(matrix.features, 'wasm')
-shell: bash
-run: |
-cd lib/binding_web
-npm ci
-CJS=true npm run build
-CJS=true npm run build:debug
-npm run build
-npm run build:debug
-- name: Check no_std builds
-if: inputs.run-test && !matrix.no-run
-working-directory: lib
-shell: bash
-run: cargo check --no-default-features --target='${{ matrix.target }}'
-- name: Build target
-run: cargo build --release --target='${{ matrix.target }}' --features='${{ matrix.features }}' $PACKAGE
-env:
-PACKAGE: ${{ matrix.platform == 'wasm32' && '-p tree-sitter' || '' }}
-- name: Cache fixtures
-id: cache
-if: inputs.run-test && !matrix.no-run
-uses: ./.github/actions/cache
-- name: Fetch fixtures
-if: inputs.run-test && !matrix.no-run
-run: cargo run -p xtask --target='${{ matrix.target }}' -- fetch-fixtures
+- uses: ./.github/actions/cache
+id: cache
 - name: Generate fixtures
-if: inputs.run-test && !matrix.no-run && steps.cache.outputs.cache-hit != 'true'
+if: ${{ !matrix.cli-only && inputs.run_test && steps.cache.outputs.cache-hit != 'true' }} # Can't natively run CLI on Github runner's host
-run: cargo run -p xtask --target='${{ matrix.target }}' -- generate-fixtures
+run: script/generate-fixtures
-- name: Generate Wasm fixtures
+- name: Generate WASM fixtures
-if: inputs.run-test && !matrix.no-run && contains(matrix.features, 'wasm') && steps.cache.outputs.cache-hit != 'true'
+if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test && steps.cache.outputs.cache-hit != 'true' }} # See comment for the "Build wasm library" step
-run: cargo run -p xtask --target='${{ matrix.target }}' -- generate-fixtures --wasm
+run: script/generate-fixtures-wasm
 - name: Run main tests
-if: inputs.run-test && !matrix.no-run
+if: ${{ !matrix.cli-only && inputs.run_test }} # Can't natively run CLI on Github runner's host
-run: cargo test --target='${{ matrix.target }}' --features='${{ matrix.features }}'
+run: $BUILD_CMD test --target=${{ matrix.target }} --features=${{ matrix.cli_features }}
-- name: Run Wasm tests
+- name: Run wasm tests
-if: inputs.run-test && !matrix.no-run && contains(matrix.features, 'wasm')
+if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test }} # See comment for the "Build wasm library" step
-run: cargo run -p xtask --target='${{ matrix.target }}' -- test-wasm
+run: script/test-wasm
+- name: Run benchmarks
+if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test }} # Cross-compiled benchmarks make no sense
+run: $BUILD_CMD bench benchmark -p tree-sitter-cli --target=${{ matrix.target }}
 - name: Upload CLI artifact
-if: "!matrix.no-run"
-uses: actions/upload-artifact@v6
+uses: actions/upload-artifact@v4
 with:
 name: tree-sitter.${{ matrix.platform }}
-path: target/${{ matrix.target }}/release/tree-sitter${{ contains(matrix.target, 'windows') && '.exe' || '' }}
+path: target/${{ matrix.target }}/release/tree-sitter${{ env.EXE }}
 if-no-files-found: error
 retention-days: 7
-- name: Upload Wasm artifacts
+- name: Upload WASM artifacts
-if: matrix.platform == 'linux-x64'
+if: ${{ matrix.platform == 'linux-x64' }}
-uses: actions/upload-artifact@v6
+uses: actions/upload-artifact@v4
 with:
 name: tree-sitter.wasm
 path: |
-lib/binding_web/web-tree-sitter.js
+lib/binding_web/tree-sitter.js
-lib/binding_web/web-tree-sitter.js.map
+lib/binding_web/tree-sitter.wasm
-lib/binding_web/web-tree-sitter.cjs
-lib/binding_web/web-tree-sitter.cjs.map
-lib/binding_web/web-tree-sitter.wasm
-lib/binding_web/web-tree-sitter.wasm.map
-lib/binding_web/debug/web-tree-sitter.cjs
-lib/binding_web/debug/web-tree-sitter.cjs.map
-lib/binding_web/debug/web-tree-sitter.js
-lib/binding_web/debug/web-tree-sitter.js.map
-lib/binding_web/debug/web-tree-sitter.wasm
-lib/binding_web/debug/web-tree-sitter.wasm.map
-lib/binding_web/lib/*.c
-lib/binding_web/lib/*.h
-lib/binding_web/lib/*.ts
-lib/binding_web/src/*.ts
 if-no-files-found: error
 retention-days: 7


@@ -1,21 +1,9 @@
 name: CI
 on:
 pull_request:
-paths-ignore:
-- docs/**
-- "**/README.md"
-- CONTRIBUTING.md
-- LICENSE
-- cli/src/templates
 push:
-branches: [master]
+branches:
-paths-ignore:
+- 'master'
-- docs/**
-- "**/README.md"
-- CONTRIBUTING.md
-- LICENSE
-- cli/src/templates
 concurrency:
 group: ${{ github.workflow }}-${{ github.ref }}
@@ -25,25 +13,15 @@ jobs:
 checks:
 runs-on: ubuntu-latest
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
+- run: rustup toolchain install stable --profile minimal
+- run: rustup toolchain install nightly --profile minimal
-- name: Set up stable Rust toolchain
-uses: actions-rust-lang/setup-rust-toolchain@v1
-with:
-toolchain: stable
-components: clippy, rustfmt
+- run: rustup component add --toolchain nightly rustfmt
+- uses: Swatinem/rust-cache@v2
+- run: make lint
-- name: Lint files
-run: |
-make lint
-make lint-web
 sanitize:
 uses: ./.github/workflows/sanitize.yml
 build:
 uses: ./.github/workflows/build.yml
-check-wasm-stdlib:
-uses: ./.github/workflows/wasm_stdlib.yml


@@ -1,50 +0,0 @@
name: Deploy Docs
on:
push:
branches: [master]
paths: [docs/**]
workflow_dispatch:
jobs:
deploy-docs:
runs-on: ubuntu-latest
permissions:
contents: write
pages: write
id-token: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up Rust
uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Install mdbook
env:
GH_TOKEN: ${{ github.token }}
run: |
jq_expr='.assets[] | select(.name | contains("x86_64-unknown-linux-gnu")) | .browser_download_url'
url=$(gh api repos/rust-lang/mdbook/releases/tags/v0.4.52 --jq "$jq_expr")
mkdir mdbook
curl -sSL "$url" | tar -xz -C mdbook
printf '%s/mdbook\n' "$PWD" >> "$GITHUB_PATH"
- name: Install mdbook-admonish
run: cargo install mdbook-admonish
- name: Build Book
run: mdbook build docs
- name: Setup Pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v4
with:
path: docs/book
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4


@@ -1,69 +0,0 @@
name: nvim-treesitter parser tests
on:
pull_request:
paths:
- 'crates/cli/**'
- 'crates/config/**'
- 'crates/generate/**'
- 'crates/loader/**'
- '.github/workflows/nvim_ts.yml'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check_compilation:
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
type: [generate, build]
name: ${{ matrix.os }} - ${{ matrix.type }}
runs-on: ${{ matrix.os }}
env:
NVIM: ${{ matrix.os == 'windows-latest' && 'nvim-win64\\bin\\nvim.exe' || 'nvim' }}
NVIM_TS_DIR: nvim-treesitter
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@v6
with:
repository: nvim-treesitter/nvim-treesitter
path: ${{ env.NVIM_TS_DIR }}
ref: main
- if: runner.os != 'Windows'
run: echo ${{ github.workspace }}/target/release >> $GITHUB_PATH
- if: runner.os == 'Windows'
run: echo ${{ github.workspace }}/target/release >> $env:GITHUB_PATH
- uses: actions-rust-lang/setup-rust-toolchain@v1
- run: cargo build --release
- uses: ilammy/msvc-dev-cmd@v1
- name: Install and prepare Neovim
run: bash ./scripts/ci-install.sh
working-directory: ${{ env.NVIM_TS_DIR }}
- if: matrix.type == 'generate'
name: Generate and compile parsers
run: $NVIM -l ./scripts/install-parsers.lua --generate --max-jobs=2
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash
- if: matrix.type == 'build'
name: Compile parsers
run: $NVIM -l ./scripts/install-parsers.lua --max-jobs=10
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash
- if: "!cancelled()"
name: Check query files
run: $NVIM -l ./scripts/check-queries.lua
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash


@@ -1,5 +1,4 @@
 name: Release
 on:
 workflow_dispatch:
 push:
@@ -10,22 +9,19 @@ jobs:
 build:
 uses: ./.github/workflows/build.yml
 with:
-run-test: false
+run_test: false
 release:
-name: Release on GitHub
+name: Release
 runs-on: ubuntu-latest
 needs: build
 permissions:
-id-token: write
-attestations: write
 contents: write
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
 - name: Download build artifacts
-uses: actions/download-artifact@v7
+uses: actions/download-artifact@v4
 with:
 path: artifacts
@@ -35,13 +31,9 @@ jobs:
 - name: Prepare release artifacts
 run: |
-mkdir -p target web
+mkdir -p target
-mv artifacts/tree-sitter.wasm/* web/
+mv artifacts/tree-sitter.wasm/* target/
-tar -czf target/web-tree-sitter.tar.gz -C web .
 rm -r artifacts/tree-sitter.wasm
 for platform in $(cd artifacts; ls | sed 's/^tree-sitter\.//'); do
 exe=$(ls artifacts/tree-sitter.$platform/tree-sitter*)
 gzip --stdout --name $exe > target/tree-sitter-$platform.gz
@@ -49,81 +41,60 @@ jobs:
 rm -rf artifacts
 ls -l target/
-- name: Generate attestations
-uses: actions/attest-build-provenance@v3
-with:
-subject-path: |
-target/tree-sitter-*.gz
-target/web-tree-sitter.tar.gz
 - name: Create release
-run: |-
+uses: softprops/action-gh-release@v2
-gh release create $GITHUB_REF_NAME \
+with:
-target/tree-sitter-*.gz \
+name: ${{ github.ref_name }}
-target/web-tree-sitter.tar.gz
+tag_name: ${{ github.ref_name }}
-env:
+fail_on_unmatched_files: true
-GH_TOKEN: ${{ github.token }}
+files: |
+target/tree-sitter-*.gz
+target/tree-sitter.wasm
+target/tree-sitter.js
 crates_io:
-name: Publish packages to Crates.io
+name: Publish CLI to Crates.io
 runs-on: ubuntu-latest
-environment: crates
-permissions:
-id-token: write
-contents: read
 needs: release
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-- name: Set up Rust
+- name: Setup Rust
-uses: actions-rust-lang/setup-rust-toolchain@v1
+uses: actions-rs/toolchain@v1
+with:
-- name: Set up registry token
+profile: minimal
-id: auth
+toolchain: stable
-uses: rust-lang/crates-io-auth-action@v1
+override: true
 - name: Publish crates to Crates.io
 uses: katyo/publish-crates@v2
 with:
-registry-token: ${{ steps.auth.outputs.token }}
+registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
 npm:
-name: Publish packages to npmjs.com
+name: Publish lib to npmjs.com
 runs-on: ubuntu-latest
-environment: npm
-permissions:
-id-token: write
-contents: read
 needs: release
 strategy:
 fail-fast: false
 matrix:
-directory: [crates/cli/npm, lib/binding_web]
+directory: ["cli/npm", "lib/binding_web"]
 steps:
-- name: Checkout repository
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-- name: Set up Node
-uses: actions/setup-node@v6
-with:
-node-version: 24
-registry-url: https://registry.npmjs.org
-- name: Set up Rust
-uses: actions-rust-lang/setup-rust-toolchain@v1
 - name: Build wasm
 if: matrix.directory == 'lib/binding_web'
+run: ./script/build-wasm
+- name: Setup Node
+uses: actions/setup-node@v4
+with:
+node-version: 18
+registry-url: "https://registry.npmjs.org"
+- name: Publish lib to npmjs.com
+env:
+NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
 run: |
 cd ${{ matrix.directory }}
-npm ci
+npm publish
-npm run build
-npm run build:debug
-CJS=true npm run build
-CJS=true npm run build:debug
-npm run build:dts
-- name: Publish to npmjs.com
-working-directory: ${{ matrix.directory }}
-run: npm publish


@@ -1,47 +1,34 @@
-name: No response
+name: no_response
 on:
 schedule:
-- cron: "30 1 * * *" # Run every day at 01:30
+- cron: '30 1 * * *' # Run every day at 01:30
 workflow_dispatch:
 issue_comment:
+permissions:
+issues: write
+pull-requests: write
 jobs:
 close:
-name: Close issues with no response
 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
 runs-on: ubuntu-latest
-permissions:
-issues: write
-pull-requests: write
 steps:
-- name: Checkout script
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-with:
-sparse-checkout: .github/scripts/close_unresponsive.js
-sparse-checkout-cone-mode: false
-- name: Run script
-uses: actions/github-script@v8
+- uses: actions/github-script@v7
 with:
 script: |
 const script = require('./.github/scripts/close_unresponsive.js')
 await script({github, context})
 remove_label:
-name: Remove response label
 if: github.event_name == 'issue_comment'
 runs-on: ubuntu-latest
-permissions:
-issues: write
-pull-requests: write
 steps:
-- name: Checkout script
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-with:
-sparse-checkout: .github/scripts/remove_response_label.js
-sparse-checkout-cone-mode: false
-- name: Run script
-uses: actions/github-script@v8
+- uses: actions/github-script@v7
 with:
 script: |
 const script = require('./.github/scripts/remove_response_label.js')


@@ -1,24 +1,16 @@
-name: Remove Reviewers
+name: "reviewers: remove"
 on:
 pull_request_target:
 types: [converted_to_draft, closed]
+permissions:
+pull-requests: write
 jobs:
 remove-reviewers:
 runs-on: ubuntu-latest
-permissions:
-pull-requests: write
 steps:
-- name: Checkout script
-uses: actions/checkout@v6
+- uses: actions/checkout@v4
-with:
-sparse-checkout: .github/scripts/reviewers_remove.js
-sparse-checkout-cone-mode: false
-- name: Run script
-uses: actions/github-script@v8
+- name: 'Remove reviewers'
+uses: actions/github-script@v7
 with:
 script: |
 const script = require('./.github/scripts/reviewers_remove.js')


@@ -8,44 +8,39 @@ on:
 workflow_call:
 jobs:
-check-undefined-behaviour:
+check_undefined_behaviour:
-name: Sanitizer checks
 runs-on: ubuntu-latest
 timeout-minutes: 20
 env:
 TREE_SITTER: ${{ github.workspace }}/target/release/tree-sitter
 steps:
-- name: Checkout repository
+- name: Checkout source code
-uses: actions/checkout@v6
+uses: actions/checkout@v4
 - name: Install UBSAN library
 run: sudo apt-get update -y && sudo apt-get install -y libubsan1
-- name: Set up Rust
+- run: rustup toolchain install stable --profile minimal
-uses: actions-rust-lang/setup-rust-toolchain@v1
+- uses: Swatinem/rust-cache@v2
+- run: cargo build --release
+- run: script/fetch-fixtures
-- name: Build project
+- uses: ./.github/actions/cache
-run: cargo build --release
+id: cache
-- name: Cache fixtures
+- if: ${{ steps.cache.outputs.cache-hit != 'true' }}
-uses: ./.github/actions/cache
+run: script/generate-fixtures
-id: cache
-- name: Fetch fixtures
+- name: Run main tests with undefined behaviour sanitizer (UBSAN)
-run: cargo xtask fetch-fixtures
+env:
+CFLAGS: -fsanitize=undefined
+RUSTFLAGS: ${{ env.RUSTFLAGS }} -lubsan
+run: cargo test -- --test-threads 1
-- name: Generate fixtures
+- name: Run main tests with address sanitizer (ASAN)
-if: ${{ steps.cache.outputs.cache-hit != 'true' }}
+env:
-run: cargo xtask generate-fixtures
+ASAN_OPTIONS: verify_asan_link_order=0
+CFLAGS: -fsanitize=address
-- name: Run main tests with undefined behaviour sanitizer (UBSAN)
+RUSTFLAGS: ${{ env.RUSTFLAGS }} -lasan --cfg sanitizing
 run: cargo test -- --test-threads 1
-env:
-CFLAGS: -fsanitize=undefined
-RUSTFLAGS: ${{ env.RUSTFLAGS }} -lubsan
-- name: Run main tests with address sanitizer (ASAN)
-run: cargo test -- --test-threads 1
-env:
-ASAN_OPTIONS: verify_asan_link_order=0
-CFLAGS: -fsanitize=address
-RUSTFLAGS: ${{ env.RUSTFLAGS }} -lasan --cfg sanitizing


@@ -1,29 +0,0 @@
name: Close as spam
on:
issues:
types: [labeled]
pull_request_target:
types: [labeled]
permissions:
issues: write
pull-requests: write
jobs:
spam:
runs-on: ubuntu-latest
if: github.event.label.name == 'spam'
steps:
- name: Checkout script
uses: actions/checkout@v6
with:
sparse-checkout: .github/scripts/close_spam.js
sparse-checkout-cone-mode: false
- name: Run script
uses: actions/github-script@v8
with:
script: |
const script = require('./.github/scripts/close_spam.js')
await script({github, context})


@@ -1,41 +0,0 @@
name: Check Wasm Exports
on:
pull_request:
paths:
- lib/include/tree_sitter/api.h
- lib/binding_web/**
- xtask/src/**
push:
branches: [master]
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
- CMakeLists.txt
jobs:
check-wasm-exports:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up stable Rust toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
- name: Install wasm-objdump
run: sudo apt-get update -y && sudo apt-get install -y wabt
- name: Build C library (make)
run: make -j CFLAGS="$CFLAGS"
env:
CFLAGS: -g -Werror -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types
- name: Build Wasm Library
working-directory: lib/binding_web
run: npm ci && npm run build:debug
- name: Check Wasm exports
run: cargo xtask check-wasm-exports


@@ -1,19 +0,0 @@
name: Check Wasm Stdlib build
on:
workflow_call:
jobs:
check:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Check directory changes
uses: actions/github-script@v8
with:
script: |
const scriptPath = `${process.env.GITHUB_WORKSPACE}/.github/scripts/wasm_stdlib.js`;
const script = require(scriptPath);
return script({ github, context, core });

.gitignore vendored

@@ -1,12 +1,10 @@
 log*.html
-.direnv
 .idea
 *.xcodeproj
 .vscode
 .cache
 .zig-cache
+.direnv
 profile*
 fuzz-results
@@ -14,6 +12,7 @@ test/fuzz/out
 test/fixtures/grammars/*
 !test/fixtures/grammars/.gitkeep
+package-lock.json
 node_modules
 docs/assets/js/tree-sitter.js
@@ -26,7 +25,6 @@ docs/assets/js/tree-sitter.js
 *.dylib
 *.so
 *.so.[0-9]*
-*.dll
 *.o
 *.obj
 *.exp
@@ -36,5 +34,3 @@ docs/assets/js/tree-sitter.js
 .build
 build
 zig-*
-/result


@@ -1,11 +0,0 @@
{
"lsp": {
"rust-analyzer": {
"initialization_options": {
"cargo": {
"features": "all"
}
}
}
}
}

CHANGELOG.md Normal file

@@ -0,0 +1,470 @@
# Changelog
## [0.23.0] - 2024-08-26
### Breaking
- Introduce tree-sitter-language crate for grammar crates to depend on (<https://github.com/tree-sitter/tree-sitter/pull/3069>)
- Revert interning of a sequence or choice of a single rule (<https://github.com/tree-sitter/tree-sitter/pull/3548>)
- **bindings**: Use capsules in python (<https://github.com/tree-sitter/tree-sitter/pull/3384>)
- **dsl**: Support other JS runtimes (<https://github.com/tree-sitter/tree-sitter/pull/3355>)
### Features
- Add `fuzz` subcommand (<https://github.com/tree-sitter/tree-sitter/pull/3385>)
- Allow external scanners to use the logger (<https://github.com/tree-sitter/tree-sitter/pull/3204>)
- **bindings**: Add query constants to python
- **bindings**: Add node, python, swift tests (<https://github.com/tree-sitter/tree-sitter/pull/3178>)
- **bindings**: Update npm scripts (<https://github.com/tree-sitter/tree-sitter/pull/3210>)
- **cli**: Bump unicode data to v15.1.0
- **cli**: Add debug build flag (<https://github.com/tree-sitter/tree-sitter/pull/3279>)
- **cli**: Attach helpful context when `grammar.json` cannot be found (<https://github.com/tree-sitter/tree-sitter/pull/3405>)
- **cli**: Add `--show-fields` flag to `test` command (<https://github.com/tree-sitter/tree-sitter/pull/3502>)
- **lib**: Add `ts_query_end_byte_for_pattern` (<https://github.com/tree-sitter/tree-sitter/pull/3451>)
- **lib**: Support no_std
- **zig**: Update outdated path syntax (<https://github.com/tree-sitter/tree-sitter/pull/3383>)
### Bug Fixes
- Always reset to the first language when iterating over language attributes (<https://github.com/tree-sitter/tree-sitter/pull/3375>)
- Better error when a supertype rule is invalid (<https://github.com/tree-sitter/tree-sitter/pull/3400>)
- Intern a sequence or choice of a single element the same as the element itself
- Do not "absorb" rules that consist of a single terminal if the rule is hidden (<https://github.com/tree-sitter/tree-sitter/pull/2577>)
- **bindings**: Update go bindings (<https://github.com/tree-sitter/tree-sitter/pull/3544>)
- **cli**: Installation via authenticated proxy (<https://github.com/tree-sitter/tree-sitter/pull/3414>)
- **cli**: Dedup `preceding_auxiliary_symbols` (<https://github.com/tree-sitter/tree-sitter/pull/3550>)
- **dsl**: Improve error message when a rule function returns undefined (<https://github.com/tree-sitter/tree-sitter/pull/3452>)
- **generate**: Rename `cargo.toml` template (<https://github.com/tree-sitter/tree-sitter/pull/3532>)
- **go**: Update parser name in binding files, add to docs (<https://github.com/tree-sitter/tree-sitter/pull/3547>)
- **lib**: A null clock must have `tv_nsec` be 0 as well (<https://github.com/tree-sitter/tree-sitter/pull/3372>)
- **lib**: Restrict pattern_map optimization when a wildcard step has an immediate first child (<https://github.com/tree-sitter/tree-sitter/pull/3440>)
- **lib**: An empty root node should not precede an empty range (<https://github.com/tree-sitter/tree-sitter/pull/3450>)
- **lib**: Fix api header C++ interop (<https://github.com/tree-sitter/tree-sitter/pull/3534>)
- **make**: Fail properly on Windows (<https://github.com/tree-sitter/tree-sitter/pull/3418>)
- **rust**: Fetch `CARGO_MANIFEST_DIR` at runtime in build script (<https://github.com/tree-sitter/tree-sitter/pull/3352>)
- **rust**: Fix new clippy warnings (<https://github.com/tree-sitter/tree-sitter/pull/3491>)
- **test**: Multi-grammar corpus tests are now in the repo root (<https://github.com/tree-sitter/tree-sitter/pull/3342>)
- **wasm**: Update test
### Performance
- Hoist out common subexpressions in satisfies_text_predicates (<https://github.com/tree-sitter/tree-sitter/pull/3397>)
### Documentation
- Update changelog
- Remove duplicate pr # in changelog
- Add note for bullet
- Fix syntax highlighting unit testing example (<https://github.com/tree-sitter/tree-sitter/pull/3434>)
- Add tsserver annotation to example (<https://github.com/tree-sitter/tree-sitter/pull/3460>)
- Fix tree cursor documentation (<https://github.com/tree-sitter/tree-sitter/pull/3324>)
- Document rust library features (<https://github.com/tree-sitter/tree-sitter/pull/3395>)
- Clean up binding & parser lists (<https://github.com/tree-sitter/tree-sitter/pull/3443>)
### Refactor
- Remove ansi_term dependency (<https://github.com/tree-sitter/tree-sitter/pull/3387>)
- Remove difference dependency (<https://github.com/tree-sitter/tree-sitter/pull/3388>)
- **scripts**: Clean up bash scripts (<https://github.com/tree-sitter/tree-sitter/pull/3231>)
### Testing
- Modernize scanner files (<https://github.com/tree-sitter/tree-sitter/pull/3340>)
### Build System and CI
- **deps**: bump wasmtime, cc, and wasmparser (<https://github.com/tree-sitter/tree-sitter/pull/3529>
- **bindings**: Use language version in soname (<https://github.com/tree-sitter/tree-sitter/pull/3308>)
- **lib**: Include the minor in the soname
- **loader**: Make dependencies optional (<https://github.com/tree-sitter/tree-sitter/pull/1638>)
- **swift**: Declare header search path (<https://github.com/tree-sitter/tree-sitter/pull/3474>)
- **wasm**: Don't minify JS (<https://github.com/tree-sitter/tree-sitter/pull/3380>)
- **wasm**: Bump emscripten to 3.1.64 (<https://github.com/tree-sitter/tree-sitter/pull/3497>)
- **wasm**: Support big endian machines (<https://github.com/tree-sitter/tree-sitter/pull/3492>)
- **zig**: Git ignore updated Zig cache directory (<https://github.com/tree-sitter/tree-sitter/pull/3408>)
### Other
- Swap `sprintf()` for `snprintf()` (<https://github.com/tree-sitter/tree-sitter/pull/3430>)
- Add `.build` to gitignore (<https://github.com/tree-sitter/tree-sitter/pull/3498>)
- Reset language when resetting wasm store (<https://github.com/tree-sitter/tree-sitter/pull/3495>)
- Clone wasm store engine (<https://github.com/tree-sitter/tree-sitter/pull/3542>)
- **bindings**: Fix indent & line endings (<https://github.com/tree-sitter/tree-sitter/pull/3284>)
## [0.22.6] — 2024-05-05
### Features
- Improve handling of serialization buffer overflows (<https://github.com/tree-sitter/tree-sitter/pull/3318>)
- Reverse iteration through node parents (<https://github.com/tree-sitter/tree-sitter/pull/3214>)
- **cli**: Support `NO_COLOR` (<https://github.com/tree-sitter/tree-sitter/pull/3299>)
- **cli**: Add test listing and allow users to parse a specific test number (<https://github.com/tree-sitter/tree-sitter/pull/3067>)
- **grammar**: Add "inherits" field if available (<https://github.com/tree-sitter/tree-sitter/pull/3295>)
### Bug Fixes
- Correctly load field data from wasm languages
- Improve error message when the `tree-sitter` field is malformed
- Don't error out on package.json lookup errors if `--no-bindings` is passed (<https://github.com/tree-sitter/tree-sitter/pull/3323>)
- **cli**: Keep default cc flags in build
- **cli**: Properly account for multi-grammar repos when using docker to build a wasm parser (<https://github.com/tree-sitter/tree-sitter/pull/3337>)
- **generate**: Don't check arbitrarily named dirs
- **generate**: Take `AsRef<Path>` for the path parameter to avoid clones (<https://github.com/tree-sitter/tree-sitter/pull/3322>)
- **highlight**: Correct signature of `ts_highlighter_add_language`
- **lib**: Do not return field names for extras (<https://github.com/tree-sitter/tree-sitter/pull/3330>)
- **lib**: Advance the lookahead end byte by 4 when there's an invalid code point (<https://github.com/tree-sitter/tree-sitter/pull/3305>)
- **rust**: Update README example (<https://github.com/tree-sitter/tree-sitter/pull/3307>)
- **rust**: Use unix + wasi cfg instead of not windows for fd (<https://github.com/tree-sitter/tree-sitter/pull/3304>)
- **test**: Allow newlines in between test name and attribute
- **wasm**: Correct `childrenFromFieldXXX` method signatures (<https://github.com/tree-sitter/tree-sitter/pull/3301>)
- **xtask**: Always bump every crate in tandem
- **zig**: Make usable as a zig dependency (<https://github.com/tree-sitter/tree-sitter/pull/3315>)
### Documentation
- Mention build command variables
- Swap `\s` for `\\s` in query example
- **highlight**: Typo (<https://github.com/tree-sitter/tree-sitter/pull/3290>)
### Refactor
- **tests**: Migrate remaining `grammar.json` tests to `grammar.js` (<https://github.com/tree-sitter/tree-sitter/pull/3325>)
### Build System and CI
- Add nightly rustfmt to workflow for linting (<https://github.com/tree-sitter/tree-sitter/pull/3333>)
- Fix address sanitizer step (<https://github.com/tree-sitter/tree-sitter/pull/3188>)
- **deps**: Bump cc from 1.0.92 to 1.0.94 in the cargo group (<https://github.com/tree-sitter/tree-sitter/pull/3298>)
- **deps**: Bump the cargo group with 6 updates (<https://github.com/tree-sitter/tree-sitter/pull/3313>)
- **xtask**: Bump `build.zig.zon` version when bumping versions
## [0.22.5] — 2024-04-14
### Bug Fixes
- Avoid generating unused character set constants
- **cli**: Test parsing on windows (<https://github.com/tree-sitter/tree-sitter/pull/3289>)
- **rust**: Compilation on wasm32-wasi (<https://github.com/tree-sitter/tree-sitter/pull/3293>)
## [0.22.4] — 2024-04-12
### Bug Fixes
- Fix sorting of transitions within a lex state
- Include 2-character ranges in array-based state transitions
### Build System and CI
- Always bump at least the patch version in bump xtask
## [0.22.3] — 2024-04-12
### Features
- Add strncat to wasm stdlib
- Generate simpler code for matching large character sets (<https://github.com/tree-sitter/tree-sitter/pull/3234>)
- When loading languages via WASM, gracefully handle memory errors and leaks in external scanners (<https://github.com/tree-sitter/tree-sitter/pull/3181>)
### Bug Fixes
- **bindings**: Add utf-8 flag to python & node (<https://github.com/tree-sitter/tree-sitter/pull/3278>)
- **bindings**: Generate parser.c if missing (<https://github.com/tree-sitter/tree-sitter/pull/3277>)
- **bindings**: Remove required platforms for swift (<https://github.com/tree-sitter/tree-sitter/pull/3264>)
- **cli**: Fix mismatched parenthesis when accounting for `&&` (<https://github.com/tree-sitter/tree-sitter/pull/3274>)
- **lib**: Do not consider childless nodes for ts_node_parent (<https://github.com/tree-sitter/tree-sitter/pull/3191>)
- **lib**: Properly account for aliased root nodes and root nodes with
children in `ts_subtree_string` (<https://github.com/tree-sitter/tree-sitter/pull/3191>)
- **lib**: Account for the root node of a tree cursor being an alias (<https://github.com/tree-sitter/tree-sitter/pull/3191>)
- **lib**: Use correct format specifier in log message (<https://github.com/tree-sitter/tree-sitter/pull/3255>)
- **parser**: Fix variadic macro (<https://github.com/tree-sitter/tree-sitter/pull/3229>)
- render: Proper function prototypes (<https://github.com/tree-sitter/tree-sitter/pull/3277>)
- **windows**: Add `/utf-8` flag for parsers using unicode symbols (<https://github.com/tree-sitter/tree-sitter/pull/3223>)
- Add a semicolon after SKIP macros (<https://github.com/tree-sitter/tree-sitter/pull/3264>)
- Add back `build-wasm` temporarily (<https://github.com/tree-sitter/tree-sitter/pull/3203>)
- Add lifetime to matches function (<https://github.com/tree-sitter/tree-sitter/pull/3254>)
- Default output directory for `build --wasm` should use current_dir (<https://github.com/tree-sitter/tree-sitter/pull/3203>)
- Fix sorting of wasm stdlib symbols
- Insert "tree-sitter" section in current directory's package.json if it exists (<https://github.com/tree-sitter/tree-sitter/pull/3224>)
- Tie the lifetime of the cursor to the query in `QueryCursor::captures()` (<https://github.com/tree-sitter/tree-sitter/pull/3266>)
- Wrong flag check in `build.rs`
### Performance
- **cli**: Reduced the compile time of generated parsers by generating C code with fewer conditionals (<https://github.com/tree-sitter/tree-sitter/pull/3234>)
### Documentation
- Add NGINX grammar
### Refactor
- **parser**: Make REDUCE macro non-variadic (<https://github.com/tree-sitter/tree-sitter/pull/3280>)
- **js**: Misc fixes & tidying
- **rust**: Misc fixes & tidying
### Testing
- Add regression test for node parent + string bug (<https://github.com/tree-sitter/tree-sitter/pull/3191>)
- **test**: Allow colons in test names (<https://github.com/tree-sitter/tree-sitter/pull/3264>)
### Build System and CI
- Upgrade wasmtime
- Update emscripten version (<https://github.com/tree-sitter/tree-sitter/pull/3272>)
- **dependabot**: Improve PR labels (<https://github.com/tree-sitter/tree-sitter/pull/3282>)
## [0.22.2] — 2024-03-17
### Breaking
- **cli**: Add a separate build command to compile parsers
### Features
- **bindings/rust**: Expose `Parser::included_ranges`
- Lower the lib's MSRV (<https://github.com/tree-sitter/tree-sitter/pull/3169>)
- **lib**: Implement Display for Node (<https://github.com/tree-sitter/tree-sitter/pull/3177>)
### Bug Fixes
- **bindings/wasm**: Fix `Parser.getIncludedRanges()` (<https://github.com/tree-sitter/tree-sitter/pull/3164>)
- **lib**: Makefile installation on macOS (<https://github.com/tree-sitter/tree-sitter/pull/3167>)
- **lib**: Makefile installation (<https://github.com/tree-sitter/tree-sitter/pull/3173>)
- **lib**: Avoid possible UB of calling memset on a null ptr when 0 is passed into `array_grow_by` (<https://github.com/tree-sitter/tree-sitter/pull/3176>)
- **lib**: Allow hiding symbols (<https://github.com/tree-sitter/tree-sitter/pull/3180>)
### Documentation
- Fix typo (<https://github.com/tree-sitter/tree-sitter/pull/3158>)
- **licensfe**: Update year (<https://github.com/tree-sitter/tree-sitter/pull/3183>)
### Refactor
- Remove dependency on which crate (<https://github.com/tree-sitter/tree-sitter/pull/3172>)
- Turbofish styling
### Testing
- Fix header writes (<https://github.com/tree-sitter/tree-sitter/pull/3174>)
### Build System and CI
- Simplify workflows (<https://github.com/tree-sitter/tree-sitter/pull/3002>)
- **lib**: Allow overriding CFLAGS on the commandline (<https://github.com/tree-sitter/tree-sitter/pull/3159>)
## [0.22.1] — 2024-03-10
### Bug Fixes
- Cli build script behavior on release
## [0.22.0] — 2024-03-10
### Breaking
- Remove top-level `corpus` dir for tests
The cli will now only look in `test/corpus` for tests
- Remove redundant escape regex & curly brace regex preprocessing (<https://github.com/tree-sitter/tree-sitter/pull/2838>)
- **bindings**: Convert node bindings to NAPI (<https://github.com/tree-sitter/tree-sitter/pull/3077>)
- **wasm**: Make `current*`, `is*`, and `has*` methods properties (<https://github.com/tree-sitter/tree-sitter/pull/3103>)
- **wasm**: Keep API in-line with upstream and start aligning with node (<https://github.com/tree-sitter/tree-sitter/pull/3149>)
### Features
- Add xtasks to assist with bumping crates (<https://github.com/tree-sitter/tree-sitter/pull/3065>)
- Improve language bindings (<https://github.com/tree-sitter/tree-sitter/pull/2438>)
- Expose the allocator and array header files for external scanners (<https://github.com/tree-sitter/tree-sitter/pull/3063>)
- Add typings for the node bindings
- Replace `nan` with `node-addon-api` and conditionally print logs
- **bindings**: Add more make targets
- **bindings**: Add peerDependencies for npm
- **bindings**: Add prebuildify to node
- **bindings**: Remove dsl types file (<https://github.com/tree-sitter/tree-sitter/pull/3126>)
- **node**: Type tag the language (<https://github.com/tree-sitter/tree-sitter/pull/3109>)
- **test**: Add attributes for corpus tests
### Bug Fixes
- Apply some `scan-build` suggestions (unused assignment/garbage access) (<https://github.com/tree-sitter/tree-sitter/pull/3056>)
- Wrap `||` comparison in parentheses when `&&` is used (<https://github.com/tree-sitter/tree-sitter/pull/3070>)
- Ignore unused variables in the array macros (<https://github.com/tree-sitter/tree-sitter/pull/3083>)
- `binding.cc` overwrite should replace `PARSER_NAME` (<https://github.com/tree-sitter/tree-sitter/pull/3116>)
- Don't use `__declspec(dllexport)` on windows (<https://github.com/tree-sitter/tree-sitter/pull/3128>)
- Parsers should export the language function on windows
- Allow the regex `v` flag (<https://github.com/tree-sitter/tree-sitter/pull/3154>)
- **assertions**: Case shouldn't matter for comment node detection
- **bindings**: Editorconfig and setup.py fixes (<https://github.com/tree-sitter/tree-sitter/pull/3082>)
- **bindings**: Insert `types` after `main` if it exists (<https://github.com/tree-sitter/tree-sitter/pull/3122>)
- **bindings**: Fix template oversights (<https://github.com/tree-sitter/tree-sitter/pull/3155>)
- **cli**: Only output the sources with `--no-bindings` (<https://github.com/tree-sitter/tree-sitter/pull/3123>)
- **generate**: Add `.npmignore`, populate Swift's exclude list (<https://github.com/tree-sitter/tree-sitter/pull/3085>)
- **generate**: Extern allocator functions for the template don't need to be "exported" (<https://github.com/tree-sitter/tree-sitter/pull/3132>)
- **generate**: Camel case name in `Cargo.toml` description (<https://github.com/tree-sitter/tree-sitter/pull/3140>)
- **lib**: Include `api.h` so `ts_set_allocator` is visible (<https://github.com/tree-sitter/tree-sitter/pull/3092>)
### Documentation
- Add GitHub user and PR info to the changelog
- Add css for inline code (<https://github.com/tree-sitter/tree-sitter/pull/2844>)
- Document test attributes
- Add `Ohm` language parser
- Remove duplicate `the`'s (<https://github.com/tree-sitter/tree-sitter/pull/3120>)
- Add discord and matrix badges (<https://github.com/tree-sitter/tree-sitter/pull/3148>)
### Refactor
- Rename TS_REUSE_ALLOCATOR flag (<https://github.com/tree-sitter/tree-sitter/pull/3088>)
- Remove extern/const where possible
- **array**: Use pragma GCC in clang too
- **bindings**: Remove npmignore (<https://github.com/tree-sitter/tree-sitter/pull/3089>)
### Testing
- Don't use TS_REUSE_ALLOCATOR on Darwin systems (<https://github.com/tree-sitter/tree-sitter/pull/3087>)
- Add test case for parse stack merging with incorrect error cost bug (<https://github.com/tree-sitter/tree-sitter/pull/3098>)
### Build System and CI
- Improve changelog settings (<https://github.com/tree-sitter/tree-sitter/pull/3064>)
- Unify crate versions via workspace (<https://github.com/tree-sitter/tree-sitter/pull/3074>)
- Update `cc` to remove annoying debug output (<https://github.com/tree-sitter/tree-sitter/pull/3075>)
- Adjust dependabot settings (<https://github.com/tree-sitter/tree-sitter/pull/3079>)
- Use c11 everywhere
- Add uninstall command
- Don't skip tests on failing lint (<https://github.com/tree-sitter/tree-sitter/pull/3102>)
- Remove unused deps, bump deps, and bump MSRV to 1.74.1 (<https://github.com/tree-sitter/tree-sitter/pull/3153>)
- **bindings**: Metadata improvements
- **bindings**: Make everything c11 (<https://github.com/tree-sitter/tree-sitter/pull/3099>)
- **dependabot**: Update weekly instead of daily (<https://github.com/tree-sitter/tree-sitter/pull/3112>)
- **deps**: Bump the cargo group with 1 update (<https://github.com/tree-sitter/tree-sitter/pull/3081>)
- **deps**: Bump the cargo group with 1 update (<https://github.com/tree-sitter/tree-sitter/pull/3097>)
- **deps**: Bump deps & lockfile (<https://github.com/tree-sitter/tree-sitter/pull/3060>)
- **deps**: Bump the cargo group with 4 updates (<https://github.com/tree-sitter/tree-sitter/pull/3134>)
- **lint**: Detect if `Cargo.lock` needs to be updated (<https://github.com/tree-sitter/tree-sitter/pull/3066>)
- **lint**: Make lockfile check quiet (<https://github.com/tree-sitter/tree-sitter/pull/3078>)
- **swift**: Move 'cLanguageStandard' behind 'targets' (<https://github.com/tree-sitter/tree-sitter/pull/3101>)
### Other
- Make Node.js language bindings context aware (<https://github.com/tree-sitter/tree-sitter/pull/2841>)
They don't have any dynamic global data, so it suffices to declare them as such
- Fix crash when attempting to load ancient languages via wasm (<https://github.com/tree-sitter/tree-sitter/pull/3068>)
- Use workspace dependencies for internal crates like Tree-sitter (<https://github.com/tree-sitter/tree-sitter/pull/3076>)
- Remove vendored wasmtime headers (<https://github.com/tree-sitter/tree-sitter/pull/3084>)
When building the Rust binding, use the wasmtime headers provided via cargo
by the wasmtime-c-api crate.
- Fix invalid parse stack recursive merging with mismatched error cost (<https://github.com/tree-sitter/tree-sitter/pull/3086>)
Allowing this invalid merge caused an invariant to be violated later during
parsing, when handling a subsequent error.
- Fix regression in `subtree_compare` (<https://github.com/tree-sitter/tree-sitter/pull/3111>)
- docs: Add `Ohm` language parser (<https://github.com/tree-sitter/tree-sitter/pull/3114>)
- Delete `binding_files.rs` (<https://github.com/tree-sitter/tree-sitter/pull/3106>)
- **bindings**: Consistent wording (<https://github.com/tree-sitter/tree-sitter/pull/3096>)
- **bindings**: Ignore more artifacts (<https://github.com/tree-sitter/tree-sitter/pull/3119>)
## [0.21.0] — 2024-02-21
### Breaking
- Remove the apply-all-captures flag, make last-wins precedence the default
**NOTE**: This change might cause breakage in your grammar's highlight tests.
Just flip the order of the relevant queries, and keep in mind that the last
matching query wins (see the sketch below).
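A minimal sketch of what this means for a `highlights.scm` file (the node names here are purely illustrative and depend on your grammar): when two patterns capture the same node, the pattern written later in the file now takes precedence, so keep general patterns first and specific ones last.

```scheme
; General pattern first: every identifier defaults to @variable.
(identifier) @variable

; Specific pattern last: when an identifier is used as a call target,
; this later match wins under last-wins precedence, so it gets @function.
(call_expression
  function: (identifier) @function)
```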
### Features
- Use lockfiles to dedup recompilation
- Improve error message for files with an unknown grammar path (<https://github.com/tree-sitter/tree-sitter/pull/2475>)
- Implement first-line-regex (<https://github.com/tree-sitter/tree-sitter/pull/2479>)
- Error out if an empty string is in the `extras` array
- Allow specifying an external scanner's files (<https://github.com/tree-sitter/tree-sitter/pull/3031>)
- Better error info when a scanner is missing required symbols
- **cli**: Add an optional `grammar-path` argument for the playground (<https://github.com/tree-sitter/tree-sitter/pull/3014>)
- **cli**: Add optional `config-path` argument (<https://github.com/tree-sitter/tree-sitter/pull/3050>)
- **loader**: Add more commonly used default parser directories
### Bug Fixes
- Prettify xml output and add node position info (<https://github.com/tree-sitter/tree-sitter/pull/2970>)
- Inherited grammar generation
- Properly error out when the word property is an invalid rule
- Update schema for regex flags (<https://github.com/tree-sitter/tree-sitter/pull/3006>)
- Properly handle `Query.matches` when filtering out results (<https://github.com/tree-sitter/tree-sitter/pull/3013>)
- Sexp format edge case with a quoted closing parenthesis (<https://github.com/tree-sitter/tree-sitter/pull/3016>)
- Always push the default files if there's no `externals`
- Don't log NUL characters (<https://github.com/tree-sitter/tree-sitter/pull/3037>)
- Don't throw an error if the user uses `map` in the grammar (<https://github.com/tree-sitter/tree-sitter/pull/3041>)
- Remove redundant imports (<https://github.com/tree-sitter/tree-sitter/pull/3047>)
- **cli**: Installation via an HTTP tunnel proxy (<https://github.com/tree-sitter/tree-sitter/pull/2824>)
- **cli**: Don't update tests automatically if parse errors are detected (<https://github.com/tree-sitter/tree-sitter/pull/3033>)
- **cli**: Don't use `long` for `grammar_path`
- **test**: Allow writing updates to tests that contain no erroneous nodes, instead of denying all updates when a single error is found
- **test**: Edge case when parsing `UNEXPECTED`/`MISSING` nodes with an indentation level greater than 0
- **wasm**: Remove C++ mangled symbols (<https://github.com/tree-sitter/tree-sitter/pull/2971>)
### Documentation
- Create issue template (<https://github.com/tree-sitter/tree-sitter/pull/2978>)
- Document regex limitations
- Mention that `token($.foo)` is illegal
- Explicitly mention behavior of walking outside the given "root" node for a `TSTreeCursor` (<https://github.com/tree-sitter/tree-sitter/pull/3021>)
- Small fixes (<https://github.com/tree-sitter/tree-sitter/pull/2987>)
- Add `Tact` language parser (<https://github.com/tree-sitter/tree-sitter/pull/3030>)
- **web**: Provide deno usage information (<https://github.com/tree-sitter/tree-sitter/pull/2498>)
### Refactor
- Extract regex check into a function and lower its precedence
- `&PathBuf` -> `&Path` (<https://github.com/tree-sitter/tree-sitter/pull/3035>)
- Name anonymous types in api.h (<https://github.com/tree-sitter/tree-sitter/pull/1659>)
### Testing
- Add quotes around bash variables (<https://github.com/tree-sitter/tree-sitter/pull/3023>)
- Update html tests
### Build System and CI
- Only create release for normal semver tags (<https://github.com/tree-sitter/tree-sitter/pull/2973>)
- Add useful development targets to makefile (<https://github.com/tree-sitter/tree-sitter/pull/2979>)
- Remove minimum glibc information in summary page (<https://github.com/tree-sitter/tree-sitter/pull/2988>)
- Use the native M1 Mac runner (<https://github.com/tree-sitter/tree-sitter/pull/2995>)
- Add editorconfig (<https://github.com/tree-sitter/tree-sitter/pull/2998>)
- Remove symbolic links from repository (<https://github.com/tree-sitter/tree-sitter/pull/2997>)
- Move common Cargo.toml keys into the workspace and inherit them (<https://github.com/tree-sitter/tree-sitter/pull/3019>)
- Remove reviewers when drafting or closing a PR (<https://github.com/tree-sitter/tree-sitter/pull/2963>)
- Enable creating changelogs with git-cliff (<https://github.com/tree-sitter/tree-sitter/pull/3040>)
- Cache fixtures (<https://github.com/tree-sitter/tree-sitter/pull/3038>)
- Don't cancel jobs on master (<https://github.com/tree-sitter/tree-sitter/pull/3052>)
- Relax caching requirements (<https://github.com/tree-sitter/tree-sitter/pull/3051>)
- **deps**: Bump clap from 4.4.18 to 4.5.0 (<https://github.com/tree-sitter/tree-sitter/pull/3007>)
- **deps**: Bump wasmtime from v16.0.0 to v17.0.1 (<https://github.com/tree-sitter/tree-sitter/pull/3008>)
- **deps**: Bump wasmtime to v18.0.1 (<https://github.com/tree-sitter/tree-sitter/pull/3057>)
- **sanitize**: Add a timeout of 60 minutes (<https://github.com/tree-sitter/tree-sitter/pull/3017>)
- **sanitize**: Reduce timeout to 20 minutes (<https://github.com/tree-sitter/tree-sitter/pull/3054>)
### Other
- Document preferred language for scanner (<https://github.com/tree-sitter/tree-sitter/pull/2972>)
- Add java and tsx to corpus tests (<https://github.com/tree-sitter/tree-sitter/pull/2992>)
- Provide a CLI flag to open `log.html` (<https://github.com/tree-sitter/tree-sitter/pull/2996>)
- Some more clippy lints (<https://github.com/tree-sitter/tree-sitter/pull/3010>)
- Remove deprecated query parsing mechanism (<https://github.com/tree-sitter/tree-sitter/pull/3011>)
- Print out full compiler arguments ran when it fails (<https://github.com/tree-sitter/tree-sitter/pull/3018>)
- Deprecate C++ scanners (<https://github.com/tree-sitter/tree-sitter/pull/3020>)
- Add some documentation to the playground page (<https://github.com/tree-sitter/tree-sitter/pull/1495>)
- Update relevant rust tests (<https://github.com/tree-sitter/tree-sitter/pull/2947>)
- Clippy lints (<https://github.com/tree-sitter/tree-sitter/pull/3032>)
- Error out when multiple arguments are passed to `token`/`token.immediate` (<https://github.com/tree-sitter/tree-sitter/pull/3036>)
- Tidying
- Prefer turbofish syntax where possible (<https://github.com/tree-sitter/tree-sitter/pull/3048>)
- Use published wasmtime crates
- Cleaner cast
- Update `Cargo.lock`
- Get rid of `github_issue_test` file (<https://github.com/tree-sitter/tree-sitter/pull/3055>)
- **cli**: Use spawn to display `emcc`'s stdout and stderr (<https://github.com/tree-sitter/tree-sitter/pull/2494>)
- **cli**: Warn users when a query path needed for a subcommand isn't specified in a grammar's package.json
- **generate**: Dedup and warn about duplicate or invalid rules (<https://github.com/tree-sitter/tree-sitter/pull/2994>)
- **test**: Use different languages for async tests (<https://github.com/tree-sitter/tree-sitter/pull/2953>)
- **wasm**: Use `SIDE_MODULE=2` to silence warning (<https://github.com/tree-sitter/tree-sitter/pull/3003>)

View file

@ -1,95 +0,0 @@
cmake_minimum_required(VERSION 3.13)
project(tree-sitter
VERSION "0.27.0"
DESCRIPTION "An incremental parsing system for programming tools"
HOMEPAGE_URL "https://tree-sitter.github.io/tree-sitter/"
LANGUAGES C)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
option(TREE_SITTER_FEATURE_WASM "Enable the Wasm feature" OFF)
option(AMALGAMATED "Build using an amalgamated source" OFF)
if(AMALGAMATED)
set(TS_SOURCE_FILES "${PROJECT_SOURCE_DIR}/lib/src/lib.c")
else()
file(GLOB TS_SOURCE_FILES lib/src/*.c)
list(REMOVE_ITEM TS_SOURCE_FILES "${PROJECT_SOURCE_DIR}/lib/src/lib.c")
endif()
add_library(tree-sitter ${TS_SOURCE_FILES})
target_include_directories(tree-sitter PRIVATE lib/src lib/src/wasm PUBLIC lib/include)
if(MSVC)
target_compile_options(tree-sitter PRIVATE
/wd4018 # disable 'signed/unsigned mismatch'
/wd4232 # disable 'nonstandard extension used'
/wd4244 # disable 'possible loss of data'
/wd4267 # disable 'possible loss of data (size_t)'
/wd4701 # disable 'potentially uninitialized local variable'
/we4022 # treat 'incompatible types' as an error
/W4)
else()
target_compile_options(tree-sitter PRIVATE
-Wall -Wextra -Wshadow -Wpedantic
-Werror=incompatible-pointer-types)
endif()
if(TREE_SITTER_FEATURE_WASM)
if(NOT DEFINED CACHE{WASMTIME_INCLUDE_DIR})
message(CHECK_START "Looking for wasmtime headers")
find_path(WASMTIME_INCLUDE_DIR wasmtime.h
PATHS ENV DEP_WASMTIME_C_API_INCLUDE)
if(NOT WASMTIME_INCLUDE_DIR)
unset(WASMTIME_INCLUDE_DIR CACHE)
message(FATAL_ERROR "Could not find wasmtime headers.\nDid you forget to set CMAKE_INCLUDE_PATH?")
endif()
message(CHECK_PASS "found")
endif()
if(NOT DEFINED CACHE{WASMTIME_LIBRARY})
message(CHECK_START "Looking for wasmtime library")
find_library(WASMTIME_LIBRARY wasmtime)
if(NOT WASMTIME_LIBRARY)
unset(WASMTIME_LIBRARY CACHE)
message(FATAL_ERROR "Could not find wasmtime library.\nDid you forget to set CMAKE_LIBRARY_PATH?")
endif()
message(CHECK_PASS "found")
endif()
target_compile_definitions(tree-sitter PUBLIC TREE_SITTER_FEATURE_WASM)
target_include_directories(tree-sitter SYSTEM PRIVATE "${WASMTIME_INCLUDE_DIR}")
target_link_libraries(tree-sitter PUBLIC "${WASMTIME_LIBRARY}")
set_property(TARGET tree-sitter PROPERTY C_STANDARD_REQUIRED ON)
if(NOT BUILD_SHARED_LIBS)
if(WIN32)
target_compile_definitions(tree-sitter PRIVATE WASM_API_EXTERN= WASI_API_EXTERN=)
target_link_libraries(tree-sitter INTERFACE ws2_32 advapi32 userenv ntdll shell32 ole32 bcrypt)
elseif(NOT APPLE)
target_link_libraries(tree-sitter INTERFACE pthread dl m)
endif()
endif()
endif()
set_target_properties(tree-sitter
PROPERTIES
C_STANDARD 11
C_VISIBILITY_PRESET hidden
POSITION_INDEPENDENT_CODE ON
SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
DEFINE_SYMBOL "")
target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE _BSD_SOURCE _DARWIN_C_SOURCE)
include(GNUInstallDirs)
configure_file(lib/tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY)
install(FILES lib/include/tree_sitter/api.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/tree_sitter")
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
install(TARGETS tree-sitter
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")

View file

@ -1 +1 @@
See [docs/src/6-contributing.md](./docs/src/6-contributing.md) See [section-6-contributing.md](./docs/section-6-contributing.md)

2164
Cargo.lock generated

File diff suppressed because it is too large

View file

@ -1,86 +1,28 @@
[workspace] [workspace]
default-members = ["crates/cli"] default-members = ["cli"]
members = [ members = [
"crates/cli", "cli",
"crates/config", "cli/config",
"crates/generate", "cli/loader",
"crates/highlight",
"crates/loader",
"crates/tags",
"crates/xtask",
"crates/language",
"lib", "lib",
"lib/language",
"tags",
"highlight",
"xtask",
] ]
resolver = "2" resolver = "2"
[workspace.package] [workspace.package]
version = "0.27.0" version = "0.23.0"
authors = [ authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
"Max Brunsfeld <maxbrunsfeld@gmail.com>",
"Amaan Qureshi <amaanq12@gmail.com>",
]
edition = "2021" edition = "2021"
rust-version = "1.85" rust-version = "1.74.1"
homepage = "https://tree-sitter.github.io/tree-sitter" homepage = "https://tree-sitter.github.io/tree-sitter"
repository = "https://github.com/tree-sitter/tree-sitter" repository = "https://github.com/tree-sitter/tree-sitter"
license = "MIT" license = "MIT"
keywords = ["incremental", "parsing"] keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"] categories = ["command-line-utilities", "parsing"]
[workspace.lints.clippy]
dbg_macro = "deny"
todo = "deny"
pedantic = { level = "warn", priority = -1 }
nursery = { level = "warn", priority = -1 }
cargo = { level = "warn", priority = -1 }
# The lints below are a specific subset of the pedantic+nursery lints
# that we explicitly allow in the tree-sitter codebase because they either:
#
# 1. Contain false positives,
# 2. Are unnecessary, or
# 3. Worsen the code
branches_sharing_code = "allow"
cast_lossless = "allow"
cast_possible_truncation = "allow"
cast_possible_wrap = "allow"
cast_precision_loss = "allow"
cast_sign_loss = "allow"
checked_conversions = "allow"
cognitive_complexity = "allow"
collection_is_never_read = "allow"
fallible_impl_from = "allow"
fn_params_excessive_bools = "allow"
inline_always = "allow"
if_not_else = "allow"
items_after_statements = "allow"
match_wildcard_for_single_variants = "allow"
missing_errors_doc = "allow"
missing_panics_doc = "allow"
module_name_repetitions = "allow"
multiple_crate_versions = "allow"
needless_for_each = "allow"
obfuscated_if_else = "allow"
option_if_let_else = "allow"
or_fun_call = "allow"
range_plus_one = "allow"
redundant_clone = "allow"
redundant_closure_for_method_calls = "allow"
ref_option = "allow"
similar_names = "allow"
string_lit_as_bytes = "allow"
struct_excessive_bools = "allow"
struct_field_names = "allow"
transmute_undefined_repr = "allow"
too_many_lines = "allow"
unnecessary_wraps = "allow"
unused_self = "allow"
used_underscore_items = "allow"
[workspace.lints.rust]
mismatched_lifetime_syntaxes = "allow"
[profile.optimize] [profile.optimize]
inherits = "release" inherits = "release"
strip = true # Automatically strip symbols from the binary. strip = true # Automatically strip symbols from the binary.
@ -92,72 +34,62 @@ codegen-units = 1 # Maximum size reduction optimizations.
inherits = "optimize" inherits = "optimize"
opt-level = "s" # Optimize for size. opt-level = "s" # Optimize for size.
[profile.release-dev] [profile.profile]
inherits = "release" inherits = "optimize"
lto = false strip = false
debug = true
debug-assertions = true
overflow-checks = true
incremental = true
codegen-units = 256
[workspace.dependencies] [workspace.dependencies]
ansi_colours = "1.2.3" anstyle = "1.0.8"
anstyle = "1.0.13" anyhow = "1.0.89"
anyhow = "1.0.100" bstr = "1.10.0"
bstr = "1.12.0" cc = "1.1.21"
cc = "1.2.53" clap = { version = "4.5.18", features = [
clap = { version = "4.5.54", features = [
"cargo", "cargo",
"derive", "derive",
"env", "env",
"help", "help",
"string",
"unstable-styles", "unstable-styles",
] } ] }
clap_complete = "4.5.65" clap_complete = "4.5.29"
clap_complete_nushell = "4.5.10" ctor = "0.2.8"
crc32fast = "1.5.0" ctrlc = { version = "3.4.5", features = ["termination"] }
ctor = "0.2.9" dirs = "5.0.1"
ctrlc = { version = "3.5.0", features = ["termination"] } filetime = "0.2.25"
dialoguer = { version = "0.11.0", features = ["fuzzy-select"] } fs4 = "0.8.4"
etcetera = "0.11.0" git2 = "0.18.3"
fs4 = "0.12.0" glob = "0.3.1"
glob = "0.3.3"
heck = "0.5.0" heck = "0.5.0"
html-escape = "0.2.13" html-escape = "0.2.13"
indexmap = "2.12.1" indexmap = "2.5.0"
indoc = "2.0.6" indoc = "2.0.5"
libloading = "0.9.0" lazy_static = "1.5.0"
log = { version = "0.4.28", features = ["std"] } libloading = "0.8.5"
memchr = "2.7.6" log = { version = "0.4.22", features = ["std"] }
once_cell = "1.21.3" memchr = "2.7.4"
once_cell = "1.19.0"
path-slash = "0.2.1"
pretty_assertions = "1.4.1" pretty_assertions = "1.4.1"
rand = "0.8.5" rand = "0.8.5"
regex = "1.11.3" regex = "1.10.6"
regex-syntax = "0.8.6" regex-syntax = "0.8.4"
rustc-hash = "2.1.1" rustc-hash = "1.1.0"
schemars = "1.0.5" semver = "1.0.23"
semver = { version = "1.0.27", features = ["serde"] } serde = { version = "1.0.210", features = ["derive"] }
serde = { version = "1.0.219", features = ["derive"] } serde_derive = "1.0.197"
serde_json = { version = "1.0.149", features = ["preserve_order"] } serde_json = { version = "1.0.128", features = ["preserve_order"] }
similar = "2.7.0" similar = "2.6.0"
smallbitvec = "2.6.0" smallbitvec = "2.5.3"
streaming-iterator = "0.1.9" tempfile = "3.12.0"
tempfile = "3.23.0" thiserror = "1.0.64"
thiserror = "2.0.17"
tiny_http = "0.12.0" tiny_http = "0.12.0"
topological-sort = "0.2.2" toml = "0.8.19"
unindent = "0.2.4" unindent = "0.2.3"
walkdir = "2.5.0" walkdir = "2.5.0"
wasmparser = "0.243.0" wasmparser = "0.215.0"
webbrowser = "1.0.5" webbrowser = "1.0.2"
tree-sitter = { version = "0.27.0", path = "./lib" } tree-sitter = { version = "0.23.0", path = "./lib" }
tree-sitter-generate = { version = "0.27.0", path = "./crates/generate" } tree-sitter-loader = { version = "0.23.0", path = "./cli/loader" }
tree-sitter-loader = { version = "0.27.0", path = "./crates/loader" } tree-sitter-config = { version = "0.23.0", path = "./cli/config" }
tree-sitter-config = { version = "0.27.0", path = "./crates/config" } tree-sitter-highlight = { version = "0.23.0", path = "./highlight" }
tree-sitter-highlight = { version = "0.27.0", path = "./crates/highlight" } tree-sitter-tags = { version = "0.23.0", path = "./tags" }
tree-sitter-tags = { version = "0.27.0", path = "./crates/tags" }
tree-sitter-language = { version = "0.1", path = "./crates/language" }

View file

@ -1,6 +1,6 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2018 Max Brunsfeld Copyright (c) 2018-2024 Max Brunsfeld
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View file

@ -1,4 +1,8 @@
VERSION := 0.27.0 ifeq ($(OS),Windows_NT)
$(error Windows is not supported)
endif
VERSION := 0.23.0
DESCRIPTION := An incremental parsing system for programming tools DESCRIPTION := An incremental parsing system for programming tools
HOMEPAGE_URL := https://tree-sitter.github.io/tree-sitter/ HOMEPAGE_URL := https://tree-sitter.github.io/tree-sitter/
@ -6,7 +10,6 @@ HOMEPAGE_URL := https://tree-sitter.github.io/tree-sitter/
PREFIX ?= /usr/local PREFIX ?= /usr/local
INCLUDEDIR ?= $(PREFIX)/include INCLUDEDIR ?= $(PREFIX)/include
LIBDIR ?= $(PREFIX)/lib LIBDIR ?= $(PREFIX)/lib
BINDIR ?= $(PREFIX)/bin
PCLIBDIR ?= $(LIBDIR)/pkgconfig PCLIBDIR ?= $(LIBDIR)/pkgconfig
# collect sources # collect sources
@ -22,9 +25,8 @@ OBJ := $(SRC:.c=.o)
# define default flags, and override to append mandatory flags # define default flags, and override to append mandatory flags
ARFLAGS := rcs ARFLAGS := rcs
CFLAGS ?= -O3 -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types CFLAGS ?= -O3 -Wall -Wextra -Wshadow -pedantic
override CFLAGS += -std=c11 -fPIC -fvisibility=hidden override CFLAGS += -std=c11 -fPIC -fvisibility=hidden
override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_DARWIN_C_SOURCE
override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include
# ABI versioning # ABI versioning
@ -32,25 +34,20 @@ SONAME_MAJOR := $(word 1,$(subst ., ,$(VERSION)))
SONAME_MINOR := $(word 2,$(subst ., ,$(VERSION))) SONAME_MINOR := $(word 2,$(subst ., ,$(VERSION)))
# OS-specific bits # OS-specific bits
MACHINE := $(shell $(CC) -dumpmachine) ifneq ($(findstring darwin,$(shell $(CC) -dumpmachine)),)
ifneq ($(findstring darwin,$(MACHINE)),)
SOEXT = dylib SOEXT = dylib
SOEXTVER_MAJOR = $(SONAME_MAJOR).$(SOEXT) SOEXTVER_MAJOR = $(SONAME_MAJOR).$(SOEXT)
SOEXTVER = $(SONAME_MAJOR).$(SONAME_MINOR).$(SOEXT) SOEXTVER = $(SONAME_MAJOR).$(SONAME_MINOR).$(SOEXT)
LINKSHARED += -dynamiclib -Wl,-install_name,$(LIBDIR)/libtree-sitter.$(SOEXTVER) LINKSHARED += -dynamiclib -Wl,-install_name,$(LIBDIR)/libtree-sitter.$(SOEXTVER)
else ifneq ($(findstring mingw32,$(MACHINE)),)
SOEXT = dll
LINKSHARED += -s -shared -Wl,--out-implib,libtree-sitter.dll.a
else else
SOEXT = so SOEXT = so
SOEXTVER_MAJOR = $(SOEXT).$(SONAME_MAJOR) SOEXTVER_MAJOR = $(SOEXT).$(SONAME_MAJOR)
SOEXTVER = $(SOEXT).$(SONAME_MAJOR).$(SONAME_MINOR) SOEXTVER = $(SOEXT).$(SONAME_MAJOR).$(SONAME_MINOR)
LINKSHARED += -shared -Wl,-soname,libtree-sitter.$(SOEXTVER) LINKSHARED += -shared -Wl,-soname,libtree-sitter.$(SOEXTVER)
endif
ifneq ($(filter $(shell uname),FreeBSD NetBSD DragonFly),) ifneq ($(filter $(shell uname),FreeBSD NetBSD DragonFly),)
PCLIBDIR := $(PREFIX)/libdata/pkgconfig PCLIBDIR := $(PREFIX)/libdata/pkgconfig
endif endif
endif
all: libtree-sitter.a libtree-sitter.$(SOEXT) tree-sitter.pc all: libtree-sitter.a libtree-sitter.$(SOEXT) tree-sitter.pc
@ -63,39 +60,25 @@ ifneq ($(STRIP),)
$(STRIP) $@ $(STRIP) $@
endif endif
ifneq ($(findstring mingw32,$(MACHINE)),)
libtree-sitter.dll.a: libtree-sitter.$(SOEXT)
endif
tree-sitter.pc: lib/tree-sitter.pc.in tree-sitter.pc: lib/tree-sitter.pc.in
sed -e 's|@PROJECT_VERSION@|$(VERSION)|' \ sed -e 's|@PROJECT_VERSION@|$(VERSION)|' \
-e 's|@CMAKE_INSTALL_LIBDIR@|$(LIBDIR:$(PREFIX)/%=%)|' \ -e 's|@CMAKE_INSTALL_LIBDIR@|$(LIBDIR)|' \
-e 's|@CMAKE_INSTALL_INCLUDEDIR@|$(INCLUDEDIR:$(PREFIX)/%=%)|' \ -e 's|@CMAKE_INSTALL_INCLUDEDIR@|$(INCLUDEDIR)|' \
-e 's|@PROJECT_DESCRIPTION@|$(DESCRIPTION)|' \ -e 's|@PROJECT_DESCRIPTION@|$(DESCRIPTION)|' \
-e 's|@PROJECT_HOMEPAGE_URL@|$(HOMEPAGE_URL)|' \ -e 's|@PROJECT_HOMEPAGE_URL@|$(HOMEPAGE_URL)|' \
-e 's|@CMAKE_INSTALL_PREFIX@|$(PREFIX)|' $< > $@ -e 's|@CMAKE_INSTALL_PREFIX@|$(PREFIX)|' $< > $@
shared: libtree-sitter.$(SOEXT)
static: libtree-sitter.a
clean: clean:
$(RM) $(OBJ) tree-sitter.pc libtree-sitter.a libtree-sitter.$(SOEXT) libtree-sitter.dll.a $(RM) $(OBJ) tree-sitter.pc libtree-sitter.a libtree-sitter.$(SOEXT)
install: all install: all
install -d '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter '$(DESTDIR)$(PCLIBDIR)' '$(DESTDIR)$(LIBDIR)' install -d '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter '$(DESTDIR)$(PCLIBDIR)' '$(DESTDIR)$(LIBDIR)'
install -m644 lib/include/tree_sitter/api.h '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h install -m644 lib/include/tree_sitter/api.h '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h
install -m644 tree-sitter.pc '$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc install -m644 tree-sitter.pc '$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc
install -m644 libtree-sitter.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a install -m644 libtree-sitter.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a
ifneq ($(findstring mingw32,$(MACHINE)),)
install -d '$(DESTDIR)$(BINDIR)'
install -m755 libtree-sitter.dll '$(DESTDIR)$(BINDIR)'/libtree-sitter.dll
install -m755 libtree-sitter.dll.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.dll.a
else
install -m755 libtree-sitter.$(SOEXT) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER) install -m755 libtree-sitter.$(SOEXT) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER)
cd '$(DESTDIR)$(LIBDIR)' && ln -sf libtree-sitter.$(SOEXTVER) libtree-sitter.$(SOEXTVER_MAJOR) ln -sf libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER_MAJOR)
cd '$(DESTDIR)$(LIBDIR)' && ln -sf libtree-sitter.$(SOEXTVER_MAJOR) libtree-sitter.$(SOEXT) ln -sf libtree-sitter.$(SOEXTVER_MAJOR) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT)
endif
uninstall: uninstall:
$(RM) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a \ $(RM) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a \
@ -104,36 +87,31 @@ uninstall:
'$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT) \ '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT) \
'$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h \ '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h \
'$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc '$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc
rmdir '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter
.PHONY: all shared static install uninstall clean .PHONY: all install uninstall clean
##### Dev targets ##### ##### Dev targets #####
test: test:
cargo xtask fetch-fixtures script/fetch-fixtures
cargo xtask generate-fixtures script/generate-fixtures
cargo xtask test script/test
test-wasm: test_wasm:
cargo xtask generate-fixtures --wasm script/generate-fixtures-wasm
cargo xtask test-wasm script/test-wasm
lint: lint:
cargo update --workspace --locked --quiet cargo update --workspace --locked --quiet
cargo check --workspace --all-targets cargo check --workspace --all-targets
cargo fmt --all --check cargo +nightly fmt --all --check
cargo clippy --workspace --all-targets -- -D warnings cargo clippy --workspace --all-targets -- -D warnings
lint-web:
npm --prefix lib/binding_web ci
npm --prefix lib/binding_web run lint
format: format:
cargo fmt --all cargo +nightly fmt --all
changelog: changelog:
@git-cliff --config .github/cliff.toml --prepend CHANGELOG.md --latest --github-token $(shell gh auth token) @git-cliff --config script/cliff.toml --output CHANGELOG.md --latest --github-token $(shell gh auth token)
.PHONY: test test-wasm lint format changelog .PHONY: test test_wasm lint format changelog

View file

@ -14,22 +14,8 @@ let package = Package(
targets: [ targets: [
.target(name: "TreeSitter", .target(name: "TreeSitter",
path: "lib", path: "lib",
exclude: [ sources: ["src/lib.c"],
"src/unicode/ICU_SHA", cSettings: [.headerSearchPath("src")]),
"src/unicode/README.md",
"src/unicode/LICENSE",
"src/wasm/stdlib-symbols.txt",
"src/lib.c",
],
sources: ["src"],
publicHeadersPath: "include",
cSettings: [
.headerSearchPath("src"),
.define("_POSIX_C_SOURCE", to: "200112L"),
.define("_DEFAULT_SOURCE"),
.define("_BSD_SOURCE"),
.define("_DARWIN_C_SOURCE"),
]),
], ],
cLanguageStandard: .c11 cLanguageStandard: .c11
) )

View file

@ -14,8 +14,8 @@ Tree-sitter is a parser generator tool and an incremental parsing library. It ca
## Links ## Links
- [Documentation](https://tree-sitter.github.io) - [Documentation](https://tree-sitter.github.io)
- [Rust binding](lib/binding_rust/README.md) - [Rust binding](lib/binding_rust/README.md)
- [Wasm binding](lib/binding_web/README.md) - [WASM binding](lib/binding_web/README.md)
- [Command-line interface](crates/cli/README.md) - [Command-line interface](cli/README.md)
[discord]: https://img.shields.io/discord/1063097320771698699?logo=discord&label=discord [discord]: https://img.shields.io/discord/1063097320771698699?logo=discord&label=discord
[matrix]: https://img.shields.io/matrix/tree-sitter-chat%3Amatrix.org?logo=matrix&label=matrix [matrix]: https://img.shields.io/matrix/tree-sitter-chat%3Amatrix.org?logo=matrix&label=matrix

136
build.zig
View file

@ -1,142 +1,18 @@
const std = @import("std"); const std = @import("std");
pub fn build(b: *std.Build) !void { pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{}); var lib = b.addStaticLibrary(.{
const optimize = b.standardOptimizeOption(.{});
const wasm = b.option(bool, "enable-wasm", "Enable Wasm support") orelse false;
const shared = b.option(bool, "build-shared", "Build a shared library") orelse false;
const amalgamated = b.option(bool, "amalgamated", "Build using an amalgamated source") orelse false;
const lib: *std.Build.Step.Compile = b.addLibrary(.{
.name = "tree-sitter", .name = "tree-sitter",
.linkage = if (shared) .dynamic else .static, .target = b.standardTargetOptions(.{}),
.root_module = b.createModule(.{ .optimize = b.standardOptimizeOption(.{}),
.target = target,
.optimize = optimize,
.link_libc = true,
.pic = if (shared) true else null,
}),
}); });
if (amalgamated) { lib.linkLibC();
lib.addCSourceFile(.{ lib.addCSourceFile(.{ .file = b.path("lib/src/lib.c"), .flags = &.{"-std=c11"} });
.file = b.path("lib/src/lib.c"),
.flags = &.{"-std=c11"},
});
} else {
const files = try findSourceFiles(b);
defer b.allocator.free(files);
lib.addCSourceFiles(.{
.root = b.path("lib/src"),
.files = files,
.flags = &.{"-std=c11"},
});
}
lib.addIncludePath(b.path("lib/include")); lib.addIncludePath(b.path("lib/include"));
lib.addIncludePath(b.path("lib/src")); lib.addIncludePath(b.path("lib/src"));
lib.addIncludePath(b.path("lib/src/wasm"));
lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L");
lib.root_module.addCMacro("_DEFAULT_SOURCE", "");
lib.root_module.addCMacro("_BSD_SOURCE", "");
lib.root_module.addCMacro("_DARWIN_C_SOURCE", "");
if (wasm) {
if (b.lazyDependency(wasmtimeDep(target.result), .{})) |wasmtime| {
lib.root_module.addCMacro("TREE_SITTER_FEATURE_WASM", "");
lib.addSystemIncludePath(wasmtime.path("include"));
lib.addLibraryPath(wasmtime.path("lib"));
if (shared) lib.linkSystemLibrary("wasmtime");
}
}
lib.installHeadersDirectory(b.path("lib/include"), ".", .{}); lib.installHeadersDirectory(b.path("lib/include"), ".", .{});
b.installArtifact(lib); b.installArtifact(lib);
} }
/// Get the name of the wasmtime dependency for this target.
pub fn wasmtimeDep(target: std.Target) []const u8 {
const arch = target.cpu.arch;
const os = target.os.tag;
const abi = target.abi;
return @as(?[]const u8, switch (os) {
.linux => switch (arch) {
.x86_64 => switch (abi) {
.gnu => "wasmtime_c_api_x86_64_linux",
.musl => "wasmtime_c_api_x86_64_musl",
.android => "wasmtime_c_api_x86_64_android",
else => null,
},
.aarch64 => switch (abi) {
.gnu => "wasmtime_c_api_aarch64_linux",
.musl => "wasmtime_c_api_aarch64_musl",
.android => "wasmtime_c_api_aarch64_android",
else => null,
},
.x86 => switch (abi) {
.gnu => "wasmtime_c_api_i686_linux",
else => null,
},
.arm => switch (abi) {
.gnueabi => "wasmtime_c_api_armv7_linux",
else => null,
},
.s390x => switch (abi) {
.gnu => "wasmtime_c_api_s390x_linux",
else => null,
},
.riscv64 => switch (abi) {
.gnu => "wasmtime_c_api_riscv64gc_linux",
else => null,
},
else => null,
},
.windows => switch (arch) {
.x86_64 => switch (abi) {
.gnu => "wasmtime_c_api_x86_64_mingw",
.msvc => "wasmtime_c_api_x86_64_windows",
else => null,
},
.aarch64 => switch (abi) {
.msvc => "wasmtime_c_api_aarch64_windows",
else => null,
},
.x86 => switch (abi) {
.msvc => "wasmtime_c_api_i686_windows",
else => null,
},
else => null,
},
.macos => switch (arch) {
.x86_64 => "wasmtime_c_api_x86_64_macos",
.aarch64 => "wasmtime_c_api_aarch64_macos",
else => null,
},
else => null,
}) orelse std.debug.panic(
"Unsupported target for wasmtime: {s}-{s}-{s}",
.{ @tagName(arch), @tagName(os), @tagName(abi) },
);
}
fn findSourceFiles(b: *std.Build) ![]const []const u8 {
var sources: std.ArrayListUnmanaged([]const u8) = .empty;
var dir = try b.build_root.handle.openDir("lib/src", .{ .iterate = true });
var iter = dir.iterate();
defer dir.close();
while (try iter.next()) |entry| {
if (entry.kind != .file) continue;
const file = entry.name;
const ext = std.fs.path.extension(file);
if (std.mem.eql(u8, ext, ".c") and !std.mem.eql(u8, file, "lib.c")) {
try sources.append(b.allocator, b.dupe(file));
}
}
return sources.toOwnedSlice(b.allocator);
}

View file

@ -1,96 +1,10 @@
.{ .{
.name = .tree_sitter, .name = "tree-sitter",
.fingerprint = 0x841224b447ac0d4f, .version = "0.23.0",
.version = "0.27.0",
.minimum_zig_version = "0.14.1",
.paths = .{ .paths = .{
"build.zig", "build.zig",
"build.zig.zon", "build.zig.zon",
"lib/src", "lib/src",
"lib/include", "lib/include",
"README.md",
"LICENSE",
},
.dependencies = .{
.wasmtime_c_api_aarch64_android = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-android-c-api.tar.xz",
.hash = "N-V-__8AAIfPIgdw2YnV3QyiFQ2NHdrxrXzzCdjYJyxJDOta",
.lazy = true,
},
.wasmtime_c_api_aarch64_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-linux-c-api.tar.xz",
.hash = "N-V-__8AAIt97QZi7Pf7nNJ2mVY6uxA80Klyuvvtop3pLMRK",
.lazy = true,
},
.wasmtime_c_api_aarch64_macos = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-macos-c-api.tar.xz",
.hash = "N-V-__8AAAO48QQf91w9RmmUDHTja8DrXZA1n6Bmc8waW3qe",
.lazy = true,
},
.wasmtime_c_api_aarch64_musl = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-musl-c-api.tar.xz",
.hash = "N-V-__8AAI196wa9pwADoA2RbCDp5F7bKQg1iOPq6gIh8-FH",
.lazy = true,
},
.wasmtime_c_api_aarch64_windows = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-windows-c-api.zip",
.hash = "N-V-__8AAC9u4wXfqd1Q6XyQaC8_DbQZClXux60Vu5743N05",
.lazy = true,
},
.wasmtime_c_api_armv7_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-armv7-linux-c-api.tar.xz",
.hash = "N-V-__8AAHXe8gWs3s83Cc5G6SIq0_jWxj8fGTT5xG4vb6-x",
.lazy = true,
},
.wasmtime_c_api_i686_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-i686-linux-c-api.tar.xz",
.hash = "N-V-__8AAN2pzgUUfulRCYnipSfis9IIYHoTHVlieLRmKuct",
.lazy = true,
},
.wasmtime_c_api_i686_windows = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-i686-windows-c-api.zip",
.hash = "N-V-__8AAJu0YAUUTFBLxFIOi-MSQVezA6MMkpoFtuaf2Quf",
.lazy = true,
},
.wasmtime_c_api_riscv64gc_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-riscv64gc-linux-c-api.tar.xz",
.hash = "N-V-__8AAG8m-gc3E3AIImtTZ3l1c7HC6HUWazQ9OH5KACX4",
.lazy = true,
},
.wasmtime_c_api_s390x_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-s390x-linux-c-api.tar.xz",
.hash = "N-V-__8AAH314gd-gE4IBp2uvAL3gHeuW1uUZjMiLLeUdXL_",
.lazy = true,
},
.wasmtime_c_api_x86_64_android = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-android-c-api.tar.xz",
.hash = "N-V-__8AAIPNRwfNkznebrcGb0IKUe7f35bkuZEYOjcx6q3f",
.lazy = true,
},
.wasmtime_c_api_x86_64_linux = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-linux-c-api.tar.xz",
.hash = "N-V-__8AAI8EDwcyTtk_Afhk47SEaqfpoRqGkJeZpGs69ChF",
.lazy = true,
},
.wasmtime_c_api_x86_64_macos = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-macos-c-api.tar.xz",
.hash = "N-V-__8AAGtGNgVaOpHSxC22IjrampbRIy6lLwscdcAE8nG1",
.lazy = true,
},
.wasmtime_c_api_x86_64_mingw = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-mingw-c-api.zip",
.hash = "N-V-__8AAPS2PAbVix50L6lnddlgazCPTz3whLUFk1qnRtnZ",
.lazy = true,
},
.wasmtime_c_api_x86_64_musl = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-musl-c-api.tar.xz",
.hash = "N-V-__8AAF-WEQe0nzvi09PgusM5i46FIuCKJmIDWUleWgQ3",
.lazy = true,
},
.wasmtime_c_api_x86_64_windows = .{
.url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-windows-c-api.zip",
.hash = "N-V-__8AAKGNXwbpJQsn0_6kwSIVDDWifSg8cBzf7T2RzsC9",
.lazy = true,
},
}, },
} }

View file

@ -8,17 +8,9 @@ rust-version.workspace = true
readme = "README.md" readme = "README.md"
homepage.workspace = true homepage.workspace = true
repository.workspace = true repository.workspace = true
documentation = "https://docs.rs/tree-sitter-cli"
license.workspace = true license.workspace = true
keywords.workspace = true keywords.workspace = true
categories.workspace = true categories.workspace = true
include = ["build.rs", "README.md", "LICENSE", "benches/*", "src/**"]
[lints]
workspace = true
[lib]
path = "src/tree_sitter_cli.rs"
[[bin]] [[bin]]
name = "tree-sitter" name = "tree-sitter"
@ -30,52 +22,51 @@ name = "benchmark"
harness = false harness = false
[features] [features]
default = ["qjs-rt"]
wasm = ["tree-sitter/wasm", "tree-sitter-loader/wasm"] wasm = ["tree-sitter/wasm", "tree-sitter-loader/wasm"]
qjs-rt = ["tree-sitter-generate/qjs-rt"]
[dependencies] [dependencies]
ansi_colours.workspace = true
anstyle.workspace = true anstyle.workspace = true
anyhow.workspace = true anyhow.workspace = true
bstr.workspace = true bstr.workspace = true
clap.workspace = true clap.workspace = true
clap_complete.workspace = true clap_complete.workspace = true
clap_complete_nushell.workspace = true
crc32fast.workspace = true
ctor.workspace = true ctor.workspace = true
ctrlc.workspace = true ctrlc.workspace = true
dialoguer.workspace = true dirs.workspace = true
filetime.workspace = true
glob.workspace = true glob.workspace = true
heck.workspace = true heck.workspace = true
html-escape.workspace = true html-escape.workspace = true
indexmap.workspace = true
indoc.workspace = true indoc.workspace = true
lazy_static.workspace = true
log.workspace = true log.workspace = true
memchr.workspace = true memchr.workspace = true
rand.workspace = true rand.workspace = true
regex.workspace = true regex.workspace = true
schemars.workspace = true regex-syntax.workspace = true
rustc-hash.workspace = true
semver.workspace = true semver.workspace = true
serde.workspace = true serde.workspace = true
serde_derive.workspace = true
serde_json.workspace = true serde_json.workspace = true
similar.workspace = true similar.workspace = true
streaming-iterator.workspace = true smallbitvec.workspace = true
thiserror.workspace = true
tiny_http.workspace = true tiny_http.workspace = true
walkdir.workspace = true walkdir.workspace = true
wasmparser.workspace = true wasmparser.workspace = true
webbrowser.workspace = true webbrowser.workspace = true
tree-sitter.workspace = true tree-sitter.workspace = true
tree-sitter-generate.workspace = true
tree-sitter-config.workspace = true tree-sitter-config.workspace = true
tree-sitter-highlight.workspace = true tree-sitter-highlight.workspace = true
tree-sitter-loader.workspace = true tree-sitter-loader.workspace = true
tree-sitter-tags.workspace = true tree-sitter-tags.workspace = true
[target."cfg(windows)".dependencies]
url = "2.5.2"
[dev-dependencies] [dev-dependencies]
encoding_rs = "0.8.35"
widestring = "1.2.1"
tree_sitter_proc_macro = { path = "src/tests/proc_macro", package = "tree-sitter-tests-proc-macro" } tree_sitter_proc_macro = { path = "src/tests/proc_macro", package = "tree-sitter-tests-proc-macro" }
tempfile.workspace = true tempfile.workspace = true

View file

@ -7,15 +7,14 @@
[npmjs.com]: https://www.npmjs.org/package/tree-sitter-cli [npmjs.com]: https://www.npmjs.org/package/tree-sitter-cli
[npmjs.com badge]: https://img.shields.io/npm/v/tree-sitter-cli.svg?color=%23BF4A4A [npmjs.com badge]: https://img.shields.io/npm/v/tree-sitter-cli.svg?color=%23BF4A4A
The Tree-sitter CLI allows you to develop, test, and use Tree-sitter grammars from the command line. It works on `MacOS`, The Tree-sitter CLI allows you to develop, test, and use Tree-sitter grammars from the command line. It works on MacOS, Linux, and Windows.
`Linux`, and `Windows`.
### Installation ### Installation
You can install the `tree-sitter-cli` with `cargo`: You can install the `tree-sitter-cli` with `cargo`:
```sh ```sh
cargo install --locked tree-sitter-cli cargo install tree-sitter-cli
``` ```
or with `npm`: or with `npm`:
@ -35,11 +34,9 @@ The `tree-sitter` binary itself has no dependencies, but specific commands have
### Commands ### Commands
* `generate` - The `tree-sitter generate` command will generate a Tree-sitter parser based on the grammar in the current * `generate` - The `tree-sitter generate` command will generate a Tree-sitter parser based on the grammar in the current working directory. See [the documentation] for more information.
working directory. See [the documentation] for more information.
* `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. * `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. See [the documentation] for more information.
See [the documentation] for more information.
* `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers. * `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers.

View file

@ -3,77 +3,70 @@ use std::{
env, fs, env, fs,
path::{Path, PathBuf}, path::{Path, PathBuf},
str, str,
sync::LazyLock,
time::Instant, time::Instant,
}; };
use anyhow::Context; use anyhow::Context;
use log::info; use lazy_static::lazy_static;
use tree_sitter::{Language, Parser, Query}; use tree_sitter::{Language, Parser, Query};
use tree_sitter_loader::{CompileConfig, Loader}; use tree_sitter_loader::{CompileConfig, Loader};
include!("../src/tests/helpers/dirs.rs"); include!("../src/tests/helpers/dirs.rs");
static LANGUAGE_FILTER: LazyLock<Option<String>> = lazy_static! {
LazyLock::new(|| env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok()); static ref LANGUAGE_FILTER: Option<String> =
static EXAMPLE_FILTER: LazyLock<Option<String>> = env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok();
LazyLock::new(|| env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok()); static ref EXAMPLE_FILTER: Option<String> =
static REPETITION_COUNT: LazyLock<usize> = LazyLock::new(|| { env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok();
env::var("TREE_SITTER_BENCHMARK_REPETITION_COUNT") static ref REPETITION_COUNT: usize = env::var("TREE_SITTER_BENCHMARK_REPETITION_COUNT")
.map(|s| s.parse::<usize>().unwrap()) .map(|s| s.parse::<usize>().unwrap())
.unwrap_or(5) .unwrap_or(5);
}); static ref TEST_LOADER: Loader = Loader::with_parser_lib_path(SCRATCH_DIR.clone());
static TEST_LOADER: LazyLock<Loader> = static ref EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR: BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)> = {
LazyLock::new(|| Loader::with_parser_lib_path(SCRATCH_DIR.clone())); fn process_dir(result: &mut BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, dir: &Path) {
if dir.join("grammar.js").exists() {
let relative_path = dir.strip_prefix(GRAMMARS_DIR.as_path()).unwrap();
let (example_paths, query_paths) =
result.entry(relative_path.to_owned()).or_default();
#[allow(clippy::type_complexity)] if let Ok(example_files) = fs::read_dir(dir.join("examples")) {
static EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR: LazyLock< example_paths.extend(example_files.filter_map(|p| {
BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, let p = p.unwrap().path();
> = LazyLock::new(|| { if p.is_file() {
fn process_dir(result: &mut BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, dir: &Path) { Some(p)
if dir.join("grammar.js").exists() { } else {
let relative_path = dir.strip_prefix(GRAMMARS_DIR.as_path()).unwrap(); None
let (example_paths, query_paths) = result.entry(relative_path.to_owned()).or_default(); }
}));
}
if let Ok(example_files) = fs::read_dir(dir.join("examples")) { if let Ok(query_files) = fs::read_dir(dir.join("queries")) {
example_paths.extend(example_files.filter_map(|p| { query_paths.extend(query_files.filter_map(|p| {
let p = p.unwrap().path(); let p = p.unwrap().path();
if p.is_file() { if p.is_file() {
Some(p) Some(p)
} else { } else {
None None
}
}));
}
} else {
for entry in fs::read_dir(dir).unwrap() {
let entry = entry.unwrap().path();
if entry.is_dir() {
process_dir(result, &entry);
} }
}));
}
if let Ok(query_files) = fs::read_dir(dir.join("queries")) {
query_paths.extend(query_files.filter_map(|p| {
let p = p.unwrap().path();
if p.is_file() {
Some(p)
} else {
None
}
}));
}
} else {
for entry in fs::read_dir(dir).unwrap() {
let entry = entry.unwrap().path();
if entry.is_dir() {
process_dir(result, &entry);
} }
} }
} }
}
let mut result = BTreeMap::new(); let mut result = BTreeMap::new();
process_dir(&mut result, &GRAMMARS_DIR); process_dir(&mut result, &GRAMMARS_DIR);
result result
}); };
}
fn main() { fn main() {
tree_sitter_cli::logger::init();
let max_path_length = EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR let max_path_length = EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR
.values() .values()
.flat_map(|(e, q)| { .flat_map(|(e, q)| {
@ -84,7 +77,7 @@ fn main() {
.max() .max()
.unwrap_or(0); .unwrap_or(0);
info!("Benchmarking with {} repetitions", *REPETITION_COUNT); eprintln!("Benchmarking with {} repetitions", *REPETITION_COUNT);
let mut parser = Parser::new(); let mut parser = Parser::new();
let mut all_normal_speeds = Vec::new(); let mut all_normal_speeds = Vec::new();
@ -101,11 +94,11 @@ fn main() {
} }
} }
info!("\nLanguage: {language_name}"); eprintln!("\nLanguage: {language_name}");
let language = get_language(language_path); let language = get_language(language_path);
parser.set_language(&language).unwrap(); parser.set_language(&language).unwrap();
info!(" Constructing Queries"); eprintln!(" Constructing Queries");
for path in query_paths { for path in query_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() { if let Some(filter) = EXAMPLE_FILTER.as_ref() {
if !path.to_str().unwrap().contains(filter.as_str()) { if !path.to_str().unwrap().contains(filter.as_str()) {
@ -115,12 +108,12 @@ fn main() {
parse(path, max_path_length, |source| { parse(path, max_path_length, |source| {
Query::new(&language, str::from_utf8(source).unwrap()) Query::new(&language, str::from_utf8(source).unwrap())
.with_context(|| format!("Query file path: {}", path.display())) .with_context(|| format!("Query file path: {path:?}"))
.expect("Failed to parse query"); .expect("Failed to parse query");
}); });
} }
info!(" Parsing Valid Code:"); eprintln!(" Parsing Valid Code:");
let mut normal_speeds = Vec::new(); let mut normal_speeds = Vec::new();
for example_path in example_paths { for example_path in example_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() { if let Some(filter) = EXAMPLE_FILTER.as_ref() {
@ -134,7 +127,7 @@ fn main() {
})); }));
} }
info!(" Parsing Invalid Code (mismatched languages):"); eprintln!(" Parsing Invalid Code (mismatched languages):");
let mut error_speeds = Vec::new(); let mut error_speeds = Vec::new();
for (other_language_path, (example_paths, _)) in for (other_language_path, (example_paths, _)) in
EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter() EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter()
@ -155,30 +148,30 @@ fn main() {
} }
if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) { if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) {
info!(" Average Speed (normal): {average_normal} bytes/ms"); eprintln!(" Average Speed (normal): {average_normal} bytes/ms");
info!(" Worst Speed (normal): {worst_normal} bytes/ms"); eprintln!(" Worst Speed (normal): {worst_normal} bytes/ms");
} }
if let Some((average_error, worst_error)) = aggregate(&error_speeds) { if let Some((average_error, worst_error)) = aggregate(&error_speeds) {
info!(" Average Speed (errors): {average_error} bytes/ms"); eprintln!(" Average Speed (errors): {average_error} bytes/ms");
info!(" Worst Speed (errors): {worst_error} bytes/ms"); eprintln!(" Worst Speed (errors): {worst_error} bytes/ms");
} }
all_normal_speeds.extend(normal_speeds); all_normal_speeds.extend(normal_speeds);
all_error_speeds.extend(error_speeds); all_error_speeds.extend(error_speeds);
} }
info!("\n Overall"); eprintln!("\n Overall");
if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) { if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) {
info!(" Average Speed (normal): {average_normal} bytes/ms"); eprintln!(" Average Speed (normal): {average_normal} bytes/ms");
info!(" Worst Speed (normal): {worst_normal} bytes/ms"); eprintln!(" Worst Speed (normal): {worst_normal} bytes/ms");
} }
if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) { if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) {
info!(" Average Speed (errors): {average_error} bytes/ms"); eprintln!(" Average Speed (errors): {average_error} bytes/ms");
info!(" Worst Speed (errors): {worst_error} bytes/ms"); eprintln!(" Worst Speed (errors): {worst_error} bytes/ms");
} }
info!(""); eprintln!();
} }
fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> { fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> {
@ -197,8 +190,14 @@ fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> {
} }
fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) -> usize { fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) -> usize {
eprint!(
" {:width$}\t",
path.file_name().unwrap().to_str().unwrap(),
width = max_path_length
);
let source_code = fs::read(path) let source_code = fs::read(path)
.with_context(|| format!("Failed to read {}", path.display())) .with_context(|| format!("Failed to read {path:?}"))
.unwrap(); .unwrap();
let time = Instant::now(); let time = Instant::now();
for _ in 0..*REPETITION_COUNT { for _ in 0..*REPETITION_COUNT {
@ -207,9 +206,8 @@ fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) ->
let duration = time.elapsed() / (*REPETITION_COUNT as u32); let duration = time.elapsed() / (*REPETITION_COUNT as u32);
let duration_ns = duration.as_nanos(); let duration_ns = duration.as_nanos();
let speed = ((source_code.len() as u128) * 1_000_000) / duration_ns; let speed = ((source_code.len() as u128) * 1_000_000) / duration_ns;
info!( eprintln!(
" {:max_path_length$}\ttime {:>7.2} ms\t\tspeed {speed:>6} bytes/ms", "time {:>7.2} ms\t\tspeed {speed:>6} bytes/ms",
path.file_name().unwrap().to_str().unwrap(),
(duration_ns as f64) / 1e6, (duration_ns as f64) / 1e6,
); );
speed as usize speed as usize
@ -219,6 +217,6 @@ fn get_language(path: &Path) -> Language {
let src_path = GRAMMARS_DIR.join(path).join("src"); let src_path = GRAMMARS_DIR.join(path).join("src");
TEST_LOADER TEST_LOADER
.load_language_at_path(CompileConfig::new(&src_path, None, None)) .load_language_at_path(CompileConfig::new(&src_path, None, None))
.with_context(|| format!("Failed to load language at path {}", src_path.display())) .with_context(|| format!("Failed to load language at path {src_path:?}"))
.unwrap() .unwrap()
} }

142
cli/build.rs Normal file
View file

@ -0,0 +1,142 @@
use std::{
env,
ffi::OsStr,
fs,
path::{Path, PathBuf},
time::SystemTime,
};
fn main() {
if let Some(git_sha) = read_git_sha() {
println!("cargo:rustc-env=BUILD_SHA={git_sha}");
}
println!("cargo:rustc-check-cfg=cfg(sanitizing)");
println!("cargo:rustc-check-cfg=cfg(TREE_SITTER_EMBED_WASM_BINDING)");
if web_playground_files_present() {
println!("cargo:rustc-cfg=TREE_SITTER_EMBED_WASM_BINDING");
}
let build_time = SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs_f64();
println!("cargo:rustc-env=BUILD_TIME={build_time}");
#[cfg(any(
target_os = "linux",
target_os = "android",
target_os = "freebsd",
target_os = "openbsd",
target_os = "netbsd",
target_os = "dragonfly",
))]
{
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()).join("dynamic-symbols.txt");
std::fs::write(
&out_dir,
"{
ts_current_malloc;
ts_current_calloc;
ts_current_realloc;
ts_current_free;
};",
)
.unwrap();
println!(
"cargo:rustc-link-arg=-Wl,--dynamic-list={}",
out_dir.display()
);
}
}
fn web_playground_files_present() -> bool {
let paths = [
"../docs/assets/js/playground.js",
"../lib/binding_web/tree-sitter.js",
"../lib/binding_web/tree-sitter.wasm",
];
paths.iter().all(|p| Path::new(p).exists())
}
fn read_git_sha() -> Option<String> {
let mut repo_path = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
let mut git_path;
loop {
git_path = repo_path.join(".git");
if git_path.exists() {
break;
}
if !repo_path.pop() {
return None;
}
}
let git_dir_path;
if git_path.is_dir() {
git_dir_path = git_path;
} else if let Ok(git_path_content) = fs::read_to_string(&git_path) {
git_dir_path = repo_path.join(git_path_content.get("gitdir: ".len()..).unwrap().trim_end());
} else {
return None;
}
let git_head_path = git_dir_path.join("HEAD");
if let Some(path) = git_head_path.to_str() {
println!("cargo:rerun-if-changed={path}");
}
if let Ok(mut head_content) = fs::read_to_string(&git_head_path) {
if head_content.ends_with('\n') {
head_content.pop();
}
// If we're on a branch, read the SHA from the ref file.
if head_content.starts_with("ref: ") {
head_content.replace_range(0.."ref: ".len(), "");
let ref_filename = {
// Go to real non-worktree gitdir
let git_dir_path = git_dir_path
.parent()
.and_then(|p| {
p.file_name()
.map(|n| n == OsStr::new("worktrees"))
.and_then(|x| x.then(|| p.parent()))
})
.flatten()
.unwrap_or(&git_dir_path);
let file = git_dir_path.join(&head_content);
if file.is_file() {
file
} else {
let packed_refs = git_dir_path.join("packed-refs");
if let Ok(packed_refs_content) = fs::read_to_string(&packed_refs) {
for line in packed_refs_content.lines() {
if let Some((hash, r#ref)) = line.split_once(' ') {
if r#ref == head_content {
if let Some(path) = packed_refs.to_str() {
println!("cargo:rerun-if-changed={path}");
}
return Some(hash.to_string());
}
}
}
}
return None;
}
};
if let Some(path) = ref_filename.to_str() {
println!("cargo:rerun-if-changed={path}");
}
return fs::read_to_string(&ref_filename).ok();
}
// If we're on a detached commit, then the `HEAD` file itself contains the sha.
if head_content.len() == 40 {
return Some(head_content);
}
}
None
}
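The build script above only exports BUILD_SHA and BUILD_TIME (plus, on the listed targets, a dynamic-list linker argument); the values are consumed elsewhere at compile time. A minimal sketch of the consuming side, assuming the same environment variable names (`option_env!` is used because BUILD_SHA is absent when no .git directory is found):

// Hypothetical consumer of the env vars emitted by this build script.
fn version_banner() -> String {
    let sha = option_env!("BUILD_SHA").unwrap_or("unknown");
    let build_time = option_env!("BUILD_TIME").unwrap_or("0");
    format!("tree-sitter (commit {sha}, built at unix time {build_time})")
}

fn main() {
    println!("{}", version_banner());
}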

View file

@ -8,20 +8,12 @@ rust-version.workspace = true
readme = "README.md" readme = "README.md"
homepage.workspace = true homepage.workspace = true
repository.workspace = true repository.workspace = true
documentation = "https://docs.rs/tree-sitter-config"
license.workspace = true license.workspace = true
keywords.workspace = true keywords.workspace = true
categories.workspace = true categories.workspace = true
[lib]
path = "src/tree_sitter_config.rs"
[lints]
workspace = true
[dependencies] [dependencies]
etcetera.workspace = true anyhow.workspace = true
log.workspace = true dirs.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
thiserror.workspace = true

View file

@ -1,54 +1,10 @@
#![cfg_attr(not(any(test, doctest)), doc = include_str!("../README.md"))] #![doc = include_str!("../README.md")]
use std::{ use std::{env, fs, path::PathBuf};
env, fs,
path::{Path, PathBuf},
};
use etcetera::BaseStrategy as _; use anyhow::{anyhow, Context, Result};
use log::warn;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use thiserror::Error;
pub type ConfigResult<T> = Result<T, ConfigError>;
#[derive(Debug, Error)]
pub enum ConfigError {
#[error("Bad JSON config {0} -- {1}")]
ConfigRead(String, serde_json::Error),
#[error(transparent)]
HomeDir(#[from] etcetera::HomeDirError),
#[error(transparent)]
IO(IoError),
#[error(transparent)]
Serialization(#[from] serde_json::Error),
}
#[derive(Debug, Error)]
pub struct IoError {
pub error: std::io::Error,
pub path: Option<String>,
}
impl IoError {
fn new(error: std::io::Error, path: Option<&Path>) -> Self {
Self {
error,
path: path.map(|p| p.to_string_lossy().to_string()),
}
}
}
impl std::fmt::Display for IoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.error)?;
if let Some(ref path) = self.path {
write!(f, " ({path})")?;
}
Ok(())
}
}
/// Holds the contents of tree-sitter's configuration file. /// Holds the contents of tree-sitter's configuration file.
/// ///
@ -65,7 +21,7 @@ pub struct Config {
} }
impl Config { impl Config {
pub fn find_config_file() -> ConfigResult<Option<PathBuf>> { pub fn find_config_file() -> Result<Option<PathBuf>> {
if let Ok(path) = env::var("TREE_SITTER_DIR") { if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path); let mut path = PathBuf::from(path);
path.push("config.json"); path.push("config.json");
@ -82,28 +38,8 @@ impl Config {
return Ok(Some(xdg_path)); return Ok(Some(xdg_path));
} }
if cfg!(target_os = "macos") { let legacy_path = dirs::home_dir()
let legacy_apple_path = etcetera::base_strategy::Apple::new()? .ok_or_else(|| anyhow!("Cannot determine home directory"))?
.data_dir() // `$HOME/Library/Application Support/`
.join("tree-sitter")
.join("config.json");
if legacy_apple_path.is_file() {
let xdg_dir = xdg_path.parent().unwrap();
fs::create_dir_all(xdg_dir)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(xdg_dir))))?;
fs::rename(&legacy_apple_path, &xdg_path).map_err(|e| {
ConfigError::IO(IoError::new(e, Some(legacy_apple_path.as_path())))
})?;
warn!(
"Your config.json file has been automatically migrated from \"{}\" to \"{}\"",
legacy_apple_path.display(),
xdg_path.display()
);
return Ok(Some(xdg_path));
}
}
let legacy_path = etcetera::home_dir()?
.join(".tree-sitter") .join(".tree-sitter")
.join("config.json"); .join("config.json");
if legacy_path.is_file() { if legacy_path.is_file() {
@ -113,9 +49,9 @@ impl Config {
Ok(None) Ok(None)
} }
fn xdg_config_file() -> ConfigResult<PathBuf> { fn xdg_config_file() -> Result<PathBuf> {
let xdg_path = etcetera::choose_base_strategy()? let xdg_path = dirs::config_dir()
.config_dir() .ok_or_else(|| anyhow!("Cannot determine config directory"))?
.join("tree-sitter") .join("tree-sitter")
.join("config.json"); .join("config.json");
Ok(xdg_path) Ok(xdg_path)
@ -127,10 +63,10 @@ impl Config {
/// - Location specified by the path parameter if provided /// - Location specified by the path parameter if provided
/// - `$TREE_SITTER_DIR/config.json`, if the `TREE_SITTER_DIR` environment variable is set /// - `$TREE_SITTER_DIR/config.json`, if the `TREE_SITTER_DIR` environment variable is set
/// - `tree-sitter/config.json` in your default user configuration directory, as determined by /// - `tree-sitter/config.json` in your default user configuration directory, as determined by
/// [`etcetera::choose_base_strategy`](https://docs.rs/etcetera/*/etcetera/#basestrategy) /// [`dirs::config_dir`](https://docs.rs/dirs/*/dirs/fn.config_dir.html)
/// - `$HOME/.tree-sitter/config.json` as a fallback from where tree-sitter _used_ to store /// - `$HOME/.tree-sitter/config.json` as a fallback from where tree-sitter _used_ to store
/// its configuration /// its configuration
pub fn load(path: Option<PathBuf>) -> ConfigResult<Self> { pub fn load(path: Option<PathBuf>) -> Result<Self> {
let location = if let Some(path) = path { let location = if let Some(path) = path {
path path
} else if let Some(path) = Self::find_config_file()? { } else if let Some(path) = Self::find_config_file()? {
@ -140,9 +76,9 @@ impl Config {
}; };
let content = fs::read_to_string(&location) let content = fs::read_to_string(&location)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(location.as_path()))))?; .with_context(|| format!("Failed to read {}", &location.to_string_lossy()))?;
let config = serde_json::from_str(&content) let config = serde_json::from_str(&content)
.map_err(|e| ConfigError::ConfigRead(location.to_string_lossy().to_string(), e))?; .with_context(|| format!("Bad JSON config {}", &location.to_string_lossy()))?;
Ok(Self { location, config }) Ok(Self { location, config })
} }
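The lookup order documented above can be exercised directly. A minimal sketch, assuming the anyhow-flavoured signatures on the right-hand side of this diff:

use tree_sitter_config::Config;

fn main() -> anyhow::Result<()> {
    // Checks $TREE_SITTER_DIR/config.json first, then the XDG config dir,
    // then the legacy ~/.tree-sitter/config.json location.
    match Config::find_config_file()? {
        Some(path) => println!("would load {}", path.display()),
        None => println!("no config file found; Config::initial() supplies defaults"),
    }
    Ok(())
}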
@ -152,7 +88,7 @@ impl Config {
/// disk. /// disk.
/// ///
/// (Note that this is typically only done by the `tree-sitter init-config` command.) /// (Note that this is typically only done by the `tree-sitter init-config` command.)
pub fn initial() -> ConfigResult<Self> { pub fn initial() -> Result<Self> {
let location = if let Ok(path) = env::var("TREE_SITTER_DIR") { let location = if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path); let mut path = PathBuf::from(path);
path.push("config.json"); path.push("config.json");
@ -165,20 +101,17 @@ impl Config {
} }
/// Saves this configuration to the file that it was originally loaded from. /// Saves this configuration to the file that it was originally loaded from.
pub fn save(&self) -> ConfigResult<()> { pub fn save(&self) -> Result<()> {
let json = serde_json::to_string_pretty(&self.config)?; let json = serde_json::to_string_pretty(&self.config)?;
let config_dir = self.location.parent().unwrap(); fs::create_dir_all(self.location.parent().unwrap())?;
fs::create_dir_all(config_dir) fs::write(&self.location, json)?;
.map_err(|e| ConfigError::IO(IoError::new(e, Some(config_dir))))?;
fs::write(&self.location, json)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(self.location.as_path()))))?;
Ok(()) Ok(())
} }
/// Parses a component-specific configuration from the configuration file. The type `C` must /// Parses a component-specific configuration from the configuration file. The type `C` must
/// be [deserializable](https://docs.rs/serde/*/serde/trait.Deserialize.html) from a JSON /// be [deserializable](https://docs.rs/serde/*/serde/trait.Deserialize.html) from a JSON
/// object, and must only include the fields relevant to that component. /// object, and must only include the fields relevant to that component.
pub fn get<C>(&self) -> ConfigResult<C> pub fn get<C>(&self) -> Result<C>
where where
C: for<'de> Deserialize<'de>, C: for<'de> Deserialize<'de>,
{ {
@ -189,7 +122,7 @@ impl Config {
/// Adds a component-specific configuration to the configuration file. The type `C` must be /// Adds a component-specific configuration to the configuration file. The type `C` must be
/// [serializable](https://docs.rs/serde/*/serde/trait.Serialize.html) into a JSON object, and /// [serializable](https://docs.rs/serde/*/serde/trait.Serialize.html) into a JSON object, and
/// must only include the fields relevant to that component. /// must only include the fields relevant to that component.
pub fn add<C>(&mut self, config: C) -> ConfigResult<()> pub fn add<C>(&mut self, config: C) -> Result<()>
where where
C: Serialize, C: Serialize,
{ {
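As a concrete illustration of the get/add API above, a component might round-trip its own settings like this; it is only a sketch, again assuming the anyhow-based signatures on the right-hand side, and the ParserConfig struct with its parser_directories field is a made-up example section of config.json, not something defined in this diff:

use serde::{Deserialize, Serialize};
use tree_sitter_config::Config;

// Hypothetical component-specific section of config.json.
#[derive(Serialize, Deserialize)]
struct ParserConfig {
    #[serde(default)]
    parser_directories: Vec<String>,
}

fn main() -> anyhow::Result<()> {
    let mut config = Config::load(None)?;
    // Deserialize only the fields this component cares about.
    let parsers: ParserConfig = config.get()?;
    println!("{} parser directories", parsers.parser_directories.len());

    // Merge the component's settings back and persist them.
    config.add(parsers)?;
    config.save()?;
    Ok(())
}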

View file

@ -1,12 +1,12 @@
{ {
"name": "eslint-config-treesitter", "name": "eslint-config-treesitter",
"version": "1.0.2", "version": "1.0.0",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "eslint-config-treesitter", "name": "eslint-config-treesitter",
"version": "1.0.2", "version": "1.0.0",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"eslint-plugin-jsdoc": "^50.2.4" "eslint-plugin-jsdoc": "^50.2.4"
@ -128,9 +128,10 @@
} }
}, },
"node_modules/@eslint/plugin-kit": { "node_modules/@eslint/plugin-kit": {
"version": "0.2.3", "version": "0.2.0",
"resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.2.3.tgz", "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.2.0.tgz",
"integrity": "sha512-2b/g5hRmpbb1o4GnTZax9N9m0FXzz9OV42ZzI4rDDMDuHUqigAiQCEWChBWCY4ztAGVRjoWT19v0yMmc5/L5kA==", "integrity": "sha512-vH9PiIMMwvhCx31Af3HiGzsVNULDbyVkHXwlemn/B0TFj/00ho3y55efXrUZTfQipxoHC5u4xq6zblww1zm1Ig==",
"license": "Apache-2.0",
"peer": true, "peer": true,
"dependencies": { "dependencies": {
"levn": "^0.4.1" "levn": "^0.4.1"
@ -305,9 +306,9 @@
"peer": true "peer": true
}, },
"node_modules/brace-expansion": { "node_modules/brace-expansion": {
"version": "1.1.12", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
"license": "MIT", "license": "MIT",
"peer": true, "peer": true,
"dependencies": { "dependencies": {
@ -379,9 +380,10 @@
"peer": true "peer": true
}, },
"node_modules/cross-spawn": { "node_modules/cross-spawn": {
"version": "7.0.5", "version": "7.0.3",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.5.tgz", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
"integrity": "sha512-ZVJrKKYunU38/76t0RMOulHOnUcbU9GbpWKAOZ0mhjr7CX6FVrH+4FrAapSOekrgFQ3f/8gwMEuIft0aKq6Hug==", "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
"license": "MIT",
"peer": true, "peer": true,
"dependencies": { "dependencies": {
"path-key": "^3.1.0", "path-key": "^3.1.0",
@ -805,9 +807,9 @@
"peer": true "peer": true
}, },
"node_modules/js-yaml": { "node_modules/js-yaml": {
"version": "4.1.1", "version": "4.1.0",
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
"license": "MIT", "license": "MIT",
"peer": true, "peer": true,
"dependencies": { "dependencies": {

View file

@ -4,8 +4,7 @@
"description": "Eslint configuration for Tree-sitter grammar files", "description": "Eslint configuration for Tree-sitter grammar files",
"repository": { "repository": {
"type": "git", "type": "git",
"url": "git+https://github.com/tree-sitter/tree-sitter.git", "url": "git+https://github.com/tree-sitter/tree-sitter.git"
"directory": "crates/cli/eslint"
}, },
"license": "MIT", "license": "MIT",
"author": "Amaan Qureshi <amaanq12@gmail.com>", "author": "Amaan Qureshi <amaanq12@gmail.com>",
@ -21,9 +20,5 @@
}, },
"peerDependencies": { "peerDependencies": {
"eslint": ">= 9" "eslint": ">= 9"
},
"scripts": {
"prepack": "cp ../../../LICENSE .",
"postpack": "rm LICENSE"
} }
} }

View file

@ -8,40 +8,30 @@ rust-version.workspace = true
readme = "README.md" readme = "README.md"
homepage.workspace = true homepage.workspace = true
repository.workspace = true repository.workspace = true
documentation = "https://docs.rs/tree-sitter-loader"
license.workspace = true license.workspace = true
keywords.workspace = true keywords.workspace = true
categories.workspace = true categories.workspace = true
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]
[lib]
path = "src/loader.rs"
[lints]
workspace = true
[features] [features]
wasm = ["tree-sitter/wasm"] wasm = ["tree-sitter/wasm"]
# TODO: For backward compatibility these must be enabled by default,
# consider removing for the next semver incompatible release
default = ["tree-sitter-highlight", "tree-sitter-tags"] default = ["tree-sitter-highlight", "tree-sitter-tags"]
[dependencies] [dependencies]
anyhow.workspace = true
cc.workspace = true cc.workspace = true
etcetera.workspace = true dirs.workspace = true
fs4.workspace = true fs4.workspace = true
indoc.workspace = true indoc.workspace = true
libloading.workspace = true libloading.workspace = true
log.workspace = true
once_cell.workspace = true once_cell.workspace = true
path-slash.workspace = true
regex.workspace = true regex.workspace = true
semver.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
tempfile.workspace = true tempfile.workspace = true
thiserror.workspace = true
tree-sitter = { workspace = true } tree-sitter = {workspace = true}
tree-sitter-highlight = { workspace = true, optional = true } tree-sitter-highlight = {workspace = true, optional = true}
tree-sitter-tags = { workspace = true, optional = true } tree-sitter-tags = {workspace = true, optional = true}

View file

@ -7,4 +7,7 @@ fn main() {
"cargo:rustc-env=BUILD_HOST={}", "cargo:rustc-env=BUILD_HOST={}",
std::env::var("HOST").unwrap() std::env::var("HOST").unwrap()
); );
let emscripten_version = std::fs::read_to_string("emscripten-version").unwrap();
println!("cargo:rustc-env=EMSCRIPTEN_VERSION={emscripten_version}");
} }

View file

@ -0,0 +1 @@
3.1.64

1448
cli/loader/src/lib.rs Normal file

File diff suppressed because it is too large
File diff suppressed because it is too large

View file

@ -10,7 +10,6 @@ type PrecRightRule = { type: 'PREC_RIGHT'; content: Rule; value: number };
type PrecRule = { type: 'PREC'; content: Rule; value: number }; type PrecRule = { type: 'PREC'; content: Rule; value: number };
type Repeat1Rule = { type: 'REPEAT1'; content: Rule }; type Repeat1Rule = { type: 'REPEAT1'; content: Rule };
type RepeatRule = { type: 'REPEAT'; content: Rule }; type RepeatRule = { type: 'REPEAT'; content: Rule };
type ReservedRule = { type: 'RESERVED'; content: Rule; context_name: string };
type SeqRule = { type: 'SEQ'; members: Rule[] }; type SeqRule = { type: 'SEQ'; members: Rule[] };
type StringRule = { type: 'STRING'; value: string }; type StringRule = { type: 'STRING'; value: string };
type SymbolRule<Name extends string> = { type: 'SYMBOL'; name: Name }; type SymbolRule<Name extends string> = { type: 'SYMBOL'; name: Name };
@ -29,19 +28,12 @@ type Rule =
| PrecRule | PrecRule
| Repeat1Rule | Repeat1Rule
| RepeatRule | RepeatRule
| ReservedRule
| SeqRule | SeqRule
| StringRule | StringRule
| SymbolRule<string> | SymbolRule<string>
| TokenRule; | TokenRule;
declare class RustRegex { type RuleOrLiteral = Rule | RegExp | string;
value: string;
constructor(pattern: string);
}
type RuleOrLiteral = Rule | RegExp | RustRegex | string;
type GrammarSymbols<RuleName extends string> = { type GrammarSymbols<RuleName extends string> = {
[name in RuleName]: SymbolRule<name>; [name in RuleName]: SymbolRule<name>;
@ -50,7 +42,7 @@ type GrammarSymbols<RuleName extends string> = {
type RuleBuilder<RuleName extends string> = ( type RuleBuilder<RuleName extends string> = (
$: GrammarSymbols<RuleName>, $: GrammarSymbols<RuleName>,
previous?: Rule, previous: Rule,
) => RuleOrLiteral; ) => RuleOrLiteral;
type RuleBuilders< type RuleBuilders<
@ -113,7 +105,7 @@ interface Grammar<
* @param $ grammar rules * @param $ grammar rules
* @param previous array of externals from the base schema, if any * @param previous array of externals from the base schema, if any
* *
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers/4-external-scanners * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
*/ */
externals?: ( externals?: (
$: Record<string, SymbolRule<string>>, $: Record<string, SymbolRule<string>>,
@ -151,7 +143,7 @@ interface Grammar<
* *
* @param $ grammar rules * @param $ grammar rules
* *
* @see https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types * @see https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
*/ */
supertypes?: ( supertypes?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>, $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
@ -164,20 +156,9 @@ interface Grammar<
* *
* @param $ grammar rules * @param $ grammar rules
* *
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers/3-writing-the-grammar#keyword-extraction * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction
*/ */
word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral; word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral;
/**
* Mapping of names to reserved word sets. The first reserved word set is the
* global word set, meaning it applies to every rule in every parse state.
* The other word sets can be used with the `reserved` function.
*/
reserved?: Record<
string,
($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral[]
>;
} }
type GrammarSchema<RuleName extends string> = { type GrammarSchema<RuleName extends string> = {
@ -262,7 +243,7 @@ declare function optional(rule: RuleOrLiteral): ChoiceRule;
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
declare const prec: { declare const prec: {
(value: string | number, rule: RuleOrLiteral): PrecRule; (value: String | number, rule: RuleOrLiteral): PrecRule;
/** /**
* Marks the given rule as left-associative (and optionally applies a * Marks the given rule as left-associative (and optionally applies a
@ -278,7 +259,7 @@ declare const prec: {
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
left(rule: RuleOrLiteral): PrecLeftRule; left(rule: RuleOrLiteral): PrecLeftRule;
left(value: string | number, rule: RuleOrLiteral): PrecLeftRule; left(value: String | number, rule: RuleOrLiteral): PrecLeftRule;
/** /**
* Marks the given rule as right-associative (and optionally applies a * Marks the given rule as right-associative (and optionally applies a
@ -294,7 +275,7 @@ declare const prec: {
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
right(rule: RuleOrLiteral): PrecRightRule; right(rule: RuleOrLiteral): PrecRightRule;
right(value: string | number, rule: RuleOrLiteral): PrecRightRule; right(value: String | number, rule: RuleOrLiteral): PrecRightRule;
/** /**
* Marks the given rule with a numerical precedence which will be used to * Marks the given rule with a numerical precedence which will be used to
@ -311,7 +292,7 @@ declare const prec: {
* *
* @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html * @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html
*/ */
dynamic(value: string | number, rule: RuleOrLiteral): PrecDynamicRule; dynamic(value: String | number, rule: RuleOrLiteral): PrecDynamicRule;
}; };
/** /**
@ -331,15 +312,6 @@ declare function repeat(rule: RuleOrLiteral): RepeatRule;
*/ */
declare function repeat1(rule: RuleOrLiteral): Repeat1Rule; declare function repeat1(rule: RuleOrLiteral): Repeat1Rule;
/**
* Overrides the global reserved word set for a given rule. The word set name
* should be defined in the `reserved` field in the grammar.
*
* @param wordset name of the reserved word set
* @param rule rule that will use the reserved word set
*/
declare function reserved(wordset: string, rule: RuleOrLiteral): ReservedRule;
/** /**
* Creates a rule that matches any number of other rules, one after another. * Creates a rule that matches any number of other rules, one after another.
* It is analogous to simply writing multiple symbols next to each other * It is analogous to simply writing multiple symbols next to each other
@ -358,7 +330,7 @@ declare function sym<Name extends string>(name: Name): SymbolRule<Name>;
/** /**
* Marks the given rule as producing only a single token. Tree-sitter's * Marks the given rule as producing only a single token. Tree-sitter's
* default is to treat each string or RegExp literal in the grammar as a * default is to treat each String or RegExp literal in the grammar as a
* separate token. Each token is matched separately by the lexer and * separate token. Each token is matched separately by the lexer and
* returned as its own leaf node in the tree. The token function allows * returned as its own leaf node in the tree. The token function allows
* you to express a complex rule using the DSL functions (rather * you to express a complex rule using the DSL functions (rather

3
crates/cli/npm/install.js → cli/npm/install.js Normal file → Executable file
View file

@ -6,8 +6,7 @@ const http = require('http');
const https = require('https'); const https = require('https');
const packageJSON = require('./package.json'); const packageJSON = require('./package.json');
https.globalAgent.keepAlive = false; // Look to a results table in https://github.com/tree-sitter/tree-sitter/issues/2196
const matrix = { const matrix = {
platform: { platform: {
'darwin': { 'darwin': {

View file

@ -1,33 +1,24 @@
{ {
"name": "tree-sitter-cli", "name": "tree-sitter-cli",
"version": "0.27.0", "version": "0.23.0",
"author": { "author": "Max Brunsfeld",
"name": "Max Brunsfeld",
"email": "maxbrunsfeld@gmail.com"
},
"maintainers": [
{
"name": "Amaan Qureshi",
"email": "amaanq12@gmail.com"
}
],
"license": "MIT", "license": "MIT",
"repository": { "repository": {
"type": "git", "type": "git",
"url": "git+https://github.com/tree-sitter/tree-sitter.git", "url": "https://github.com/tree-sitter/tree-sitter.git"
"directory": "crates/cli/npm"
}, },
"description": "CLI for generating fast incremental parsers", "description": "CLI for generating fast incremental parsers",
"keywords": [ "keywords": [
"parser", "parser",
"lexer" "lexer"
], ],
"main": "lib/api/index.js",
"engines": { "engines": {
"node": ">=12.0.0" "node": ">=12.0.0"
}, },
"scripts": { "scripts": {
"install": "node install.js", "install": "node install.js",
"prepack": "cp ../../../LICENSE ../README.md .", "prepack": "cp ../../LICENSE ../README.md .",
"postpack": "rm LICENSE README.md" "postpack": "rm LICENSE README.md"
}, },
"bin": { "bin": {

View file

@ -40,11 +40,7 @@ extern "C" {
fn free(ptr: *mut c_void); fn free(ptr: *mut c_void);
} }
pub fn record<T>(f: impl FnOnce() -> T) -> T { pub fn record<T>(f: impl FnOnce() -> T) -> Result<T, String> {
record_checked(f).unwrap()
}
pub fn record_checked<T>(f: impl FnOnce() -> T) -> Result<T, String> {
RECORDER.with(|recorder| { RECORDER.with(|recorder| {
recorder.enabled.store(true, SeqCst); recorder.enabled.store(true, SeqCst);
recorder.allocation_count.store(0, SeqCst); recorder.allocation_count.store(0, SeqCst);
@ -97,49 +93,30 @@ fn record_dealloc(ptr: *mut c_void) {
}); });
} }
/// # Safety unsafe extern "C" fn ts_record_malloc(size: usize) -> *mut c_void {
///
/// The caller must ensure that the returned pointer is eventually
/// freed by calling `ts_record_free`.
#[must_use]
pub unsafe extern "C" fn ts_record_malloc(size: usize) -> *mut c_void {
let result = malloc(size); let result = malloc(size);
record_alloc(result); record_alloc(result);
result result
} }
/// # Safety unsafe extern "C" fn ts_record_calloc(count: usize, size: usize) -> *mut c_void {
///
/// The caller must ensure that the returned pointer is eventually
/// freed by calling `ts_record_free`.
#[must_use]
pub unsafe extern "C" fn ts_record_calloc(count: usize, size: usize) -> *mut c_void {
let result = calloc(count, size); let result = calloc(count, size);
record_alloc(result); record_alloc(result);
result result
} }
/// # Safety unsafe extern "C" fn ts_record_realloc(ptr: *mut c_void, size: usize) -> *mut c_void {
///
/// The caller must ensure that the returned pointer is eventually
/// freed by calling `ts_record_free`.
#[must_use]
pub unsafe extern "C" fn ts_record_realloc(ptr: *mut c_void, size: usize) -> *mut c_void {
let result = realloc(ptr, size); let result = realloc(ptr, size);
if ptr.is_null() { if ptr.is_null() {
record_alloc(result); record_alloc(result);
} else if !core::ptr::eq(ptr, result) { } else if ptr != result {
record_dealloc(ptr); record_dealloc(ptr);
record_alloc(result); record_alloc(result);
} }
result result
} }
/// # Safety unsafe extern "C" fn ts_record_free(ptr: *mut c_void) {
///
/// The caller must ensure that `ptr` was allocated by a previous call
/// to `ts_record_malloc`, `ts_record_calloc`, or `ts_record_realloc`.
pub unsafe extern "C" fn ts_record_free(ptr: *mut c_void) {
record_dealloc(ptr); record_dealloc(ptr);
free(ptr); free(ptr);
} }
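The recorder above is what the corpus fuzzing code further down wraps every parse in. A rough sketch of the intended usage, assuming the Result-returning `record` shown on the right-hand side of this hunk; it only notices allocations routed through the ts_record_* hooks above, so the closure must release every parser and tree it creates before returning:

fn parse_with_leak_check(
    language: &tree_sitter::Language,
    source: &str,
) -> Result<String, String> {
    allocations::record(|| {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        // Parser and Tree are dropped at the end of the closure, freeing their
        // C-side allocations; anything still outstanding turns this into an Err.
        tree.root_node().to_sexp()
    })
}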

View file

@ -23,7 +23,7 @@ pub fn check_consistent_sizes(tree: &Tree, input: &[u8]) {
let mut some_child_has_changes = false; let mut some_child_has_changes = false;
let mut actual_named_child_count = 0; let mut actual_named_child_count = 0;
for i in 0..node.child_count() { for i in 0..node.child_count() {
let child = node.child(i as u32).unwrap(); let child = node.child(i).unwrap();
assert!(child.start_byte() >= last_child_end_byte); assert!(child.start_byte() >= last_child_end_byte);
assert!(child.start_position() >= last_child_end_point); assert!(child.start_position() >= last_child_end_point);
check(child, line_offsets); check(child, line_offsets);

View file

@ -1,11 +1,6 @@
use std::{ use std::{collections::HashMap, env, fs, path::Path};
collections::HashMap,
env, fs,
path::{Path, PathBuf},
sync::LazyLock,
};
use log::{error, info}; use lazy_static::lazy_static;
use rand::Rng; use rand::Rng;
use regex::Regex; use regex::Regex;
use tree_sitter::{Language, Parser}; use tree_sitter::{Language, Parser};
@ -25,30 +20,19 @@ use crate::{
random::Rand, random::Rand,
}, },
parse::perform_edit, parse::perform_edit,
test::{parse_tests, strip_sexp_fields, DiffKey, TestDiff, TestEntry}, test::{parse_tests, print_diff, print_diff_key, strip_sexp_fields, TestEntry},
}; };
pub static LOG_ENABLED: LazyLock<bool> = LazyLock::new(|| env::var("TREE_SITTER_LOG").is_ok()); lazy_static! {
pub static ref LOG_ENABLED: bool = env::var("TREE_SITTER_LOG").is_ok();
pub static LOG_GRAPH_ENABLED: LazyLock<bool> = pub static ref LOG_GRAPH_ENABLED: bool = env::var("TREE_SITTER_LOG_GRAPHS").is_ok();
LazyLock::new(|| env::var("TREE_SITTER_LOG_GRAPHS").is_ok()); pub static ref LANGUAGE_FILTER: Option<String> = env::var("TREE_SITTER_LANGUAGE").ok();
pub static ref EXAMPLE_INCLUDE: Option<Regex> = regex_env_var("TREE_SITTER_EXAMPLE_INCLUDE");
pub static LANGUAGE_FILTER: LazyLock<Option<String>> = pub static ref EXAMPLE_EXCLUDE: Option<Regex> = regex_env_var("TREE_SITTER_EXAMPLE_EXCLUDE");
LazyLock::new(|| env::var("TREE_SITTER_LANGUAGE").ok()); pub static ref START_SEED: usize = new_seed();
pub static ref EDIT_COUNT: usize = int_env_var("TREE_SITTER_EDITS").unwrap_or(3);
pub static EXAMPLE_INCLUDE: LazyLock<Option<Regex>> = pub static ref ITERATION_COUNT: usize = int_env_var("TREE_SITTER_ITERATIONS").unwrap_or(10);
LazyLock::new(|| regex_env_var("TREE_SITTER_EXAMPLE_INCLUDE")); }
pub static EXAMPLE_EXCLUDE: LazyLock<Option<Regex>> =
LazyLock::new(|| regex_env_var("TREE_SITTER_EXAMPLE_EXCLUDE"));
pub static START_SEED: LazyLock<usize> = LazyLock::new(new_seed);
pub static EDIT_COUNT: LazyLock<usize> =
LazyLock::new(|| int_env_var("TREE_SITTER_EDITS").unwrap_or(3));
pub static ITERATION_COUNT: LazyLock<usize> =
LazyLock::new(|| int_env_var("TREE_SITTER_ITERATIONS").unwrap_or(10));
fn int_env_var(name: &'static str) -> Option<usize> { fn int_env_var(name: &'static str) -> Option<usize> {
env::var(name).ok().and_then(|e| e.parse().ok()) env::var(name).ok().and_then(|e| e.parse().ok())
@ -62,15 +46,13 @@ fn regex_env_var(name: &'static str) -> Option<Regex> {
pub fn new_seed() -> usize { pub fn new_seed() -> usize {
int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| { int_env_var("TREE_SITTER_SEED").unwrap_or_else(|| {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
let seed = rng.gen::<usize>(); rng.gen::<usize>()
info!("Seed: {seed}");
seed
}) })
} }
pub struct FuzzOptions { pub struct FuzzOptions {
pub skipped: Option<Vec<String>>, pub skipped: Option<Vec<String>>,
pub subdir: Option<PathBuf>, pub subdir: Option<String>,
pub edits: usize, pub edits: usize,
pub iterations: usize, pub iterations: usize,
pub include: Option<Regex>, pub include: Option<Regex>,
@ -109,12 +91,12 @@ pub fn fuzz_language_corpus(
let corpus_dir = grammar_dir.join(subdir).join("test").join("corpus"); let corpus_dir = grammar_dir.join(subdir).join("test").join("corpus");
if !corpus_dir.exists() || !corpus_dir.is_dir() { if !corpus_dir.exists() || !corpus_dir.is_dir() {
error!("No corpus directory found, ensure that you have a `test/corpus` directory in your grammar directory with at least one test file."); eprintln!("No corpus directory found, ensure that you have a `test/corpus` directory in your grammar directory with at least one test file.");
return; return;
} }
if std::fs::read_dir(&corpus_dir).unwrap().count() == 0 { if std::fs::read_dir(&corpus_dir).unwrap().count() == 0 {
error!("No corpus files found in `test/corpus`, ensure that you have at least one test file in your corpus directory."); eprintln!("No corpus files found in `test/corpus`, ensure that you have at least one test file in your corpus directory.");
return; return;
} }
@ -150,7 +132,7 @@ pub fn fuzz_language_corpus(
let dump_edits = env::var("TREE_SITTER_DUMP_EDITS").is_ok(); let dump_edits = env::var("TREE_SITTER_DUMP_EDITS").is_ok();
if log_seed { if log_seed {
info!(" start seed: {start_seed}"); println!(" start seed: {start_seed}");
} }
println!(); println!();
@ -164,7 +146,7 @@ pub fn fuzz_language_corpus(
println!(" {test_index}. {test_name}"); println!(" {test_index}. {test_name}");
let passed = allocations::record_checked(|| { let passed = allocations::record(|| {
let mut log_session = None; let mut log_session = None;
let mut parser = get_parser(&mut log_session, "log.html"); let mut parser = get_parser(&mut log_session, "log.html");
parser.set_language(language).unwrap(); parser.set_language(language).unwrap();
@ -183,8 +165,8 @@ pub fn fuzz_language_corpus(
if actual_output != test.output { if actual_output != test.output {
println!("Incorrect initial parse for {test_name}"); println!("Incorrect initial parse for {test_name}");
DiffKey::print(); print_diff_key();
println!("{}", TestDiff::new(&actual_output, &test.output)); print_diff(&actual_output, &test.output, true);
println!(); println!();
return false; return false;
} }
@ -192,7 +174,7 @@ pub fn fuzz_language_corpus(
true true
}) })
.unwrap_or_else(|e| { .unwrap_or_else(|e| {
error!("{e}"); eprintln!("Error: {e}");
false false
}); });
@ -208,7 +190,7 @@ pub fn fuzz_language_corpus(
for trial in 0..options.iterations { for trial in 0..options.iterations {
let seed = start_seed + trial; let seed = start_seed + trial;
let passed = allocations::record_checked(|| { let passed = allocations::record(|| {
let mut rand = Rand::new(seed); let mut rand = Rand::new(seed);
let mut log_session = None; let mut log_session = None;
let mut parser = get_parser(&mut log_session, "log.html"); let mut parser = get_parser(&mut log_session, "log.html");
@ -217,20 +199,19 @@ pub fn fuzz_language_corpus(
let mut input = test.input.clone(); let mut input = test.input.clone();
if options.log_graphs { if options.log_graphs {
info!("{}\n", String::from_utf8_lossy(&input)); eprintln!("{}\n", String::from_utf8_lossy(&input));
} }
// Perform a random series of edits and reparse. // Perform a random series of edits and reparse.
let edit_count = rand.unsigned(*EDIT_COUNT); let mut undo_stack = Vec::new();
let mut undo_stack = Vec::with_capacity(edit_count); for _ in 0..=rand.unsigned(*EDIT_COUNT) {
for _ in 0..=edit_count {
let edit = get_random_edit(&mut rand, &input); let edit = get_random_edit(&mut rand, &input);
undo_stack.push(invert_edit(&input, &edit)); undo_stack.push(invert_edit(&input, &edit));
perform_edit(&mut tree, &mut input, &edit).unwrap(); perform_edit(&mut tree, &mut input, &edit).unwrap();
} }
if log_seed { if log_seed {
info!(" {test_index}.{trial:<2} seed: {seed}"); println!(" {test_index}.{trial:<2} seed: {seed}");
} }
if dump_edits { if dump_edits {
@ -244,7 +225,7 @@ pub fn fuzz_language_corpus(
} }
if options.log_graphs { if options.log_graphs {
info!("{}\n", String::from_utf8_lossy(&input)); eprintln!("{}\n", String::from_utf8_lossy(&input));
} }
set_included_ranges(&mut parser, &input, test.template_delimiters); set_included_ranges(&mut parser, &input, test.template_delimiters);
@ -253,7 +234,7 @@ pub fn fuzz_language_corpus(
// Check that the new tree is consistent. // Check that the new tree is consistent.
check_consistent_sizes(&tree2, &input); check_consistent_sizes(&tree2, &input);
if let Err(message) = check_changed_ranges(&tree, &tree2, &input) { if let Err(message) = check_changed_ranges(&tree, &tree2, &input) {
error!("\nUnexpected scope change in seed {seed} with start seed {start_seed}\n{message}\n\n",); println!("\nUnexpected scope change in seed {seed} with start seed {start_seed}\n{message}\n\n",);
return false; return false;
} }
@ -262,7 +243,7 @@ pub fn fuzz_language_corpus(
perform_edit(&mut tree2, &mut input, &edit).unwrap(); perform_edit(&mut tree2, &mut input, &edit).unwrap();
} }
if options.log_graphs { if options.log_graphs {
info!("{}\n", String::from_utf8_lossy(&input)); eprintln!("{}\n", String::from_utf8_lossy(&input));
} }
set_included_ranges(&mut parser, &test.input, test.template_delimiters); set_included_ranges(&mut parser, &test.input, test.template_delimiters);
@ -276,8 +257,8 @@ pub fn fuzz_language_corpus(
if actual_output != test.output && !test.error { if actual_output != test.output && !test.error {
println!("Incorrect parse for {test_name} - seed {seed}"); println!("Incorrect parse for {test_name} - seed {seed}");
DiffKey::print(); print_diff_key();
println!("{}", TestDiff::new(&actual_output, &test.output)); print_diff(&actual_output, &test.output, true);
println!(); println!();
return false; return false;
} }
@ -285,13 +266,13 @@ pub fn fuzz_language_corpus(
// Check that the edited tree is consistent. // Check that the edited tree is consistent.
check_consistent_sizes(&tree3, &input); check_consistent_sizes(&tree3, &input);
if let Err(message) = check_changed_ranges(&tree2, &tree3, &input) { if let Err(message) = check_changed_ranges(&tree2, &tree3, &input) {
error!("Unexpected scope change in seed {seed} with start seed {start_seed}\n{message}\n\n"); println!("Unexpected scope change in seed {seed} with start seed {start_seed}\n{message}\n\n");
return false; return false;
} }
true true
}).unwrap_or_else(|e| { }).unwrap_or_else(|e| {
error!("{e}"); eprintln!("Error: {e}");
false false
}); });
@ -303,17 +284,17 @@ pub fn fuzz_language_corpus(
} }
if failure_count != 0 { if failure_count != 0 {
info!("{failure_count} {language_name} corpus tests failed fuzzing"); eprintln!("{failure_count} {language_name} corpus tests failed fuzzing");
} }
skipped.retain(|_, v| *v == 0); skipped.retain(|_, v| *v == 0);
if !skipped.is_empty() { if !skipped.is_empty() {
info!("Non matchable skip definitions:"); println!("Non matchable skip definitions:");
for k in skipped.keys() { for k in skipped.keys() {
info!(" {k}"); println!(" {k}");
} }
panic!("Non matchable skip definitions need to be removed"); panic!("Non matchable skip definitions needs to be removed");
} }
} }
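All of the knobs read at the top of this file come from environment variables. A minimal sketch of how a wrapper might pin a reproducible fuzz run, assuming the variable names shown above; the "cargo test --test corpus_test" invocation is a hypothetical stand-in for whatever harness ends up calling fuzz_language_corpus, and the concrete values are illustrative only:

use std::process::Command;

fn main() -> std::io::Result<()> {
    let status = Command::new("cargo")
        .args(["test", "--test", "corpus_test"]) // hypothetical harness
        .env("TREE_SITTER_SEED", "12345") // fixes START_SEED / new_seed()
        .env("TREE_SITTER_EDITS", "3") // edits applied per trial
        .env("TREE_SITTER_ITERATIONS", "10") // trials per corpus test
        .env("TREE_SITTER_EXAMPLE_INCLUDE", "expressions") // regex filter on test names
        .status()?;
    std::process::exit(status.code().unwrap_or(1));
}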

View file

@ -20,8 +20,8 @@ impl Rand {
} }
pub fn words(&mut self, max_count: usize) -> Vec<u8> { pub fn words(&mut self, max_count: usize) -> Vec<u8> {
let mut result = Vec::new();
let word_count = self.unsigned(max_count); let word_count = self.unsigned(max_count);
let mut result = Vec::with_capacity(2 * word_count);
for i in 0..word_count { for i in 0..word_count {
if i > 0 { if i > 0 {
if self.unsigned(5) == 0 { if self.unsigned(5) == 0 {

View file

@ -3,13 +3,14 @@ use std::{
mem, mem,
}; };
use log::debug; use log::info;
use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap}; use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap};
use crate::{ use crate::generate::{
dedup::split_state_id_groups, dedup::split_state_id_groups,
grammars::{LexicalGrammar, SyntaxGrammar}, grammars::{LexicalGrammar, SyntaxGrammar},
nfa::{CharacterSet, NfaCursor}, nfa::{CharacterSet, NfaCursor},
prepare_grammar::symbol_is_used,
rules::{Symbol, TokenSet}, rules::{Symbol, TokenSet},
tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}, tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
}; };
@ -43,17 +44,15 @@ pub fn build_lex_table(
let tokens = state let tokens = state
.terminal_entries .terminal_entries
.keys() .keys()
.copied()
.chain(state.reserved_words.iter())
.filter_map(|token| { .filter_map(|token| {
if token.is_terminal() { if token.is_terminal() {
if keywords.contains(&token) { if keywords.contains(token) {
syntax_grammar.word_token syntax_grammar.word_token
} else { } else {
Some(token) Some(*token)
} }
} else if token.is_eof() { } else if token.is_eof() {
Some(token) Some(*token)
} else { } else {
None None
} }
@ -95,6 +94,9 @@ pub fn build_lex_table(
let mut large_character_sets = Vec::new(); let mut large_character_sets = Vec::new();
for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() { for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() {
let symbol = Symbol::terminal(variable_ix); let symbol = Symbol::terminal(variable_ix);
if !symbol_is_used(&syntax_grammar.variables, symbol) {
continue;
}
builder.reset(); builder.reset();
builder.add_state_for_tokens(&TokenSet::from_iter([symbol])); builder.add_state_for_tokens(&TokenSet::from_iter([symbol]));
for state in &builder.table.states { for state in &builder.table.states {
@ -176,8 +178,9 @@ impl<'a> LexTableBuilder<'a> {
let (state_id, is_new) = self.add_state(nfa_states, eof_valid); let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
if is_new { if is_new {
debug!( info!(
"entry point state: {state_id}, tokens: {:?}", "entry point state: {}, tokens: {:?}",
state_id,
tokens tokens
.iter() .iter()
.map(|t| &self.lexical_grammar.variables[t.index].name) .map(|t| &self.lexical_grammar.variables[t.index].name)
@ -358,7 +361,9 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
&mut group_ids_by_state_id, &mut group_ids_by_state_id,
1, 1,
lex_states_differ, lex_states_differ,
) {} ) {
continue;
}
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len()); let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id { for state_ids in &state_ids_by_group_id {

View file

@ -1,21 +1,22 @@
use std::{ use std::{
cmp::Ordering, cmp::Ordering,
collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}, collections::{BTreeMap, HashMap, HashSet, VecDeque},
fmt::Write,
hash::BuildHasherDefault, hash::BuildHasherDefault,
}; };
use anyhow::{anyhow, Result};
use indexmap::{map::Entry, IndexMap}; use indexmap::{map::Entry, IndexMap};
use log::warn;
use rustc_hash::FxHasher; use rustc_hash::FxHasher;
use serde::Serialize;
use thiserror::Error;
use super::{ use super::{
item::{ParseItem, ParseItemSet, ParseItemSetCore, ParseItemSetEntry}, item::{ParseItem, ParseItemSet, ParseItemSetCore},
item_set_builder::ParseItemSetBuilder, item_set_builder::ParseItemSetBuilder,
}; };
use crate::{ use crate::generate::{
grammars::{LexicalGrammar, PrecedenceEntry, ReservedWordSetId, SyntaxGrammar, VariableType}, grammars::{
InlinedProductionMap, LexicalGrammar, PrecedenceEntry, SyntaxGrammar, VariableType,
},
node_types::VariableInfo, node_types::VariableInfo,
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet}, rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
tables::{ tables::{
@ -65,208 +66,8 @@ struct ParseTableBuilder<'a> {
parse_table: ParseTable, parse_table: ParseTable,
} }
pub type BuildTableResult<T> = Result<T, ParseTableBuilderError>;
#[derive(Debug, Error, Serialize)]
pub enum ParseTableBuilderError {
#[error("Unresolved conflict for symbol sequence:\n\n{0}")]
Conflict(#[from] ConflictError),
#[error("Extra rules must have unambiguous endings. Conflicting rules: {0}")]
AmbiguousExtra(#[from] AmbiguousExtraError),
#[error(
"The non-terminal rule `{0}` is used in a non-terminal `extra` rule, which is not allowed."
)]
ImproperNonTerminalExtra(String),
#[error("State count `{0}` exceeds the max value {max}.", max=u16::MAX)]
StateCount(usize),
}
#[derive(Default, Debug, Serialize, Error)]
pub struct ConflictError {
pub symbol_sequence: Vec<String>,
pub conflicting_lookahead: String,
pub possible_interpretations: Vec<Interpretation>,
pub possible_resolutions: Vec<Resolution>,
}
#[derive(Default, Debug, Serialize, Error)]
pub struct Interpretation {
pub preceding_symbols: Vec<String>,
pub variable_name: String,
pub production_step_symbols: Vec<String>,
pub step_index: u32,
pub done: bool,
pub conflicting_lookahead: String,
pub precedence: Option<String>,
pub associativity: Option<String>,
}
#[derive(Debug, Serialize)]
pub enum Resolution {
Precedence { symbols: Vec<String> },
Associativity { symbols: Vec<String> },
AddConflict { symbols: Vec<String> },
}
#[derive(Debug, Serialize, Error)]
pub struct AmbiguousExtraError {
pub parent_symbols: Vec<String>,
}
impl std::fmt::Display for ConflictError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for symbol in &self.symbol_sequence {
write!(f, " {symbol}")?;
}
writeln!(f, " • {} …\n", self.conflicting_lookahead)?;
writeln!(f, "Possible interpretations:\n")?;
let mut interpretations = self
.possible_interpretations
.iter()
.map(|i| {
let line = i.to_string();
let prec_line = if let (Some(precedence), Some(associativity)) =
(&i.precedence, &i.associativity)
{
Some(format!(
"(precedence: {precedence}, associativity: {associativity})",
))
} else {
i.precedence
.as_ref()
.map(|precedence| format!("(precedence: {precedence})"))
};
(line, prec_line)
})
.collect::<Vec<_>>();
let max_interpretation_length = interpretations
.iter()
.map(|i| i.0.chars().count())
.max()
.unwrap();
interpretations.sort_unstable();
for (i, (line, prec_suffix)) in interpretations.into_iter().enumerate() {
write!(f, " {}:", i + 1).unwrap();
write!(f, "{line}")?;
if let Some(prec_suffix) = prec_suffix {
write!(
f,
"{:1$}",
"",
max_interpretation_length.saturating_sub(line.chars().count()) + 2
)?;
write!(f, "{prec_suffix}")?;
}
writeln!(f)?;
}
writeln!(f, "\nPossible resolutions:\n")?;
for (i, resolution) in self.possible_resolutions.iter().enumerate() {
writeln!(f, " {}: {resolution}", i + 1)?;
}
Ok(())
}
}
impl std::fmt::Display for Interpretation {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for symbol in &self.preceding_symbols {
write!(f, " {symbol}")?;
}
write!(f, " ({}", self.variable_name)?;
for (i, symbol) in self.production_step_symbols.iter().enumerate() {
if i == self.step_index as usize {
write!(f, "")?;
}
write!(f, " {symbol}")?;
}
write!(f, ")")?;
if self.done {
write!(f, " • {} …", self.conflicting_lookahead)?;
}
Ok(())
}
}
impl std::fmt::Display for Resolution {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Precedence { symbols } => {
write!(f, "Specify a higher precedence in ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, " and ")?;
}
write!(f, "`{symbol}`")?;
}
write!(f, " than in the other rules.")?;
}
Self::Associativity { symbols } => {
write!(f, "Specify a left or right associativity in ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "`{symbol}`")?;
}
}
Self::AddConflict { symbols } => {
write!(f, "Add a conflict for these rules: ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "`{symbol}`")?;
}
}
}
Ok(())
}
}
impl std::fmt::Display for AmbiguousExtraError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for (i, symbol) in self.parent_symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{symbol}")?;
}
Ok(())
}
}
impl<'a> ParseTableBuilder<'a> { impl<'a> ParseTableBuilder<'a> {
fn new( fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
item_set_builder: ParseItemSetBuilder<'a>,
variable_info: &'a [VariableInfo],
) -> Self {
Self {
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
actual_conflicts: syntax_grammar.expected_conflicts.iter().cloned().collect(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
}
fn build(mut self) -> BuildTableResult<(ParseTable, Vec<ParseStateInfo<'a>>)> {
// Ensure that the empty alias sequence has index 0. // Ensure that the empty alias sequence has index 0.
self.parse_table self.parse_table
.production_infos .production_infos
@ -279,13 +80,10 @@ impl<'a> ParseTableBuilder<'a> {
self.add_parse_state( self.add_parse_state(
&Vec::new(), &Vec::new(),
&Vec::new(), &Vec::new(),
ParseItemSet { ParseItemSet::with(std::iter::once((
entries: vec![ParseItemSetEntry { ParseItem::start(),
item: ParseItem::start(), std::iter::once(&Symbol::end()).copied().collect(),
lookaheads: std::iter::once(Symbol::end()).collect(), ))),
following_reserved_word_set: ReservedWordSetId::default(),
}],
},
); );
// Compute the possible item sets for non-terminal extras. // Compute the possible item sets for non-terminal extras.
@ -301,35 +99,25 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_extra_item_sets_by_first_terminal non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap()) .entry(production.first_symbol().unwrap())
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
.insert(ParseItem { .insert(
variable_index: extra_non_terminal.index as u32, ParseItem {
production, variable_index: extra_non_terminal.index as u32,
step_index: 1, production,
has_preceding_inherited_fields: false, step_index: 1,
}) has_preceding_inherited_fields: false,
.lookaheads },
.insert(Symbol::end_of_nonterminal_extra()); &std::iter::once(&Symbol::end_of_nonterminal_extra())
.copied()
.collect(),
);
} }
} }
let non_terminal_sets_len = non_terminal_extra_item_sets_by_first_terminal.len();
self.non_terminal_extra_states
.reserve(non_terminal_sets_len);
self.parse_state_info_by_id.reserve(non_terminal_sets_len);
self.parse_table.states.reserve(non_terminal_sets_len);
self.parse_state_queue.reserve(non_terminal_sets_len);
// Add a state for each starting terminal of a non-terminal extra rule. // Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal { for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
if terminal.is_non_terminal() { self.non_terminal_extra_states
Err(ParseTableBuilderError::ImproperNonTerminalExtra( .push((terminal, self.parse_table.states.len()));
self.symbol_name(&terminal), self.add_parse_state(&Vec::new(), &Vec::new(), item_set);
))?;
}
// Add the parse state, and *then* push the terminal and the state id into the
// list of nonterminal extra states
let state_id = self.add_parse_state(&Vec::new(), &Vec::new(), item_set);
self.non_terminal_extra_states.push((terminal, state_id));
} }
while let Some(entry) = self.parse_state_queue.pop_front() { while let Some(entry) = self.parse_state_queue.pop_front() {
@ -346,21 +134,17 @@ impl<'a> ParseTableBuilder<'a> {
} }
if !self.actual_conflicts.is_empty() { if !self.actual_conflicts.is_empty() {
warn!( println!("Warning: unnecessary conflicts");
"unnecessary conflicts:\n {}", for conflict in &self.actual_conflicts {
&self println!(
.actual_conflicts " {}",
.iter() conflict
.map(|conflict| { .iter()
conflict .map(|symbol| format!("`{}`", self.symbol_name(symbol)))
.iter() .collect::<Vec<_>>()
.map(|symbol| format!("`{}`", self.symbol_name(symbol))) .join(", ")
.collect::<Vec<_>>() );
.join(", ") }
})
.collect::<Vec<_>>()
.join("\n ")
);
} }
Ok((self.parse_table, self.parse_state_info_by_id)) Ok((self.parse_table, self.parse_state_info_by_id))
@ -394,7 +178,6 @@ impl<'a> ParseTableBuilder<'a> {
external_lex_state_id: 0, external_lex_state_id: 0,
terminal_entries: IndexMap::default(), terminal_entries: IndexMap::default(),
nonterminal_entries: IndexMap::default(), nonterminal_entries: IndexMap::default(),
reserved_words: TokenSet::default(),
core_id, core_id,
}); });
self.parse_state_queue.push_back(ParseStateQueueEntry { self.parse_state_queue.push_back(ParseStateQueueEntry {
@ -413,7 +196,7 @@ impl<'a> ParseTableBuilder<'a> {
mut preceding_auxiliary_symbols: AuxiliarySymbolSequence, mut preceding_auxiliary_symbols: AuxiliarySymbolSequence,
state_id: ParseStateId, state_id: ParseStateId,
item_set: &ParseItemSet<'a>, item_set: &ParseItemSet<'a>,
) -> BuildTableResult<()> { ) -> Result<()> {
let mut terminal_successors = BTreeMap::new(); let mut terminal_successors = BTreeMap::new();
let mut non_terminal_successors = BTreeMap::new(); let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new(); let mut lookaheads_with_conflicts = TokenSet::new();
@ -421,18 +204,13 @@ impl<'a> ParseTableBuilder<'a> {
// Each item in the item set contributes to either or a Shift action or a Reduce // Each item in the item set contributes to either or a Shift action or a Reduce
// action in this state. // action in this state.
for ParseItemSetEntry { for (item, lookaheads) in &item_set.entries {
item,
lookaheads,
following_reserved_word_set: reserved_lookaheads,
} in &item_set.entries
{
// If the item is unfinished, then this state has a transition for the item's // If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting // next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set. // item into the successor item set.
if let Some(next_symbol) = item.symbol() { if let Some(next_symbol) = item.symbol() {
let mut successor = item.successor(); let mut successor = item.successor();
let successor_set = if next_symbol.is_non_terminal() { if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index]; let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are // Keep track of where auxiliary non-terminals (repeat symbols) are
@ -461,16 +239,13 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_successors non_terminal_successors
.entry(next_symbol) .entry(next_symbol)
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
.insert(successor, lookaheads);
} else { } else {
terminal_successors terminal_successors
.entry(next_symbol) .entry(next_symbol)
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
}; .insert(successor, lookaheads);
let successor_entry = successor_set.insert(successor); }
successor_entry.lookaheads.insert_all(lookaheads);
successor_entry.following_reserved_word_set = successor_entry
.following_reserved_word_set
.max(*reserved_lookaheads);
} }
// If the item is finished, then add a Reduce action to this state based // If the item is finished, then add a Reduce action to this state based
// on this item. // on this item.
@ -597,7 +372,7 @@ impl<'a> ParseTableBuilder<'a> {
)?; )?;
} }
// Add actions for the grammar's `extra` symbols. // Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id]; let state = &mut self.parse_table.states[state_id];
let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra(); let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra();
@ -609,7 +384,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set let parent_symbols = item_set
.entries .entries
.iter() .iter()
.filter_map(|ParseItemSetEntry { item, .. }| { .filter_map(|(item, _)| {
if !item.is_augmented() && item.step_index > 0 { if !item.is_augmented() && item.step_index > 0 {
Some(item.variable_index) Some(item.variable_index)
} else { } else {
@ -617,18 +392,15 @@ impl<'a> ParseTableBuilder<'a> {
} }
}) })
.collect::<HashSet<_>>(); .collect::<HashSet<_>>();
let parent_symbol_names = parent_symbols let mut message =
.iter() "Extra rules must have unambiguous endings. Conflicting rules: ".to_string();
.map(|&variable_index| { for (i, variable_index) in parent_symbols.iter().enumerate() {
self.syntax_grammar.variables[variable_index as usize] if i > 0 {
.name message += ", ";
.clone() }
}) message += &self.syntax_grammar.variables[*variable_index as usize].name;
.collect::<Vec<_>>(); }
return Err(anyhow!(message));
Err(AmbiguousExtraError {
parent_symbols: parent_symbol_names,
})?;
} }
} }
// Add actions for the start tokens of each non-terminal extra rule. // Add actions for the start tokens of each non-terminal extra rule.
@ -666,30 +438,6 @@ impl<'a> ParseTableBuilder<'a> {
} }
} }
if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
let reserved_word_set_id = item_set
.entries
.iter()
.filter_map(|entry| {
if let Some(next_step) = entry.item.step() {
if next_step.symbol == keyword_capture_token {
Some(next_step.reserved_word_set_id)
} else {
None
}
} else if entry.lookaheads.contains(&keyword_capture_token) {
Some(entry.following_reserved_word_set)
} else {
None
}
})
.max();
if let Some(reserved_word_set_id) = reserved_word_set_id {
state.reserved_words =
self.syntax_grammar.reserved_word_sets[reserved_word_set_id.0].clone();
}
}
Ok(()) Ok(())
} }
@ -701,7 +449,7 @@ impl<'a> ParseTableBuilder<'a> {
preceding_auxiliary_symbols: &[AuxiliarySymbolInfo], preceding_auxiliary_symbols: &[AuxiliarySymbolInfo],
conflicting_lookahead: Symbol, conflicting_lookahead: Symbol,
reduction_info: &ReductionInfo, reduction_info: &ReductionInfo,
) -> BuildTableResult<()> { ) -> Result<()> {
let entry = self.parse_table.states[state_id] let entry = self.parse_table.states[state_id]
.terminal_entries .terminal_entries
.get_mut(&conflicting_lookahead) .get_mut(&conflicting_lookahead)
@ -715,11 +463,8 @@ impl<'a> ParseTableBuilder<'a> {
// precedence, and there can still be SHIFT/REDUCE conflicts. // precedence, and there can still be SHIFT/REDUCE conflicts.
let mut considered_associativity = false; let mut considered_associativity = false;
let mut shift_precedence = Vec::<(&Precedence, Symbol)>::new(); let mut shift_precedence = Vec::<(&Precedence, Symbol)>::new();
let mut conflicting_items = BTreeSet::new(); let mut conflicting_items = HashSet::new();
for ParseItemSetEntry { for (item, lookaheads) in &item_set.entries {
item, lookaheads, ..
} in &item_set.entries
{
if let Some(step) = item.step() { if let Some(step) = item.step() {
if item.step_index > 0 if item.step_index > 0
&& self && self
@ -856,55 +601,93 @@ impl<'a> ParseTableBuilder<'a> {
return Ok(()); return Ok(());
} }
let mut conflict_error = ConflictError::default(); let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols { for symbol in preceding_symbols {
conflict_error write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
.symbol_sequence
.push(self.symbol_name(symbol));
} }
conflict_error.conflicting_lookahead = self.symbol_name(&conflicting_lookahead);
let interpretations = conflicting_items writeln!(
&mut msg,
" • {} …\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
writeln!(&mut msg, "Possible interpretations:\n").unwrap();
let mut interpretations = conflicting_items
.iter() .iter()
.map(|item| { .map(|item| {
let preceding_symbols = preceding_symbols let mut line = String::new();
for preceding_symbol in preceding_symbols
.iter() .iter()
.take(preceding_symbols.len() - item.step_index as usize) .take(preceding_symbols.len() - item.step_index as usize)
.map(|symbol| self.symbol_name(symbol)) {
.collect::<Vec<_>>(); write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
let variable_name = self.syntax_grammar.variables[item.variable_index as usize] write!(
.name &mut line,
.clone(); " ({}",
&self.syntax_grammar.variables[item.variable_index as usize].name
)
.unwrap();
let production_step_symbols = item for (j, step) in item.production.steps.iter().enumerate() {
.production if j as u32 == item.step_index {
.steps write!(&mut line, "").unwrap();
.iter() }
.map(|step| self.symbol_name(&step.symbol)) write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap();
.collect::<Vec<_>>(); }
let precedence = match item.precedence() { write!(&mut line, ")").unwrap();
Precedence::None => None,
_ => Some(item.precedence().to_string()), if item.is_done() {
write!(
&mut line,
" • {} …",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence();
let associativity = item.associativity();
let prec_line = if let Some(associativity) = associativity {
Some(format!(
"(precedence: {precedence}, associativity: {associativity:?})",
))
} else if !precedence.is_none() {
Some(format!("(precedence: {precedence})"))
} else {
None
}; };
let associativity = item.associativity().map(|assoc| format!("{assoc:?}")); (line, prec_line)
Interpretation {
preceding_symbols,
variable_name,
production_step_symbols,
step_index: item.step_index,
done: item.is_done(),
conflicting_lookahead: self.symbol_name(&conflicting_lookahead),
precedence,
associativity,
}
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
conflict_error.possible_interpretations = interpretations;
let max_interpretation_length = interpretations
.iter()
.map(|i| i.0.chars().count())
.max()
.unwrap();
interpretations.sort_unstable();
for (i, (line, prec_suffix)) in interpretations.into_iter().enumerate() {
write!(&mut msg, " {}:", i + 1).unwrap();
msg += &line;
if let Some(prec_suffix) = prec_suffix {
for _ in line.chars().count()..max_interpretation_length {
msg.push(' ');
}
msg += " ";
msg += &prec_suffix;
}
msg.push('\n');
}
let mut resolution_count = 0;
writeln!(&mut msg, "\nPossible resolutions:\n").unwrap();
let mut shift_items = Vec::new(); let mut shift_items = Vec::new();
let mut reduce_items = Vec::new(); let mut reduce_items = Vec::new();
for item in conflicting_items { for item in conflicting_items {
@ -917,57 +700,76 @@ impl<'a> ParseTableBuilder<'a> {
shift_items.sort_unstable(); shift_items.sort_unstable();
reduce_items.sort_unstable(); reduce_items.sort_unstable();
let get_rule_names = |items: &[&ParseItem]| -> Vec<String> { let list_rule_names = |mut msg: &mut String, items: &[&ParseItem]| {
let mut last_rule_id = None; let mut last_rule_id = None;
let mut result = Vec::with_capacity(items.len());
for item in items { for item in items {
if last_rule_id == Some(item.variable_index) { if last_rule_id == Some(item.variable_index) {
continue; continue;
} }
last_rule_id = Some(item.variable_index);
result.push(self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)));
}
result if last_rule_id.is_some() {
write!(&mut msg, " and").unwrap();
}
last_rule_id = Some(item.variable_index);
write!(
msg,
" `{}`",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
}; };
if actual_conflict.len() > 1 { if actual_conflict.len() > 1 {
if !shift_items.is_empty() { if !shift_items.is_empty() {
let names = get_rule_names(&shift_items); resolution_count += 1;
conflict_error write!(
.possible_resolutions &mut msg,
.push(Resolution::Precedence { symbols: names }); " {resolution_count}: Specify a higher precedence in",
)
.unwrap();
list_rule_names(&mut msg, &shift_items);
writeln!(&mut msg, " than in the other rules.").unwrap();
} }
for item in &reduce_items { for item in &reduce_items {
let name = self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)); resolution_count += 1;
conflict_error writeln!(
.possible_resolutions &mut msg,
.push(Resolution::Precedence { " {resolution_count}: Specify a higher precedence in `{}` than in the other rules.",
symbols: vec![name], self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
}); )
.unwrap();
} }
} }
if considered_associativity { if considered_associativity {
let names = get_rule_names(&reduce_items); resolution_count += 1;
conflict_error write!(
.possible_resolutions &mut msg,
.push(Resolution::Associativity { symbols: names }); " {resolution_count}: Specify a left or right associativity in",
)
.unwrap();
list_rule_names(&mut msg, &reduce_items);
writeln!(&mut msg).unwrap();
} }
conflict_error resolution_count += 1;
.possible_resolutions write!(
.push(Resolution::AddConflict { &mut msg,
symbols: actual_conflict " {resolution_count}: Add a conflict for these rules: ",
.iter() )
.map(|s| self.symbol_name(s)) .unwrap();
.collect(), for (i, symbol) in actual_conflict.iter().enumerate() {
}); if i > 0 {
write!(&mut msg, ", ").unwrap();
}
write!(&mut msg, "`{}`", self.symbol_name(symbol)).unwrap();
}
writeln!(&mut msg).unwrap();
self.actual_conflicts.insert(actual_conflict); Err(anyhow!(msg))
Err(conflict_error)?
} }
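The precedence comparison above decides between shifting and reducing: a strictly higher shift precedence wins, a strictly lower one loses, and a tie falls back to the associativity of the rule being reduced. A minimal, self-contained sketch of that decision rule (the enum and function names here are invented for illustration, not the generator's API):

```rust
// Illustrative sketch only: how a table builder might choose between shifting
// and reducing once numeric precedences and an optional associativity are known.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Assoc {
    Left,
    Right,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Decision {
    Shift,
    Reduce,
    Conflict,
}

fn resolve(shift_prec: i32, reduce_prec: i32, reduce_assoc: Option<Assoc>) -> Decision {
    use std::cmp::Ordering;
    match shift_prec.cmp(&reduce_prec) {
        Ordering::Greater => Decision::Shift,
        Ordering::Less => Decision::Reduce,
        // Equal precedence: fall back to the associativity of the rule being reduced.
        Ordering::Equal => match reduce_assoc {
            Some(Assoc::Left) => Decision::Reduce,
            Some(Assoc::Right) => Decision::Shift,
            None => Decision::Conflict,
        },
    }
}

fn main() {
    // `1 - 2 - 3` with left-associative subtraction: reduce `1 - 2` first.
    assert_eq!(resolve(5, 5, Some(Assoc::Left)), Decision::Reduce);
    // `a = b = c` with right-associative assignment: shift and parse `b = c` first.
    assert_eq!(resolve(3, 3, Some(Assoc::Right)), Decision::Shift);
    // No precedence or associativity information: report a conflict.
    assert_eq!(resolve(0, 0, None), Decision::Conflict);
    println!("ok");
}
```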
fn compare_precedence( fn compare_precedence(
@ -1036,7 +838,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set let parent_symbols = item_set
.entries .entries
.iter() .iter()
.filter_map(|ParseItemSetEntry { item, .. }| { .filter_map(|(item, _)| {
let variable_index = item.variable_index as usize; let variable_index = item.variable_index as usize;
if item.symbol() == Some(symbol) if item.symbol() == Some(symbol)
&& !self.syntax_grammar.variables[variable_index].is_auxiliary() && !self.syntax_grammar.variables[variable_index].is_auxiliary()
@ -1124,24 +926,84 @@ impl<'a> ParseTableBuilder<'a> {
if variable.kind == VariableType::Named { if variable.kind == VariableType::Named {
variable.name.clone() variable.name.clone()
} else { } else {
format!("'{}'", variable.name) format!("'{}'", &variable.name)
} }
} }
} }
} }
} }
fn populate_following_tokens(
result: &mut [TokenSet],
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
}
result[extra.index].clone_from(&all_tokens);
}
}
}
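`populate_following_tokens` records, for every terminal, which terminals may come directly after it: for each adjacent pair of steps in a production, every token in LAST(left step) can be followed by every token in FIRST(right step). A toy sketch of the same idea using made-up string tokens and plain `HashSet`s instead of the crate's `Symbol`/`TokenSet` types:

```rust
// Toy sketch: the FIRST/LAST maps and the grammar are hand-written here purely
// for illustration; only the windows-of-two combination step mirrors the code above.
use std::collections::{HashMap, HashSet};

type Token = &'static str;

fn following_tokens(
    productions: &[Vec<&'static str>],
    first: &HashMap<&'static str, HashSet<Token>>,
    last: &HashMap<&'static str, HashSet<Token>>,
) -> HashMap<Token, HashSet<Token>> {
    let mut result: HashMap<Token, HashSet<Token>> = HashMap::new();
    for production in productions {
        for pair in production.windows(2) {
            let (left, right) = (pair[0], pair[1]);
            // Every token that can end `left` may be followed by every token
            // that can begin `right`.
            for &left_token in &last[left] {
                result
                    .entry(left_token)
                    .or_default()
                    .extend(first[right].iter().copied());
            }
        }
    }
    result
}

fn main() {
    // Grammar: expr -> number "+" number | "(" expr ")"
    let productions = vec![
        vec!["number", "+", "number"],
        vec!["(", "expr", ")"],
    ];
    let mut first = HashMap::new();
    let mut last = HashMap::new();
    for t in ["number", "+", "(", ")"] {
        first.insert(t, HashSet::from([t]));
        last.insert(t, HashSet::from([t]));
    }
    first.insert("expr", HashSet::from(["number", "("]));
    last.insert("expr", HashSet::from(["number", ")"]));

    let follows = following_tokens(&productions, &first, &last);
    // "number" can be followed by "+" (first production) and ")" (second production).
    assert_eq!(follows["number"], HashSet::from(["+", ")"]));
    println!("{follows:?}");
}
```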
pub fn build_parse_table<'a>( pub fn build_parse_table<'a>(
syntax_grammar: &'a SyntaxGrammar, syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar, lexical_grammar: &'a LexicalGrammar,
item_set_builder: ParseItemSetBuilder<'a>, inlines: &'a InlinedProductionMap,
variable_info: &'a [VariableInfo], variable_info: &'a [VariableInfo],
) -> BuildTableResult<(ParseTable, Vec<ParseStateInfo<'a>>)> { ) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
ParseTableBuilder::new( let actual_conflicts = syntax_grammar.expected_conflicts.iter().cloned().collect();
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let (table, item_sets) = ParseTableBuilder {
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
item_set_builder, item_set_builder,
variable_info, variable_info,
) non_terminal_extra_states: Vec::new(),
.build() actual_conflicts,
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
.build()?;
Ok((table, following_tokens, item_sets))
} }

View file

@ -1,6 +1,6 @@
use std::fmt; use std::fmt;
use crate::{ use crate::generate::{
grammars::LexicalGrammar, grammars::LexicalGrammar,
rules::Symbol, rules::Symbol,
tables::{ParseStateId, ParseTable}, tables::{ParseStateId, ParseTable},
@ -55,7 +55,7 @@ impl<'a> CoincidentTokenIndex<'a> {
} }
} }
impl fmt::Debug for CoincidentTokenIndex<'_> { impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "CoincidentTokenIndex {{")?; writeln!(f, "CoincidentTokenIndex {{")?;

View file

@ -2,31 +2,30 @@ use std::{
cmp::Ordering, cmp::Ordering,
fmt, fmt,
hash::{Hash, Hasher}, hash::{Hash, Hasher},
sync::LazyLock,
}; };
use crate::{ use lazy_static::lazy_static;
grammars::{
LexicalGrammar, Production, ProductionStep, ReservedWordSetId, SyntaxGrammar, use crate::generate::{
NO_RESERVED_WORDS, grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
},
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet}, rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
}; };
static START_PRODUCTION: LazyLock<Production> = LazyLock::new(|| Production { lazy_static! {
dynamic_precedence: 0, static ref START_PRODUCTION: Production = Production {
steps: vec![ProductionStep { dynamic_precedence: 0,
symbol: Symbol { steps: vec![ProductionStep {
index: 0, symbol: Symbol {
kind: SymbolType::NonTerminal, index: 0,
}, kind: SymbolType::NonTerminal,
precedence: Precedence::None, },
associativity: None, precedence: Precedence::None,
alias: None, associativity: None,
field_name: None, alias: None,
reserved_word_set_id: NO_RESERVED_WORDS, field_name: None,
}], }],
}); };
}
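One column of this hunk defines `START_PRODUCTION` with `std::sync::LazyLock` (stable since Rust 1.80); the other uses the `lazy_static!` macro. A standalone sketch of the `LazyLock` pattern, with an invented `Config` type standing in for `Production`:

```rust
// Minimal sketch of the lazily initialized static pattern; the type and values
// are assumptions for illustration only.
use std::sync::LazyLock;

struct Config {
    name: String,
    retries: u32,
}

static DEFAULT_CONFIG: LazyLock<Config> = LazyLock::new(|| Config {
    name: "default".to_string(),
    retries: 3,
});

fn main() {
    // The initializer runs on first access and the result is cached thereafter.
    assert_eq!(DEFAULT_CONFIG.name, "default");
    assert_eq!(DEFAULT_CONFIG.retries, 3);
}
```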
/// A [`ParseItem`] represents an in-progress match of a single production in a grammar. /// A [`ParseItem`] represents an in-progress match of a single production in a grammar.
#[derive(Clone, Copy, Debug)] #[derive(Clone, Copy, Debug)]
@ -59,14 +58,7 @@ pub struct ParseItem<'a> {
/// to a state in the final parse table. /// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq, Default)] #[derive(Clone, Debug, PartialEq, Eq, Default)]
pub struct ParseItemSet<'a> { pub struct ParseItemSet<'a> {
pub entries: Vec<ParseItemSetEntry<'a>>, pub entries: Vec<(ParseItem<'a>, TokenSet)>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ParseItemSetEntry<'a> {
pub item: ParseItem<'a>,
pub lookaheads: TokenSet,
pub following_reserved_word_set: ReservedWordSetId,
} }
/// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead /// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead
@ -160,31 +152,35 @@ impl<'a> ParseItem<'a> {
} }
impl<'a> ParseItemSet<'a> { impl<'a> ParseItemSet<'a> {
pub fn insert(&mut self, item: ParseItem<'a>) -> &mut ParseItemSetEntry<'a> { pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, TokenSet)>) -> Self {
match self.entries.binary_search_by(|e| e.item.cmp(&item)) { let mut result = Self::default();
for (item, lookaheads) in elements {
result.insert(item, &lookaheads);
}
result
}
pub fn insert(&mut self, item: ParseItem<'a>, lookaheads: &TokenSet) -> &mut TokenSet {
match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) {
Err(i) => { Err(i) => {
self.entries.insert( self.entries.insert(i, (item, lookaheads.clone()));
i, &mut self.entries[i].1
ParseItemSetEntry { }
item, Ok(i) => {
lookaheads: TokenSet::new(), self.entries[i].1.insert_all(lookaheads);
following_reserved_word_set: ReservedWordSetId::default(), &mut self.entries[i].1
},
);
&mut self.entries[i]
} }
Ok(i) => &mut self.entries[i],
} }
} }
pub fn core(&self) -> ParseItemSetCore<'a> { pub fn core(&self) -> ParseItemSetCore<'a> {
ParseItemSetCore { ParseItemSetCore {
entries: self.entries.iter().map(|e| e.item).collect(), entries: self.entries.iter().map(|e| e.0).collect(),
} }
} }
} }
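`ParseItemSet::insert` keeps its entries in a `Vec` sorted by item and uses `binary_search_by` either to locate an existing entry or to learn the index where a new one must be inserted. A generic sketch of that sorted-`Vec` map pattern, using stand-in `u32` keys and `Vec<u32>` values rather than the real `ParseItem`/`TokenSet`:

```rust
// Sketch of the sorted-Vec technique: entries stay ordered by key, so a binary
// search either finds an existing entry to merge into or reports the insertion point.
fn insert_or_merge(entries: &mut Vec<(u32, Vec<u32>)>, key: u32, values: &[u32]) {
    match entries.binary_search_by(|(k, _)| k.cmp(&key)) {
        // Key already present: merge the new values into the existing entry.
        Ok(i) => entries[i].1.extend_from_slice(values),
        // Key absent: insert at the position that keeps the Vec sorted.
        Err(i) => entries.insert(i, (key, values.to_vec())),
    }
}

fn main() {
    let mut entries = Vec::new();
    insert_or_merge(&mut entries, 7, &[1]);
    insert_or_merge(&mut entries, 3, &[2]);
    insert_or_merge(&mut entries, 7, &[3]);
    assert_eq!(entries, vec![(3, vec![2]), (7, vec![1, 3])]);
}
```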
impl fmt::Display for ParseItemDisplay<'_> { impl<'a> fmt::Display for ParseItemDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if self.0.is_augmented() { if self.0.is_augmented() {
write!(f, "START →")?; write!(f, "START →")?;
@ -192,42 +188,35 @@ impl fmt::Display for ParseItemDisplay<'_> {
write!( write!(
f, f,
"{} →", "{} →",
self.1.variables[self.0.variable_index as usize].name &self.1.variables[self.0.variable_index as usize].name
)?; )?;
} }
for (i, step) in self.0.production.steps.iter().enumerate() { for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize { if i == self.0.step_index as usize {
write!(f, "")?; write!(f, "")?;
if !step.precedence.is_none() if let Some(associativity) = step.associativity {
|| step.associativity.is_some() if step.precedence.is_none() {
|| step.reserved_word_set_id != ReservedWordSetId::default() write!(f, " ({associativity:?})")?;
{ } else {
write!(f, " (")?; write!(f, " ({} {associativity:?})", step.precedence)?;
if !step.precedence.is_none() {
write!(f, " {}", step.precedence)?;
} }
if let Some(associativity) = step.associativity { } else if !step.precedence.is_none() {
write!(f, " {associativity:?}")?; write!(f, " ({})", step.precedence)?;
}
if step.reserved_word_set_id != ReservedWordSetId::default() {
write!(f, "reserved: {}", step.reserved_word_set_id)?;
}
write!(f, " )")?;
} }
} }
write!(f, " ")?; write!(f, " ")?;
if step.symbol.is_terminal() { if step.symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(step.symbol.index) { if let Some(variable) = self.2.variables.get(step.symbol.index) {
write!(f, "{}", variable.name)?; write!(f, "{}", &variable.name)?;
} else { } else {
write!(f, "terminal-{}", step.symbol.index)?; write!(f, "terminal-{}", step.symbol.index)?;
} }
} else if step.symbol.is_external() { } else if step.symbol.is_external() {
write!(f, "{}", self.1.external_tokens[step.symbol.index].name)?; write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
} else { } else {
write!(f, "{}", self.1.variables[step.symbol.index].name)?; write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
} }
if let Some(alias) = &step.alias { if let Some(alias) = &step.alias {
@ -254,33 +243,7 @@ impl fmt::Display for ParseItemDisplay<'_> {
} }
} }
const fn escape_invisible(c: char) -> Option<&'static str> { impl<'a> fmt::Display for TokenSetDisplay<'a> {
Some(match c {
'\n' => "\\n",
'\r' => "\\r",
'\t' => "\\t",
'\0' => "\\0",
'\\' => "\\\\",
'\x0b' => "\\v",
'\x0c' => "\\f",
_ => return None,
})
}
fn display_variable_name(source: &str) -> String {
source
.chars()
.fold(String::with_capacity(source.len()), |mut acc, c| {
if let Some(esc) = escape_invisible(c) {
acc.push_str(esc);
} else {
acc.push(c);
}
acc
})
}
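The `escape_invisible` and `display_variable_name` helpers added in one column render control characters in token names as escape sequences when item sets are printed. Copied here as a standalone snippet with an assumed `main`, so the behavior can be checked in isolation:

```rust
// Self-contained copy of the helpers above plus a small usage check: invisible
// characters in token names are rendered as their escape sequences.
const fn escape_invisible(c: char) -> Option<&'static str> {
    Some(match c {
        '\n' => "\\n",
        '\r' => "\\r",
        '\t' => "\\t",
        '\0' => "\\0",
        '\\' => "\\\\",
        '\x0b' => "\\v",
        '\x0c' => "\\f",
        _ => return None,
    })
}

fn display_variable_name(source: &str) -> String {
    source
        .chars()
        .fold(String::with_capacity(source.len()), |mut acc, c| {
            if let Some(esc) = escape_invisible(c) {
                acc.push_str(esc);
            } else {
                acc.push(c);
            }
            acc
        })
}

fn main() {
    assert_eq!(display_variable_name("\n"), "\\n");
    assert_eq!(display_variable_name("a\tb"), "a\\tb");
    println!("{}", display_variable_name("line\nbreak"));
}
```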
impl fmt::Display for TokenSetDisplay<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "[")?; write!(f, "[")?;
for (i, symbol) in self.0.iter().enumerate() { for (i, symbol) in self.0.iter().enumerate() {
@ -290,14 +253,14 @@ impl fmt::Display for TokenSetDisplay<'_> {
if symbol.is_terminal() { if symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(symbol.index) { if let Some(variable) = self.2.variables.get(symbol.index) {
write!(f, "{}", display_variable_name(&variable.name))?; write!(f, "{}", &variable.name)?;
} else { } else {
write!(f, "terminal-{}", symbol.index)?; write!(f, "terminal-{}", symbol.index)?;
} }
} else if symbol.is_external() { } else if symbol.is_external() {
write!(f, "{}", self.1.external_tokens[symbol.index].name)?; write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
} else { } else {
write!(f, "{}", self.1.variables[symbol.index].name)?; write!(f, "{}", &self.1.variables[symbol.index].name)?;
} }
} }
write!(f, "]")?; write!(f, "]")?;
@ -305,29 +268,21 @@ impl fmt::Display for TokenSetDisplay<'_> {
} }
} }
impl fmt::Display for ParseItemSetDisplay<'_> { impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for entry in &self.0.entries { for (item, lookaheads) in &self.0.entries {
write!( writeln!(
f, f,
"{}\t{}", "{}\t{}",
ParseItemDisplay(&entry.item, self.1, self.2), ParseItemDisplay(item, self.1, self.2),
TokenSetDisplay(&entry.lookaheads, self.1, self.2), TokenSetDisplay(lookaheads, self.1, self.2)
)?; )?;
if entry.following_reserved_word_set != ReservedWordSetId::default() {
write!(
f,
"\treserved word set: {}",
entry.following_reserved_word_set
)?;
}
writeln!(f)?;
} }
Ok(()) Ok(())
} }
} }
impl Hash for ParseItem<'_> { impl<'a> Hash for ParseItem<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_u32(self.variable_index); hasher.write_u32(self.variable_index);
hasher.write_u32(self.step_index); hasher.write_u32(self.step_index);
@ -341,7 +296,7 @@ impl Hash for ParseItem<'_> {
// this item, unless any of the following are true: // this item, unless any of the following are true:
// * the children have fields // * the children have fields
// * the children have aliases // * the children have aliases
// * the children are hidden and represent rules that have fields. // * the children are hidden and
// See the docs for `has_preceding_inherited_fields`. // See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] { for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher); step.alias.hash(hasher);
@ -356,7 +311,7 @@ impl Hash for ParseItem<'_> {
} }
} }
impl PartialEq for ParseItem<'_> { impl<'a> PartialEq for ParseItem<'a> {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
if self.variable_index != other.variable_index if self.variable_index != other.variable_index
|| self.step_index != other.step_index || self.step_index != other.step_index
@ -393,7 +348,7 @@ impl PartialEq for ParseItem<'_> {
} }
} }
impl Ord for ParseItem<'_> { impl<'a> Ord for ParseItem<'a> {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
self.step_index self.step_index
.cmp(&other.step_index) .cmp(&other.step_index)
@ -433,26 +388,25 @@ impl Ord for ParseItem<'_> {
} }
} }
impl PartialOrd for ParseItem<'_> { impl<'a> PartialOrd for ParseItem<'a> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl Eq for ParseItem<'_> {} impl<'a> Eq for ParseItem<'a> {}
impl Hash for ParseItemSet<'_> { impl<'a> Hash for ParseItemSet<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len()); hasher.write_usize(self.entries.len());
for entry in &self.entries { for (item, lookaheads) in &self.entries {
entry.item.hash(hasher); item.hash(hasher);
entry.lookaheads.hash(hasher); lookaheads.hash(hasher);
entry.following_reserved_word_set.hash(hasher);
} }
} }
} }
impl Hash for ParseItemSetCore<'_> { impl<'a> Hash for ParseItemSetCore<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len()); hasher.write_usize(self.entries.len());
for item in &self.entries { for item in &self.entries {

View file

@ -3,9 +3,9 @@ use std::{
fmt, fmt,
}; };
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, ParseItemSetEntry, TokenSetDisplay}; use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay};
use crate::{ use crate::generate::{
grammars::{InlinedProductionMap, LexicalGrammar, ReservedWordSetId, SyntaxGrammar}, grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
rules::{Symbol, SymbolType, TokenSet}, rules::{Symbol, SymbolType, TokenSet},
}; };
@ -15,10 +15,9 @@ struct TransitiveClosureAddition<'a> {
info: FollowSetInfo, info: FollowSetInfo,
} }
#[derive(Clone, Debug, Default, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
struct FollowSetInfo { struct FollowSetInfo {
lookaheads: TokenSet, lookaheads: TokenSet,
reserved_lookaheads: ReservedWordSetId,
propagates_lookaheads: bool, propagates_lookaheads: bool,
} }
@ -26,7 +25,6 @@ pub struct ParseItemSetBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar, syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar, lexical_grammar: &'a LexicalGrammar,
first_sets: HashMap<Symbol, TokenSet>, first_sets: HashMap<Symbol, TokenSet>,
reserved_first_sets: HashMap<Symbol, ReservedWordSetId>,
last_sets: HashMap<Symbol, TokenSet>, last_sets: HashMap<Symbol, TokenSet>,
inlines: &'a InlinedProductionMap, inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>, transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
@ -48,7 +46,6 @@ impl<'a> ParseItemSetBuilder<'a> {
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
first_sets: HashMap::new(), first_sets: HashMap::new(),
reserved_first_sets: HashMap::new(),
last_sets: HashMap::new(), last_sets: HashMap::new(),
inlines, inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
@ -57,7 +54,8 @@ impl<'a> ParseItemSetBuilder<'a> {
// For each grammar symbol, populate the FIRST and LAST sets: the set of // For each grammar symbol, populate the FIRST and LAST sets: the set of
// terminals that appear at the beginning and end that symbol's productions, // terminals that appear at the beginning and end that symbol's productions,
// respectively. // respectively.
// For a terminal symbol, the FIRST and LAST sets just consist of the //
// For a terminal symbol, the FIRST and LAST set just consists of the
// terminal itself. // terminal itself.
for i in 0..lexical_grammar.variables.len() { for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i); let symbol = Symbol::terminal(i);
@ -65,9 +63,6 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol); set.insert(symbol);
result.first_sets.insert(symbol, set.clone()); result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set); result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
} }
for i in 0..syntax_grammar.external_tokens.len() { for i in 0..syntax_grammar.external_tokens.len() {
@ -76,15 +71,12 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol); set.insert(symbol);
result.first_sets.insert(symbol, set.clone()); result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set); result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
} }
// The FIRST set of a non-terminal `i` is the union of the FIRST sets // The FIRST set of a non-terminal `i` is the union of the following sets:
// of all the symbols that appear at the beginnings of i's productions. Some // * the set of all terminals that appear at the beginnings of i's productions
// of these symbols may themselves be non-terminals, so this is a recursive // * the FIRST sets of all the non-terminals that appear at the beginnings of i's
// definition. // productions
// //
// Rather than computing these sets using recursion, we use an explicit stack // Rather than computing these sets using recursion, we use an explicit stack
// called `symbols_to_process`. // called `symbols_to_process`.
@ -92,36 +84,37 @@ impl<'a> ParseItemSetBuilder<'a> {
let mut processed_non_terminals = HashSet::new(); let mut processed_non_terminals = HashSet::new();
for i in 0..syntax_grammar.variables.len() { for i in 0..syntax_grammar.variables.len() {
let symbol = Symbol::non_terminal(i); let symbol = Symbol::non_terminal(i);
let first_set = result.first_sets.entry(symbol).or_default();
let reserved_first_set = result.reserved_first_sets.entry(symbol).or_default();
let first_set = result
.first_sets
.entry(symbol)
.or_insert_with(TokenSet::new);
processed_non_terminals.clear(); processed_non_terminals.clear();
symbols_to_process.clear(); symbols_to_process.clear();
symbols_to_process.push(symbol); symbols_to_process.push(symbol);
while let Some(sym) = symbols_to_process.pop() { while let Some(current_symbol) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions { if current_symbol.is_terminal() || current_symbol.is_external() {
if let Some(step) = production.steps.first() { first_set.insert(current_symbol);
if step.symbol.is_terminal() || step.symbol.is_external() { } else if processed_non_terminals.insert(current_symbol) {
first_set.insert(step.symbol); for production in &syntax_grammar.variables[current_symbol.index].productions {
} else if processed_non_terminals.insert(step.symbol) { if let Some(step) = production.steps.first() {
symbols_to_process.push(step.symbol); symbols_to_process.push(step.symbol);
} }
*reserved_first_set = (*reserved_first_set).max(step.reserved_word_set_id);
} }
} }
} }
// The LAST set is defined in a similar way to the FIRST set. // The LAST set is defined in a similar way to the FIRST set.
let last_set = result.last_sets.entry(symbol).or_default(); let last_set = result.last_sets.entry(symbol).or_insert_with(TokenSet::new);
processed_non_terminals.clear(); processed_non_terminals.clear();
symbols_to_process.clear(); symbols_to_process.clear();
symbols_to_process.push(symbol); symbols_to_process.push(symbol);
while let Some(sym) = symbols_to_process.pop() { while let Some(current_symbol) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions { if current_symbol.is_terminal() || current_symbol.is_external() {
if let Some(step) = production.steps.last() { last_set.insert(current_symbol);
if step.symbol.is_terminal() || step.symbol.is_external() { } else if processed_non_terminals.insert(current_symbol) {
last_set.insert(step.symbol); for production in &syntax_grammar.variables[current_symbol.index].productions {
} else if processed_non_terminals.insert(step.symbol) { if let Some(step) = production.steps.last() {
symbols_to_process.push(step.symbol); symbols_to_process.push(step.symbol);
} }
} }
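The loops above compute each symbol's FIRST and LAST sets with an explicit work stack instead of recursion. A toy illustration of the FIRST-set half, using an invented string-keyed grammar rather than the generator's `Symbol` type (the grammar's keys are the non-terminals; anything else is treated as a terminal):

```rust
// Toy sketch of iterative FIRST-set computation with an explicit stack; the
// grammar representation here is an assumption made for the example.
use std::collections::{HashMap, HashSet};

fn first_set(
    grammar: &HashMap<&'static str, Vec<Vec<&'static str>>>,
    symbol: &'static str,
) -> HashSet<&'static str> {
    let mut first = HashSet::new();
    let mut processed = HashSet::new();
    let mut stack = vec![symbol];
    while let Some(sym) = stack.pop() {
        if !processed.insert(sym) {
            continue;
        }
        for production in &grammar[sym] {
            if let Some(&step) = production.first() {
                if grammar.contains_key(step) {
                    // Non-terminal: expand it in turn.
                    stack.push(step);
                } else {
                    // Terminal: it can begin an expansion of `symbol`.
                    first.insert(step);
                }
            }
        }
    }
    first
}

fn main() {
    let mut grammar = HashMap::new();
    grammar.insert("Expr", vec![vec!["Term", "+", "Term"], vec!["Term"]]);
    grammar.insert("Term", vec![vec!["number"], vec!["(", "Expr", ")"]]);

    // FIRST(Expr) = FIRST(Term) = { number, ( }
    assert_eq!(first_set(&grammar, "Expr"), HashSet::from(["number", "("]));
}
```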
@ -131,75 +124,67 @@ impl<'a> ParseItemSetBuilder<'a> {
// To compute an item set's transitive closure, we find each item in the set // To compute an item set's transitive closure, we find each item in the set
// whose next symbol is a non-terminal, and we add new items to the set for // whose next symbol is a non-terminal, and we add new items to the set for
// each of that symbol's productions. These productions might themselves begin // each of that symbol's productions. These productions might themselves begin
// with non-terminals, so the process continues recursively. In this process, // with non-terminals, so the process continues recursively. In this process,
// the total set of entries that get added depends only on two things: // the total set of entries that get added depends only on two things:
// // * the set of non-terminal symbols that occur at each item's current position
// * the non-terminal symbol that occurs next in each item // * the set of terminals that occur after each of these non-terminal symbols
//
// * the set of terminals that can follow that non-terminal symbol in the item
// //
// So we can avoid a lot of duplicated recursive work by precomputing, for each // So we can avoid a lot of duplicated recursive work by precomputing, for each
// non-terminal symbol `i`, a final list of *additions* that must be made to an // non-terminal symbol `i`, a final list of *additions* that must be made to an
// item set when symbol `i` occurs as the next symbol in one of its core items. // item set when `i` occurs as the next symbol in one of its core items. The
// The structure of a precomputed *addition* is as follows: // structure of an *addition* is as follows:
// // * `item` - the new item that must be added as part of the expansion of `i`
// * `item` - the new item that must be added as part of the expansion of the symbol `i`. // * `lookaheads` - lookahead tokens that can always come after that item in the expansion
// // of `i`
// * `lookaheads` - the set of possible lookahead tokens that can always come after `item`
// in an expansion of symbol `i`.
//
// * `reserved_lookaheads` - the set of reserved lookahead tokens that can
// always come after `item` in the expansion of symbol `i`.
//
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the // * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
// *end* of the expansion of symbol `i`, so that i's own current lookahead tokens can // *end* of the expansion of `i`, so that i's own current lookahead tokens can occur
// occur after `item`. // after `item`.
// //
// Rather than computing these additions recursively, we use an explicit stack. // Again, rather than computing these additions recursively, we use an explicit
let empty_lookaheads = TokenSet::new(); // stack called `entries_to_process`.
let mut stack = Vec::new();
let mut follow_set_info_by_non_terminal = HashMap::<usize, FollowSetInfo>::new();
for i in 0..syntax_grammar.variables.len() { for i in 0..syntax_grammar.variables.len() {
let empty_lookaheads = TokenSet::new();
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
// First, build up a map whose keys are all of the non-terminals that can // First, build up a map whose keys are all of the non-terminals that can
// appear at the beginning of non-terminal `i`, and whose values store // appear at the beginning of non-terminal `i`, and whose values store
// information about the tokens that can follow those non-terminals. // information about the tokens that can follow each non-terminal.
stack.clear(); let mut follow_set_info_by_non_terminal = HashMap::new();
stack.push((i, &empty_lookaheads, ReservedWordSetId::default(), true)); while let Some(entry) = entries_to_process.pop() {
follow_set_info_by_non_terminal.clear(); let (variable_index, lookaheads, propagates_lookaheads) = entry;
while let Some((sym_ix, lookaheads, reserved_word_set_id, propagates_lookaheads)) = let existing_info = follow_set_info_by_non_terminal
stack.pop() .entry(variable_index)
{ .or_insert_with(|| FollowSetInfo {
let mut did_add = false; lookaheads: TokenSet::new(),
let info = follow_set_info_by_non_terminal.entry(sym_ix).or_default(); propagates_lookaheads: false,
did_add |= info.lookaheads.insert_all(lookaheads); });
if reserved_word_set_id > info.reserved_lookaheads {
info.reserved_lookaheads = reserved_word_set_id; let did_add_follow_set_info;
did_add = true; if propagates_lookaheads {
} did_add_follow_set_info = !existing_info.propagates_lookaheads;
did_add |= propagates_lookaheads && !info.propagates_lookaheads; existing_info.propagates_lookaheads = true;
info.propagates_lookaheads |= propagates_lookaheads; } else {
if !did_add { did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
continue;
} }
for production in &syntax_grammar.variables[sym_ix].productions { if did_add_follow_set_info {
if let Some(symbol) = production.first_symbol() { for production in &syntax_grammar.variables[variable_index].productions {
if symbol.is_non_terminal() { if let Some(symbol) = production.first_symbol() {
if let Some(next_step) = production.steps.get(1) { if symbol.is_non_terminal() {
stack.push(( if production.steps.len() == 1 {
symbol.index, entries_to_process.push((
&result.first_sets[&next_step.symbol], symbol.index,
result.reserved_first_sets[&next_step.symbol], lookaheads,
false, propagates_lookaheads,
)); ));
} else { } else {
stack.push(( entries_to_process.push((
symbol.index, symbol.index,
lookaheads, &result.first_sets[&production.steps[1].symbol],
reserved_word_set_id, false,
propagates_lookaheads, ));
)); }
} }
} }
} }
@ -209,7 +194,7 @@ impl<'a> ParseItemSetBuilder<'a> {
// Store all of those non-terminals' productions, along with their associated // Store all of those non-terminals' productions, along with their associated
// lookahead info, as *additions* associated with non-terminal `i`. // lookahead info, as *additions* associated with non-terminal `i`.
let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (&variable_index, follow_set_info) in &follow_set_info_by_non_terminal { for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index]; let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index); let non_terminal = Symbol::non_terminal(variable_index);
let variable_index = variable_index as u32; let variable_index = variable_index as u32;
@ -254,23 +239,20 @@ impl<'a> ParseItemSetBuilder<'a> {
pub fn transitive_closure(&self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> { pub fn transitive_closure(&self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default(); let mut result = ParseItemSet::default();
for entry in &item_set.entries { for (item, lookaheads) in &item_set.entries {
if let Some(productions) = self if let Some(productions) = self
.inlines .inlines
.inlined_productions(entry.item.production, entry.item.step_index) .inlined_productions(item.production, item.step_index)
{ {
for production in productions { for production in productions {
self.add_item( self.add_item(
&mut result, &mut result,
&ParseItemSetEntry { item.substitute_production(production),
item: entry.item.substitute_production(production), lookaheads,
lookaheads: entry.lookaheads.clone(),
following_reserved_word_set: entry.following_reserved_word_set,
},
); );
} }
} else { } else {
self.add_item(&mut result, entry); self.add_item(&mut result, *item, lookaheads);
} }
} }
result result
@ -280,68 +262,34 @@ impl<'a> ParseItemSetBuilder<'a> {
&self.first_sets[symbol] &self.first_sets[symbol]
} }
pub fn reserved_first_set(&self, symbol: &Symbol) -> Option<&TokenSet> {
let id = *self.reserved_first_sets.get(symbol)?;
Some(&self.syntax_grammar.reserved_word_sets[id.0])
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet { pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.last_sets[symbol] &self.last_sets[symbol]
} }
fn add_item(&self, set: &mut ParseItemSet<'a>, entry: &ParseItemSetEntry<'a>) { fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
if let Some(step) = entry.item.step() { if let Some(step) = item.step() {
if step.symbol.is_non_terminal() { if step.symbol.is_non_terminal() {
let next_step = entry.item.successor().step(); let next_step = item.successor().step();
// Determine which tokens can follow this non-terminal. // Determine which tokens can follow this non-terminal.
let (following_tokens, following_reserved_tokens) = let following_tokens = next_step.map_or(lookaheads, |next_step| {
if let Some(next_step) = next_step { self.first_sets.get(&next_step.symbol).unwrap()
( });
self.first_sets.get(&next_step.symbol).unwrap(),
*self.reserved_first_sets.get(&next_step.symbol).unwrap(),
)
} else {
(&entry.lookaheads, entry.following_reserved_word_set)
};
// Use the pre-computed *additions* to expand the non-terminal. // Use the pre-computed *additions* to expand the non-terminal.
for addition in &self.transitive_closure_additions[step.symbol.index] { for addition in &self.transitive_closure_additions[step.symbol.index] {
let entry = set.insert(addition.item); let lookaheads = set.insert(addition.item, &addition.info.lookaheads);
entry.lookaheads.insert_all(&addition.info.lookaheads);
if let Some(word_token) = self.syntax_grammar.word_token {
if addition.info.lookaheads.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(addition.info.reserved_lookaheads);
}
}
if addition.info.propagates_lookaheads { if addition.info.propagates_lookaheads {
entry.lookaheads.insert_all(following_tokens); lookaheads.insert_all(following_tokens);
if let Some(word_token) = self.syntax_grammar.word_token {
if following_tokens.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(following_reserved_tokens);
}
}
} }
} }
} }
} }
set.insert(item, lookaheads);
let e = set.insert(entry.item);
e.lookaheads.insert_all(&entry.lookaheads);
e.following_reserved_word_set = e
.following_reserved_word_set
.max(entry.following_reserved_word_set);
} }
} }
impl fmt::Debug for ParseItemSetBuilder<'_> { impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "ParseItemSetBuilder {{")?; writeln!(f, "ParseItemSetBuilder {{")?;

View file

@ -3,15 +3,14 @@ use std::{
mem, mem,
}; };
use log::debug; use log::info;
use super::token_conflicts::TokenConflictMap; use super::token_conflicts::TokenConflictMap;
use crate::{ use crate::generate::{
dedup::split_state_id_groups, dedup::split_state_id_groups,
grammars::{LexicalGrammar, SyntaxGrammar, VariableType}, grammars::{LexicalGrammar, SyntaxGrammar, VariableType},
rules::{AliasMap, Symbol, TokenSet}, rules::{AliasMap, Symbol, TokenSet},
tables::{GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry}, tables::{GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry},
OptLevel,
}; };
pub fn minimize_parse_table( pub fn minimize_parse_table(
@ -21,7 +20,6 @@ pub fn minimize_parse_table(
simple_aliases: &AliasMap, simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap, token_conflict_map: &TokenConflictMap,
keywords: &TokenSet, keywords: &TokenSet,
optimizations: OptLevel,
) { ) {
let mut minimizer = Minimizer { let mut minimizer = Minimizer {
parse_table, parse_table,
@ -31,9 +29,7 @@ pub fn minimize_parse_table(
keywords, keywords,
simple_aliases, simple_aliases,
}; };
if optimizations.contains(OptLevel::MergeStates) { minimizer.merge_compatible_states();
minimizer.merge_compatible_states();
}
minimizer.remove_unit_reductions(); minimizer.remove_unit_reductions();
minimizer.remove_unused_states(); minimizer.remove_unused_states();
minimizer.reorder_states_by_descending_size(); minimizer.reorder_states_by_descending_size();
@ -48,7 +44,7 @@ struct Minimizer<'a> {
simple_aliases: &'a AliasMap, simple_aliases: &'a AliasMap,
} }
impl Minimizer<'_> { impl<'a> Minimizer<'a> {
fn remove_unit_reductions(&mut self) { fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new(); let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables { for variable in &self.syntax_grammar.variables {
@ -74,17 +70,18 @@ impl Minimizer<'_> {
production_id: 0, production_id: 0,
symbol, symbol,
.. ..
} if !self.simple_aliases.contains_key(symbol) } => {
&& !self.syntax_grammar.supertype_symbols.contains(symbol) if !self.simple_aliases.contains_key(symbol)
&& !self.syntax_grammar.extra_symbols.contains(symbol) && !self.syntax_grammar.supertype_symbols.contains(symbol)
&& !aliased_symbols.contains(symbol) && !aliased_symbols.contains(symbol)
&& self.syntax_grammar.variables[symbol.index].kind && self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named != VariableType::Named
&& (unit_reduction_symbol.is_none() && (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol)) => || unit_reduction_symbol == Some(symbol))
{ {
unit_reduction_symbol = Some(symbol); unit_reduction_symbol = Some(symbol);
continue; continue;
}
} }
_ => {} _ => {}
} }
@ -155,7 +152,9 @@ impl Minimizer<'_> {
&mut group_ids_by_state_id, &mut group_ids_by_state_id,
0, 0,
|left, right, groups| self.state_successors_differ(left, right, groups), |left, right, groups| self.state_successors_differ(left, right, groups),
) {} ) {
continue;
}
let error_group_index = state_ids_by_group_id let error_group_index = state_ids_by_group_id
.iter() .iter()
@ -172,12 +171,17 @@ impl Minimizer<'_> {
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len()); let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id { for state_ids in &state_ids_by_group_id {
// Initialize the new state based on the first old state in the group. // Initialize the new state based on the first old state in the group.
let mut parse_state = mem::take(&mut self.parse_table.states[state_ids[0]]); let mut parse_state = ParseState::default();
mem::swap(&mut parse_state, &mut self.parse_table.states[state_ids[0]]);
// Extend the new state with all of the actions from the other old states // Extend the new state with all of the actions from the other old states
// in the group. // in the group.
for state_id in &state_ids[1..] { for state_id in &state_ids[1..] {
let other_parse_state = mem::take(&mut self.parse_table.states[*state_id]); let mut other_parse_state = ParseState::default();
mem::swap(
&mut other_parse_state,
&mut self.parse_table.states[*state_id],
);
parse_state parse_state
.terminal_entries .terminal_entries
@ -185,12 +189,6 @@ impl Minimizer<'_> {
parse_state parse_state
.nonterminal_entries .nonterminal_entries
.extend(other_parse_state.nonterminal_entries); .extend(other_parse_state.nonterminal_entries);
parse_state
.reserved_words
.insert_all(&other_parse_state.reserved_words);
for symbol in parse_state.terminal_entries.keys() {
parse_state.reserved_words.remove(symbol);
}
} }
// Update the new state's outgoing references using the new grouping. // Update the new state's outgoing references using the new grouping.
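One column of this hunk moves a `ParseState` out of the table with `mem::take`; the other spells the same move as `mem::swap` against a freshly constructed default. A small demonstration of the equivalence with plain `Vec`s:

```rust
// Both moves leave `Default::default()` behind in the slot; `mem::take` is the
// one-call form of the swap-with-default pattern.
use std::mem;

fn main() {
    let mut states = vec![vec![1, 2, 3], vec![4, 5]];

    // `mem::take` moves the Vec out and leaves an empty Vec in its place.
    let first = mem::take(&mut states[0]);
    assert_eq!(first, vec![1, 2, 3]);
    assert!(states[0].is_empty());

    // The same effect, spelled with `mem::swap` and an explicit default value.
    let mut second = Vec::default();
    mem::swap(&mut second, &mut states[1]);
    assert_eq!(second, vec![4, 5]);
    assert!(states[1].is_empty());
}
```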
@ -219,14 +217,24 @@ impl Minimizer<'_> {
) { ) {
return true; return true;
} }
} else if self.token_conflicts(left_state.id, right_state.id, right_state, *token) { } else if self.token_conflicts(
left_state.id,
right_state.id,
right_state.terminal_entries.keys(),
*token,
) {
return true; return true;
} }
} }
for token in right_state.terminal_entries.keys() { for token in right_state.terminal_entries.keys() {
if !left_state.terminal_entries.contains_key(token) if !left_state.terminal_entries.contains_key(token)
&& self.token_conflicts(left_state.id, right_state.id, left_state, *token) && self.token_conflicts(
left_state.id,
right_state.id,
left_state.terminal_entries.keys(),
*token,
)
{ {
return true; return true;
} }
@ -248,7 +256,7 @@ impl Minimizer<'_> {
let group1 = group_ids_by_state_id[*s1]; let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2]; let group2 = group_ids_by_state_id[*s2];
if group1 != group2 { if group1 != group2 {
debug!( info!(
"split states {} {} - successors for {} are split: {s1} {s2}", "split states {} {} - successors for {} are split: {s1} {s2}",
state1.id, state1.id,
state2.id, state2.id,
@ -264,12 +272,12 @@ impl Minimizer<'_> {
for (symbol, s1) in &state1.nonterminal_entries { for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) { if let Some(s2) = state2.nonterminal_entries.get(symbol) {
match (s1, s2) { match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => {} (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => { (GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1]; let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2]; let group2 = group_ids_by_state_id[*s2];
if group1 != group2 { if group1 != group2 {
debug!( info!(
"split states {} {} - successors for {} are split: {s1} {s2}", "split states {} {} - successors for {} are split: {s1} {s2}",
state1.id, state1.id,
state2.id, state2.id,
@ -299,14 +307,16 @@ impl Minimizer<'_> {
let actions1 = &entry1.actions; let actions1 = &entry1.actions;
let actions2 = &entry2.actions; let actions2 = &entry2.actions;
if actions1.len() != actions2.len() { if actions1.len() != actions2.len() {
debug!( info!(
"split states {state_id1} {state_id2} - differing action counts for token {}", "split states {state_id1} {state_id2} - differing action counts for token {}",
self.symbol_name(token) self.symbol_name(token)
); );
return true; return true;
} }
for (action1, action2) in actions1.iter().zip(actions2.iter()) { for (i, action1) in actions1.iter().enumerate() {
let action2 = &actions2[i];
// Two shift actions are equivalent if their destinations are in the same group. // Two shift actions are equivalent if their destinations are in the same group.
if let ( if let (
ParseAction::Shift { ParseAction::Shift {
@ -324,13 +334,13 @@ impl Minimizer<'_> {
if group1 == group2 && is_repetition1 == is_repetition2 { if group1 == group2 && is_repetition1 == is_repetition2 {
continue; continue;
} }
debug!( info!(
"split states {state_id1} {state_id2} - successors for {} are split: {s1} {s2}", "split states {state_id1} {state_id2} - successors for {} are split: {s1} {s2}",
self.symbol_name(token), self.symbol_name(token),
); );
return true; return true;
} else if action1 != action2 { } else if action1 != action2 {
debug!( info!(
"split states {state_id1} {state_id2} - unequal actions for {}", "split states {state_id1} {state_id2} - unequal actions for {}",
self.symbol_name(token), self.symbol_name(token),
); );
@ -341,32 +351,28 @@ impl Minimizer<'_> {
false false
} }
fn token_conflicts( fn token_conflicts<'b>(
&self, &self,
left_id: ParseStateId, left_id: ParseStateId,
right_id: ParseStateId, right_id: ParseStateId,
right_state: &ParseState, existing_tokens: impl Iterator<Item = &'b Symbol>,
new_token: Symbol, new_token: Symbol,
) -> bool { ) -> bool {
if new_token == Symbol::end_of_nonterminal_extra() { if new_token == Symbol::end_of_nonterminal_extra() {
debug!("split states {left_id} {right_id} - end of non-terminal extra",); info!("split states {left_id} {right_id} - end of non-terminal extra",);
return true; return true;
} }
// Do not add external tokens; they could conflict lexically with any of the state's // Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens. // existing lookahead tokens.
if new_token.is_external() { if new_token.is_external() {
debug!( info!(
"split states {left_id} {right_id} - external token {}", "split states {left_id} {right_id} - external token {}",
self.symbol_name(&new_token), self.symbol_name(&new_token),
); );
return true; return true;
} }
if right_state.reserved_words.contains(&new_token) {
return false;
}
// Do not add tokens which are both internal and external. Their validity could // Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner. // influence the behavior of the external scanner.
if self if self
@ -375,7 +381,7 @@ impl Minimizer<'_> {
.iter() .iter()
.any(|external| external.corresponding_internal_token == Some(new_token)) .any(|external| external.corresponding_internal_token == Some(new_token))
{ {
debug!( info!(
"split states {left_id} {right_id} - internal/external token {}", "split states {left_id} {right_id} - internal/external token {}",
self.symbol_name(&new_token), self.symbol_name(&new_token),
); );
@ -383,30 +389,23 @@ impl Minimizer<'_> {
} }
// Do not add a token if it conflicts with an existing token. // Do not add a token if it conflicts with an existing token.
for token in right_state.terminal_entries.keys().copied() { for token in existing_tokens {
if !token.is_terminal() { if token.is_terminal()
continue; && !(self.syntax_grammar.word_token == Some(*token)
} && self.keywords.contains(&new_token))
if self.syntax_grammar.word_token == Some(token) && self.keywords.contains(&new_token) { && !(self.syntax_grammar.word_token == Some(new_token)
continue; && self.keywords.contains(token))
} && (self
if self.syntax_grammar.word_token == Some(new_token) && self.keywords.contains(&token) {
continue;
}
if self
.token_conflict_map
.does_conflict(new_token.index, token.index)
|| self
.token_conflict_map .token_conflict_map
.does_match_same_string(new_token.index, token.index) .does_conflict(new_token.index, token.index)
|| self
.token_conflict_map
.does_match_same_string(new_token.index, token.index))
{ {
debug!( info!(
"split states {} {} - token {} conflicts with {}", "split states {left_id} {right_id} - token {} conflicts with {}",
left_id,
right_id,
self.symbol_name(&new_token), self.symbol_name(&new_token),
self.symbol_name(&token), self.symbol_name(token),
); );
return true; return true;
} }
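`merge_compatible_states` and `split_state_id_groups` follow a partition-refinement scheme: states start out grouped, and a group is split whenever two of its members lead to differently grouped successors, repeating until nothing changes. A simplified, hedged sketch of that idea (each state here has a single label and a single successor, unlike real parse states with per-token actions):

```rust
// Simplified partition refinement: states with equal labels start in one group,
// then groups are split by successor group until a fixed point is reached.
fn split_groups(labels: &[u32], successors: &[usize]) -> Vec<usize> {
    let n = labels.len();
    // Initial partition: states with equal labels share a group.
    let mut group_of: Vec<usize> = Vec::with_capacity(n);
    let mut seen_labels: Vec<u32> = Vec::new();
    for &label in labels {
        let id = match seen_labels.iter().position(|&l| l == label) {
            Some(id) => id,
            None => {
                seen_labels.push(label);
                seen_labels.len() - 1
            }
        };
        group_of.push(id);
    }
    loop {
        // Refine: a state's signature is (its own group, its successor's group).
        let mut seen_sigs: Vec<(usize, usize)> = Vec::new();
        let mut next: Vec<usize> = Vec::with_capacity(n);
        for state in 0..n {
            let sig = (group_of[state], group_of[successors[state]]);
            let id = match seen_sigs.iter().position(|&s| s == sig) {
                Some(id) => id,
                None => {
                    seen_sigs.push(sig);
                    seen_sigs.len() - 1
                }
            };
            next.push(id);
        }
        if next == group_of {
            return group_of;
        }
        group_of = next;
    }
}

fn main() {
    // Four states. States 0-2 share the same label; state 3 differs.
    // State 0's successor is state 1, the rest go to state 3.
    let labels = [0, 0, 0, 1];
    let successors = [1, 3, 3, 3];
    let groups = split_groups(&labels, &successors);
    // States 1 and 2 stay merged; state 0 is split off because its successor
    // lands in a different group than theirs.
    assert_eq!(groups[1], groups[2]);
    assert_ne!(groups[0], groups[1]);
    assert_ne!(groups[3], groups[1]);
}
```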

View file

@ -8,32 +8,30 @@ mod token_conflicts;
use std::collections::{BTreeSet, HashMap}; use std::collections::{BTreeSet, HashMap};
use anyhow::Result;
pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT; pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT;
use build_parse_table::BuildTableResult; use log::info;
pub use build_parse_table::ParseTableBuilderError;
use log::{debug, info};
use self::{ use self::{
build_lex_table::build_lex_table, build_lex_table::build_lex_table,
build_parse_table::{build_parse_table, ParseStateInfo}, build_parse_table::{build_parse_table, ParseStateInfo},
coincident_tokens::CoincidentTokenIndex, coincident_tokens::CoincidentTokenIndex,
item_set_builder::ParseItemSetBuilder,
minimize_parse_table::minimize_parse_table, minimize_parse_table::minimize_parse_table,
token_conflicts::TokenConflictMap, token_conflicts::TokenConflictMap,
}; };
use crate::{ use crate::generate::{
grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}, grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
nfa::{CharacterSet, NfaCursor}, nfa::{CharacterSet, NfaCursor},
node_types::VariableInfo, node_types::VariableInfo,
rules::{AliasMap, Symbol, SymbolType, TokenSet}, rules::{AliasMap, Symbol, SymbolType, TokenSet},
tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}, tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
OptLevel,
}; };
pub struct Tables { pub struct Tables {
pub parse_table: ParseTable, pub parse_table: ParseTable,
pub main_lex_table: LexTable, pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable, pub keyword_lex_table: LexTable,
pub word_token: Option<Symbol>,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>, pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
} }
@ -44,17 +42,9 @@ pub fn build_tables(
variable_info: &[VariableInfo], variable_info: &[VariableInfo],
inlines: &InlinedProductionMap, inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>, report_symbol_name: Option<&str>,
optimizations: OptLevel, ) -> Result<Tables> {
) -> BuildTableResult<Tables> { let (mut parse_table, following_tokens, parse_state_info) =
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines); build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let following_tokens =
get_following_tokens(syntax_grammar, lexical_grammar, inlines, &item_set_builder);
let (mut parse_table, parse_state_info) = build_parse_table(
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
let keywords = identify_keywords( let keywords = identify_keywords(
@ -80,7 +70,6 @@ pub fn build_tables(
simple_aliases, simple_aliases,
&token_conflict_map, &token_conflict_map,
&keywords, &keywords,
optimizations,
); );
let lex_tables = build_lex_table( let lex_tables = build_lex_table(
&mut parse_table, &mut parse_table,
@ -103,59 +92,15 @@ pub fn build_tables(
); );
} }
if parse_table.states.len() > u16::MAX as usize {
Err(ParseTableBuilderError::StateCount(parse_table.states.len()))?;
}
Ok(Tables { Ok(Tables {
parse_table, parse_table,
main_lex_table: lex_tables.main_lex_table, main_lex_table: lex_tables.main_lex_table,
keyword_lex_table: lex_tables.keyword_lex_table, keyword_lex_table: lex_tables.keyword_lex_table,
large_character_sets: lex_tables.large_character_sets, large_character_sets: lex_tables.large_character_sets,
word_token: syntax_grammar.word_token,
}) })
} }
fn get_following_tokens(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) -> Vec<TokenSet> {
let mut result = vec![TokenSet::new(); lexical_grammar.variables.len()];
let productions = syntax_grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
let right_reserved_tokens = builder.reserved_first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
if let Some(reserved_tokens) = right_reserved_tokens {
result[left_token.index].insert_all_terminals(reserved_tokens);
}
}
}
}
}
for extra in &syntax_grammar.extra_symbols {
if extra.is_terminal() {
for entry in &mut result {
entry.insert(*extra);
}
result[extra.index] = all_tokens.clone();
}
}
result
}
fn populate_error_state( fn populate_error_state(
parse_table: &mut ParseTable, parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
@ -179,7 +124,7 @@ fn populate_error_state(
if conflicts_with_other_tokens { if conflicts_with_other_tokens {
None None
} else { } else {
debug!( info!(
"error recovery - token {} has no conflicts", "error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
@ -205,14 +150,14 @@ fn populate_error_state(
!coincident_token_index.contains(symbol, *t) !coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index) && token_conflict_map.does_conflict(symbol.index, t.index)
}) { }) {
debug!( info!(
"error recovery - exclude token {} because of conflict with {}", "error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
); );
continue; continue;
} }
} }
debug!( info!(
"error recovery - include token {}", "error recovery - include token {}",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
@ -263,7 +208,7 @@ fn populate_used_symbols(
// ensure that a subtree's symbol can be successfully reassigned to the word token // ensure that a subtree's symbol can be successfully reassigned to the word token
// without having to move the subtree to the heap. // without having to move the subtree to the heap.
// See https://github.com/tree-sitter/tree-sitter/issues/258 // See https://github.com/tree-sitter/tree-sitter/issues/258
if syntax_grammar.word_token.is_some_and(|t| t.index == i) { if syntax_grammar.word_token.map_or(false, |t| t.index == i) {
parse_table.symbols.insert(1, Symbol::terminal(i)); parse_table.symbols.insert(1, Symbol::terminal(i));
} else { } else {
parse_table.symbols.push(Symbol::terminal(i)); parse_table.symbols.push(Symbol::terminal(i));
@ -345,7 +290,7 @@ fn identify_keywords(
&& token_conflict_map.does_match_same_string(i, word_token.index) && token_conflict_map.does_match_same_string(i, word_token.index)
&& !token_conflict_map.does_match_different_string(i, word_token.index) && !token_conflict_map.does_match_different_string(i, word_token.index)
{ {
debug!( info!(
"Keywords - add candidate {}", "Keywords - add candidate {}",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
@ -364,7 +309,7 @@ fn identify_keywords(
if other_token != *token if other_token != *token
&& token_conflict_map.does_match_same_string(other_token.index, token.index) && token_conflict_map.does_match_same_string(other_token.index, token.index)
{ {
debug!( info!(
"Keywords - exclude {} because it matches the same string as {}", "Keywords - exclude {} because it matches the same string as {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_token.index].name lexical_grammar.variables[other_token.index].name
@ -406,7 +351,7 @@ fn identify_keywords(
word_token.index, word_token.index,
other_index, other_index,
) { ) {
debug!( info!(
"Keywords - exclude {} because of conflict with {}", "Keywords - exclude {} because of conflict with {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_index].name lexical_grammar.variables[other_index].name
@ -415,7 +360,7 @@ fn identify_keywords(
} }
} }
debug!( info!(
"Keywords - include {}", "Keywords - include {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
); );
@ -469,9 +414,9 @@ fn report_state_info<'a>(
for (i, state) in parse_table.states.iter().enumerate() { for (i, state) in parse_table.states.iter().enumerate() {
all_state_indices.insert(i); all_state_indices.insert(i);
let item_set = &parse_state_info[state.id]; let item_set = &parse_state_info[state.id];
for entry in &item_set.1.entries { for (item, _) in &item_set.1.entries {
if !entry.item.is_augmented() { if !item.is_augmented() {
symbols_with_state_indices[entry.item.variable_index as usize] symbols_with_state_indices[item.variable_index as usize]
.1 .1
.insert(i); .insert(i);
} }
@ -487,14 +432,14 @@ fn report_state_info<'a>(
.max() .max()
.unwrap(); .unwrap();
for (symbol, states) in &symbols_with_state_indices { for (symbol, states) in &symbols_with_state_indices {
info!( eprintln!(
"{:width$}\t{}", "{:width$}\t{}",
syntax_grammar.variables[symbol.index].name, syntax_grammar.variables[symbol.index].name,
states.len(), states.len(),
width = max_symbol_name_length width = max_symbol_name_length
); );
} }
info!(""); eprintln!();
let state_indices = if report_symbol_name == "*" { let state_indices = if report_symbol_name == "*" {
Some(&all_state_indices) Some(&all_state_indices)
@ -517,27 +462,22 @@ fn report_state_info<'a>(
for state_index in state_indices { for state_index in state_indices {
let id = parse_table.states[state_index].id; let id = parse_table.states[state_index].id;
let (preceding_symbols, item_set) = &parse_state_info[id]; let (preceding_symbols, item_set) = &parse_state_info[id];
info!("state index: {state_index}"); eprintln!("state index: {state_index}");
info!("state id: {id}"); eprintln!("state id: {id}");
info!( eprint!("symbol sequence:");
"symbol sequence: {}", for symbol in preceding_symbols {
preceding_symbols let name = if symbol.is_terminal() {
.iter() &lexical_grammar.variables[symbol.index].name
.map(|symbol| { } else if symbol.is_external() {
if symbol.is_terminal() { &syntax_grammar.external_tokens[symbol.index].name
lexical_grammar.variables[symbol.index].name.clone() } else {
} else if symbol.is_external() { &syntax_grammar.variables[symbol.index].name
syntax_grammar.external_tokens[symbol.index].name.clone() };
} else { eprint!(" {name}");
syntax_grammar.variables[symbol.index].name.clone() }
} eprintln!(
})
.collect::<Vec<_>>()
.join(" ")
);
info!(
"\nitems:\n{}", "\nitems:\n{}",
item::ParseItemSetDisplay(item_set, syntax_grammar, lexical_grammar), self::item::ParseItemSetDisplay(item_set, syntax_grammar, lexical_grammar,),
); );
} }
} }

View file

@ -1,6 +1,6 @@
use std::{cmp::Ordering, collections::HashSet, fmt}; use std::{cmp::Ordering, collections::HashSet, fmt};
use crate::{ use crate::generate::{
build_tables::item::TokenSetDisplay, build_tables::item::TokenSetDisplay,
grammars::{LexicalGrammar, SyntaxGrammar}, grammars::{LexicalGrammar, SyntaxGrammar},
nfa::{CharacterSet, NfaCursor, NfaTransition}, nfa::{CharacterSet, NfaCursor, NfaTransition},
@ -28,7 +28,7 @@ pub struct TokenConflictMap<'a> {
impl<'a> TokenConflictMap<'a> { impl<'a> TokenConflictMap<'a> {
/// Create a token conflict map based on a lexical grammar, which describes the structure /// Create a token conflict map based on a lexical grammar, which describes the structure
/// of each token, and a `following_token` map, which indicates which tokens may be appear /// each token, and a `following_token` map, which indicates which tokens may be appear
/// immediately after each other token. /// immediately after each other token.
/// ///
/// This analyzes the possible kinds of overlap between each pair of tokens and stores /// This analyzes the possible kinds of overlap between each pair of tokens and stores
@ -145,7 +145,7 @@ impl<'a> TokenConflictMap<'a> {
} }
} }
impl fmt::Debug for TokenConflictMap<'_> { impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "TokenConflictMap {{")?; writeln!(f, "TokenConflictMap {{")?;
@ -373,7 +373,7 @@ fn compute_conflict_status(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::{
grammars::{Variable, VariableType}, grammars::{Variable, VariableType},
prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}, prepare_grammar::{expand_tokens, ExtractedLexicalGrammar},
rules::{Precedence, Rule, Symbol}, rules::{Precedence, Rule, Symbol},

View file

@ -3,7 +3,7 @@ pub fn split_state_id_groups<S>(
state_ids_by_group_id: &mut Vec<Vec<usize>>, state_ids_by_group_id: &mut Vec<Vec<usize>>,
group_ids_by_state_id: &mut [usize], group_ids_by_state_id: &mut [usize],
start_group_id: usize, start_group_id: usize,
mut should_split: impl FnMut(&S, &S, &[usize]) -> bool, mut f: impl FnMut(&S, &S, &[usize]) -> bool,
) -> bool { ) -> bool {
let mut result = false; let mut result = false;
@ -33,7 +33,7 @@ pub fn split_state_id_groups<S>(
} }
let right_state = &states[right_state_id]; let right_state = &states[right_state_id];
if should_split(left_state, right_state, group_ids_by_state_id) { if f(left_state, right_state, group_ids_by_state_id) {
split_state_ids.push(right_state_id); split_state_ids.push(right_state_id);
} }

View file

@ -16,7 +16,6 @@ function alias(rule, value) {
result.value = value.symbol.name; result.value = value.symbol.name;
return result; return result;
case Object: case Object:
case GrammarSymbol:
if (typeof value.type === 'string' && value.type === 'SYMBOL') { if (typeof value.type === 'string' && value.type === 'SYMBOL') {
result.named = true; result.named = true;
result.value = value.name; result.value = value.name;
@ -70,7 +69,7 @@ function prec(number, rule) {
}; };
} }
prec.left = function (number, rule) { prec.left = function(number, rule) {
if (rule == null) { if (rule == null) {
rule = number; rule = number;
number = 0; number = 0;
@ -92,7 +91,7 @@ prec.left = function (number, rule) {
}; };
} }
prec.right = function (number, rule) { prec.right = function(number, rule) {
if (rule == null) { if (rule == null) {
rule = number; rule = number;
number = 0; number = 0;
@ -114,7 +113,7 @@ prec.right = function (number, rule) {
}; };
} }
prec.dynamic = function (number, rule) { prec.dynamic = function(number, rule) {
checkPrecedence(number); checkPrecedence(number);
checkArguments( checkArguments(
arguments, arguments,
@ -154,26 +153,11 @@ function seq(...elements) {
}; };
} }
class GrammarSymbol {
constructor(name) {
this.type = "SYMBOL";
this.name = name;
}
}
function reserved(wordset, rule) {
if (typeof wordset !== 'string') {
throw new Error('Invalid reserved word set name: ' + wordset)
}
return {
type: "RESERVED",
content: normalize(rule),
context_name: wordset,
}
}
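// Illustrative only; not part of this diff. A hedged sketch of the call shape the
// `reserved` helper above accepts inside a grammar.js rule: a word-set name (the
// hypothetical set 'keywords' here) plus the rule it wraps.
//
//   member_expression: $ => seq(
//     field('object', $.expression),
//     '.',
//     field('property', reserved('keywords', $.identifier)),
//   ),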
function sym(name) { function sym(name) {
return new GrammarSymbol(name); return {
type: "SYMBOL",
name
};
} }
function token(value) { function token(value) {
@ -184,7 +168,7 @@ function token(value) {
}; };
} }
token.immediate = function (value) { token.immediate = function(value) {
checkArguments(arguments, arguments.length, token.immediate, 'token.immediate', '', 'literal'); checkArguments(arguments, arguments.length, token.immediate, 'token.immediate', '', 'literal');
return { return {
type: "IMMEDIATE_TOKEN", type: "IMMEDIATE_TOKEN",
@ -211,11 +195,6 @@ function normalize(value) {
type: 'PATTERN', type: 'PATTERN',
value: value.source value: value.source
}; };
case RustRegex:
return {
type: 'PATTERN',
value: value.value
};
case ReferenceError: case ReferenceError:
throw value throw value
default: default:
@ -257,7 +236,6 @@ function grammar(baseGrammar, options) {
inline: [], inline: [],
supertypes: [], supertypes: [],
precedences: [], precedences: [],
reserved: {},
}; };
} else { } else {
baseGrammar = baseGrammar.grammar; baseGrammar = baseGrammar.grammar;
@ -331,28 +309,6 @@ function grammar(baseGrammar, options) {
} }
} }
let reserved = baseGrammar.reserved;
if (options.reserved) {
if (typeof options.reserved !== "object") {
throw new Error("Grammar's 'reserved' property must be an object.");
}
for (const reservedWordSetName of Object.keys(options.reserved)) {
const reservedWordSetFn = options.reserved[reservedWordSetName]
if (typeof reservedWordSetFn !== "function") {
throw new Error(`Grammar reserved word sets must all be functions. '${reservedWordSetName}' is not.`);
}
const reservedTokens = reservedWordSetFn.call(ruleBuilder, ruleBuilder, baseGrammar.reserved[reservedWordSetName]);
if (!Array.isArray(reservedTokens)) {
throw new Error(`Grammar's reserved word set functions must all return arrays of rules. '${reservedWordSetName}' does not.`);
}
reserved[reservedWordSetName] = reservedTokens.map(normalize);
}
}
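// Illustrative only; not part of this diff. A hedged sketch of the grammar-level
// `reserved` option that the validation above accepts: an object whose values are
// functions returning arrays of rules (string literals and `$.symbol` references
// both pass through `normalize`).
//
//   reserved: {
//     keywords: $ => ['if', 'else', 'while', $.primitive_type],
//   },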
let extras = baseGrammar.extras.slice(); let extras = baseGrammar.extras.slice();
if (options.extras) { if (options.extras) {
if (typeof options.extras !== "function") { if (typeof options.extras !== "function") {
@ -483,17 +439,10 @@ function grammar(baseGrammar, options) {
externals, externals,
inline, inline,
supertypes, supertypes,
reserved,
}, },
}; };
} }
class RustRegex {
constructor(value) {
this.value = value;
}
}
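// Illustrative only; not part of this diff. A hedged sketch of using the `RustRegex`
// wrapper above in a rule; `normalize` serializes it as a PATTERN whose value is
// passed to the generator's Rust regex handling verbatim.
//
//   identifier: _ => new RustRegex('(?i)[a-z_][a-z0-9_]*'),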
function checkArguments(args, ruleCount, caller, callerName, suffix = '', argType = 'rule') { function checkArguments(args, ruleCount, caller, callerName, suffix = '', argType = 'rule') {
// Allow for .map() usage where additional arguments are index and the entire array. // Allow for .map() usage where additional arguments are index and the entire array.
const isMapCall = ruleCount === 3 && typeof args[1] === 'number' && Array.isArray(args[2]); const isMapCall = ruleCount === 3 && typeof args[1] === 'number' && Array.isArray(args[2]);
@ -517,7 +466,6 @@ function checkPrecedence(value) {
} }
function getEnv(name) { function getEnv(name) {
if (globalThis.native) return globalThis.__ts_grammar_path;
if (globalThis.process) return process.env[name]; // Node/Bun if (globalThis.process) return process.env[name]; // Node/Bun
if (globalThis.Deno) return Deno.env.get(name); // Deno if (globalThis.Deno) return Deno.env.get(name); // Deno
throw Error("Unsupported JS runtime"); throw Error("Unsupported JS runtime");
@ -530,31 +478,16 @@ globalThis.optional = optional;
globalThis.prec = prec; globalThis.prec = prec;
globalThis.repeat = repeat; globalThis.repeat = repeat;
globalThis.repeat1 = repeat1; globalThis.repeat1 = repeat1;
globalThis.reserved = reserved;
globalThis.seq = seq; globalThis.seq = seq;
globalThis.sym = sym; globalThis.sym = sym;
globalThis.token = token; globalThis.token = token;
globalThis.grammar = grammar; globalThis.grammar = grammar;
globalThis.field = field; globalThis.field = field;
globalThis.RustRegex = RustRegex;
const grammarPath = getEnv("TREE_SITTER_GRAMMAR_PATH"); const result = await import(getEnv("TREE_SITTER_GRAMMAR_PATH"));
let result = await import(grammarPath); const output = JSON.stringify(result.default?.grammar ?? result.grammar);
let grammarObj = result.default?.grammar ?? result.grammar;
if (globalThis.native && !grammarObj) { if (globalThis.process) { // Node/Bun
grammarObj = module.exports.grammar;
}
const object = {
"$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
...grammarObj,
};
const output = JSON.stringify(object);
if (globalThis.native) {
globalThis.output = output;
} else if (globalThis.process) { // Node/Bun
process.stdout.write(output); process.stdout.write(output);
} else if (globalThis.Deno) { // Deno } else if (globalThis.Deno) { // Deno
Deno.stdout.writeSync(new TextEncoder().encode(output)); Deno.stdout.writeSync(new TextEncoder().encode(output));

View file

@ -1,6 +1,6 @@
{ {
"$schema": "http://json-schema.org/draft-07/schema#", "$schema": "http://json-schema.org/draft-07/schema#",
"title": "Tree-sitter grammar specification", "title": "tree-sitter grammar specification",
"type": "object", "type": "object",
"required": ["name", "rules"], "required": ["name", "rules"],
@ -8,18 +8,14 @@
"additionalProperties": false, "additionalProperties": false,
"properties": { "properties": {
"$schema": {
"type": "string"
},
"name": { "name": {
"description": "The name of the grammar", "description": "the name of the grammar",
"type": "string", "type": "string",
"pattern": "^[a-zA-Z_]\\w*" "pattern": "^[a-zA-Z_]\\w*"
}, },
"inherits": { "inherits": {
"description": "The name of the parent grammar", "description": "the name of the parent grammar",
"type": "string", "type": "string",
"pattern": "^[a-zA-Z_]\\w*" "pattern": "^[a-zA-Z_]\\w*"
}, },
@ -57,20 +53,6 @@
} }
}, },
"reserved": {
"type": "object",
"patternProperties": {
"^[a-zA-Z_]\\w*$": {
"type": "array",
"uniqueItems": true,
"items": {
"$ref": "#/definitions/rule"
}
}
},
"additionalProperties": false
},
"externals": { "externals": {
"type": "array", "type": "array",
"uniqueItems": true, "uniqueItems": true,
@ -107,11 +89,11 @@
}, },
"supertypes": { "supertypes": {
"description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.", "description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types.",
"type": "array", "type": "array",
"uniqueItems": true, "uniqueItems": true,
"items": { "items": {
"description": "The name of a rule in `rules` or `extras`", "description": "the name of a rule in `rules` or `extras`",
"type": "string" "type": "string"
} }
} }
@ -123,7 +105,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "BLANK" "pattern": "^BLANK$"
} }
}, },
"required": ["type"] "required": ["type"]
@ -134,7 +116,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "STRING" "pattern": "^STRING$"
}, },
"value": { "value": {
"type": "string" "type": "string"
@ -148,7 +130,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "PATTERN" "pattern": "^PATTERN$"
}, },
"value": { "type": "string" }, "value": { "type": "string" },
"flags": { "type": "string" } "flags": { "type": "string" }
@ -161,7 +143,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "SYMBOL" "pattern": "^SYMBOL$"
}, },
"name": { "type": "string" } "name": { "type": "string" }
}, },
@ -173,7 +155,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "SEQ" "pattern": "^SEQ$"
}, },
"members": { "members": {
"type": "array", "type": "array",
@ -190,7 +172,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "CHOICE" "pattern": "^CHOICE$"
}, },
"members": { "members": {
"type": "array", "type": "array",
@ -207,10 +189,14 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "ALIAS" "pattern": "^ALIAS$"
},
"value": {
"type": "string"
},
"named": {
"type": "boolean"
}, },
"value": { "type": "string" },
"named": { "type": "boolean" },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
} }
@ -223,7 +209,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "REPEAT" "pattern": "^REPEAT$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -237,7 +223,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "REPEAT1" "pattern": "^REPEAT1$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -246,30 +232,12 @@
"required": ["type", "content"] "required": ["type", "content"]
}, },
"reserved-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "RESERVED"
},
"context_name": { "type": "string" },
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "context_name", "content"]
},
"token-rule": { "token-rule": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"enum": [ "pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
"TOKEN",
"IMMEDIATE_TOKEN"
]
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -283,7 +251,7 @@
"name": { "type": "string" }, "name": { "type": "string" },
"type": { "type": {
"type": "string", "type": "string",
"const": "FIELD" "pattern": "^FIELD$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -297,12 +265,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"enum": [ "pattern": "^(PREC|PREC_LEFT|PREC_RIGHT|PREC_DYNAMIC)$"
"PREC",
"PREC_LEFT",
"PREC_RIGHT",
"PREC_DYNAMIC"
]
}, },
"value": { "value": {
"oneof": [ "oneof": [
@ -328,7 +291,6 @@
{ "$ref": "#/definitions/choice-rule" }, { "$ref": "#/definitions/choice-rule" },
{ "$ref": "#/definitions/repeat1-rule" }, { "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" }, { "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/reserved-rule" },
{ "$ref": "#/definitions/token-rule" }, { "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/field-rule" }, { "$ref": "#/definitions/field-rule" },
{ "$ref": "#/definitions/prec-rule" } { "$ref": "#/definitions/prec-rule" }

View file

@ -0,0 +1,685 @@
use std::{
fs,
fs::File,
io::BufReader,
path::{Path, PathBuf},
str,
};
use anyhow::{anyhow, Context, Result};
use heck::{ToKebabCase, ToShoutySnakeCase, ToSnakeCase, ToUpperCamelCase};
use indoc::indoc;
use serde::Deserialize;
use serde_json::{json, Map, Value};
use super::write_file;
const CLI_VERSION: &str = env!("CARGO_PKG_VERSION");
const CLI_VERSION_PLACEHOLDER: &str = "CLI_VERSION";
const PARSER_NAME_PLACEHOLDER: &str = "PARSER_NAME";
const CAMEL_PARSER_NAME_PLACEHOLDER: &str = "CAMEL_PARSER_NAME";
const UPPER_PARSER_NAME_PLACEHOLDER: &str = "UPPER_PARSER_NAME";
const LOWER_PARSER_NAME_PLACEHOLDER: &str = "LOWER_PARSER_NAME";
const GRAMMAR_JS_TEMPLATE: &str = include_str!("./templates/grammar.js");
const PACKAGE_JSON_TEMPLATE: &str = include_str!("./templates/package.json");
const GITIGNORE_TEMPLATE: &str = include_str!("./templates/gitignore");
const GITATTRIBUTES_TEMPLATE: &str = include_str!("./templates/gitattributes");
const EDITORCONFIG_TEMPLATE: &str = include_str!("./templates/.editorconfig");
const RUST_BINDING_VERSION: &str = env!("CARGO_PKG_VERSION");
const RUST_BINDING_VERSION_PLACEHOLDER: &str = "RUST_BINDING_VERSION";
const LIB_RS_TEMPLATE: &str = include_str!("./templates/lib.rs");
const BUILD_RS_TEMPLATE: &str = include_str!("./templates/build.rs");
const CARGO_TOML_TEMPLATE: &str = include_str!("./templates/_cargo.toml");
const INDEX_JS_TEMPLATE: &str = include_str!("./templates/index.js");
const INDEX_D_TS_TEMPLATE: &str = include_str!("./templates/index.d.ts");
const JS_BINDING_CC_TEMPLATE: &str = include_str!("./templates/js-binding.cc");
const BINDING_GYP_TEMPLATE: &str = include_str!("./templates/binding.gyp");
const BINDING_TEST_JS_TEMPLATE: &str = include_str!("./templates/binding_test.js");
const MAKEFILE_TEMPLATE: &str = include_str!("./templates/makefile");
const PARSER_NAME_H_TEMPLATE: &str = include_str!("./templates/PARSER_NAME.h");
const PARSER_NAME_PC_IN_TEMPLATE: &str = include_str!("./templates/PARSER_NAME.pc.in");
const GO_MOD_TEMPLATE: &str = include_str!("./templates/go.mod");
const BINDING_GO_TEMPLATE: &str = include_str!("./templates/binding.go");
const BINDING_TEST_GO_TEMPLATE: &str = include_str!("./templates/binding_test.go");
const SETUP_PY_TEMPLATE: &str = include_str!("./templates/setup.py");
const INIT_PY_TEMPLATE: &str = include_str!("./templates/__init__.py");
const INIT_PYI_TEMPLATE: &str = include_str!("./templates/__init__.pyi");
const PYPROJECT_TOML_TEMPLATE: &str = include_str!("./templates/pyproject.toml");
const PY_BINDING_C_TEMPLATE: &str = include_str!("./templates/py-binding.c");
const TEST_BINDING_PY_TEMPLATE: &str = include_str!("./templates/test_binding.py");
const PACKAGE_SWIFT_TEMPLATE: &str = include_str!("./templates/package.swift");
const TESTS_SWIFT_TEMPLATE: &str = include_str!("./templates/tests.swift");
#[derive(Deserialize, Debug)]
struct LanguageConfiguration {}
#[derive(Deserialize, Debug)]
pub struct PackageJSON {
#[serde(rename = "tree-sitter")]
tree_sitter: Option<Vec<LanguageConfiguration>>,
}
pub fn path_in_ignore(repo_path: &Path) -> bool {
[
"bindings",
"build",
"examples",
"node_modules",
"queries",
"script",
"src",
"target",
"test",
"types",
]
.iter()
.any(|dir| repo_path.ends_with(dir))
}
fn insert_after(
map: Map<String, Value>,
after: &str,
key: &str,
value: Value,
) -> Map<String, Value> {
let mut entries = map.into_iter().collect::<Vec<_>>();
let after_index = entries
.iter()
.position(|(k, _)| k == after)
.unwrap_or(entries.len() - 1)
+ 1;
entries.insert(after_index, (key.to_string(), value));
entries.into_iter().collect()
}
pub fn generate_grammar_files(
repo_path: &Path,
language_name: &str,
generate_bindings: bool,
) -> Result<()> {
let dashed_language_name = language_name.to_kebab_case();
// TODO: remove legacy code updates in v0.24.0
// Create or update package.json
let package_json_path_state = missing_path_else(
repo_path.join("package.json"),
|path| generate_file(path, PACKAGE_JSON_TEMPLATE, dashed_language_name.as_str()),
|path| {
let package_json_str =
fs::read_to_string(path).with_context(|| "Failed to read package.json")?;
let mut package_json = serde_json::from_str::<Map<String, Value>>(&package_json_str)
.with_context(|| "Failed to parse package.json")?;
if generate_bindings {
let mut updated = false;
let dependencies = package_json
.entry("dependencies".to_string())
.or_insert_with(|| Value::Object(Map::new()))
.as_object_mut()
.unwrap();
if dependencies.remove("nan").is_some() {
eprintln!("Replacing nan dependency with node-addon-api in package.json");
dependencies.insert("node-addon-api".to_string(), "^8.0.0".into());
updated = true;
}
if !dependencies.contains_key("node-gyp-build") {
eprintln!("Adding node-gyp-build dependency to package.json");
dependencies.insert("node-gyp-build".to_string(), "^4.8.1".into());
updated = true;
}
let dev_dependencies = package_json
.entry("devDependencies".to_string())
.or_insert_with(|| Value::Object(Map::new()))
.as_object_mut()
.unwrap();
if !dev_dependencies.contains_key("prebuildify") {
eprintln!("Adding prebuildify devDependency to package.json");
dev_dependencies.insert("prebuildify".to_string(), "^6.0.1".into());
updated = true;
}
let node_test = "node --test bindings/node/*_test.js";
let scripts = package_json
.entry("scripts".to_string())
.or_insert_with(|| Value::Object(Map::new()))
.as_object_mut()
.unwrap();
if !scripts.get("test").is_some_and(|v| v == node_test) {
eprintln!("Updating package.json scripts");
*scripts = Map::from_iter([
("install".to_string(), "node-gyp-build".into()),
("prestart".to_string(), "tree-sitter build --wasm".into()),
("start".to_string(), "tree-sitter playground".into()),
("test".to_string(), node_test.into()),
]);
updated = true;
}
// insert `peerDependencies` after `dependencies`
if !package_json.contains_key("peerDependencies") {
eprintln!("Adding peerDependencies to package.json");
package_json = insert_after(
package_json,
"dependencies",
"peerDependencies",
json!({"tree-sitter": "^0.21.1"}),
);
package_json = insert_after(
package_json,
"peerDependencies",
"peerDependenciesMeta",
json!({"tree_sitter": {"optional": true}}),
);
updated = true;
}
// insert `types` right after `main`
if !package_json.contains_key("types") {
eprintln!("Adding types to package.json");
package_json =
insert_after(package_json, "main", "types", "bindings/node".into());
updated = true;
}
// insert `files` right after `keywords`
if !package_json.contains_key("files") {
eprintln!("Adding files to package.json");
package_json = insert_after(
package_json,
"keywords",
"files",
json!([
"grammar.js",
"binding.gyp",
"prebuilds/**",
"bindings/node/*",
"queries/*",
"src/**",
"*.wasm"
]),
);
updated = true;
}
// insert `tree-sitter` at the end
if !package_json.contains_key("tree-sitter") {
eprintln!("Adding a `tree-sitter` section to package.json");
package_json.insert(
"tree-sitter".to_string(),
json!([{
"scope": format!("source.{language_name}"),
"injection-regex": format!("^{language_name}$"),
}]),
);
updated = true;
}
if updated {
let mut package_json_str = serde_json::to_string_pretty(&package_json)?;
package_json_str.push('\n');
write_file(path, package_json_str)?;
}
}
Ok(())
},
)?;
let package_json = match lookup_package_json_for_path(package_json_path_state.as_path()) {
Ok((_, p)) => p,
Err(e) if generate_bindings => return Err(e),
_ => return Ok(()),
};
// Do not create a grammar.js file in a repo with multiple language configs
if !package_json.has_multiple_language_configs() {
missing_path(repo_path.join("grammar.js"), |path| {
generate_file(path, GRAMMAR_JS_TEMPLATE, language_name)
})?;
}
if !generate_bindings {
// our job is done
return Ok(());
}
// Write .gitignore file
missing_path(repo_path.join(".gitignore"), |path| {
generate_file(path, GITIGNORE_TEMPLATE, language_name)
})?;
// Write .gitattributes file
missing_path(repo_path.join(".gitattributes"), |path| {
generate_file(path, GITATTRIBUTES_TEMPLATE, language_name)
})?;
// Write .editorconfig file
missing_path(repo_path.join(".editorconfig"), |path| {
generate_file(path, EDITORCONFIG_TEMPLATE, language_name)
})?;
let bindings_dir = repo_path.join("bindings");
// Generate Rust bindings
missing_path(bindings_dir.join("rust"), create_dir)?.apply(|path| {
missing_path_else(
path.join("lib.rs"),
|path| generate_file(path, LIB_RS_TEMPLATE, language_name),
|path| {
let lib_rs =
fs::read_to_string(path).with_context(|| "Failed to read lib.rs")?;
if !lib_rs.contains("tree_sitter_language") {
generate_file(path, LIB_RS_TEMPLATE, language_name)?;
eprintln!("Updated lib.rs with `tree_sitter_language` dependency");
}
Ok(())
},
)?;
missing_path_else(
path.join("build.rs"),
|path| generate_file(path, BUILD_RS_TEMPLATE, language_name),
|path| {
let build_rs =
fs::read_to_string(path).with_context(|| "Failed to read build.rs")?;
if !build_rs.contains("-utf-8") {
let index = build_rs
.find(" let parser_path = src_dir.join(\"parser.c\")")
.ok_or_else(|| anyhow!(indoc!{
"Failed to auto-update build.rs with the `/utf-8` flag for windows.
To fix this, remove `bindings/rust/build.rs` and re-run `tree-sitter generate`"}))?;
let build_rs = format!(
"{}{}{}\n{}",
&build_rs[..index],
" #[cfg(target_env = \"msvc\")]\n",
" c_config.flag(\"-utf-8\");\n",
&build_rs[index..]
);
write_file(path, build_rs)?;
eprintln!("Updated build.rs with the /utf-8 flag for Windows compilation");
}
Ok(())
},
)?;
missing_path_else(
repo_path.join("Cargo.toml"),
|path| generate_file(path, CARGO_TOML_TEMPLATE, dashed_language_name.as_str()),
|path| {
let cargo_toml =
fs::read_to_string(path).with_context(|| "Failed to read Cargo.toml")?;
if !cargo_toml.contains("tree-sitter-language") {
let start_index = cargo_toml
.find("tree-sitter = \"")
.ok_or_else(|| anyhow!("Failed to find the `tree-sitter` dependency in Cargo.toml"))?;
let version_start_index = start_index + "tree-sitter = \"".len();
let version_end_index = cargo_toml[version_start_index..]
.find('\"')
.map(|i| i + version_start_index)
.ok_or_else(|| anyhow!("Failed to find the end of the `tree-sitter` version in Cargo.toml"))?;
let cargo_toml = format!(
"{}{}{}\n{}\n{}",
&cargo_toml[..start_index],
"tree-sitter-language = \"0.1.0\"",
&cargo_toml[version_end_index + 1..],
"[dev-dependencies]",
"tree-sitter = \"0.23\"",
);
write_file(path, cargo_toml)?;
eprintln!("Updated Cargo.toml with the `tree-sitter-language` dependency");
}
Ok(())
},
)?;
Ok(())
})?;
// Generate Node bindings
missing_path(bindings_dir.join("node"), create_dir)?.apply(|path| {
missing_path_else(
path.join("index.js"),
|path| generate_file(path, INDEX_JS_TEMPLATE, language_name),
|path| {
let index_js =
fs::read_to_string(path).with_context(|| "Failed to read index.js")?;
if index_js.contains("../../build/Release") {
eprintln!("Replacing index.js with new binding API");
generate_file(path, INDEX_JS_TEMPLATE, language_name)?;
}
Ok(())
},
)?;
missing_path(path.join("index.d.ts"), |path| {
generate_file(path, INDEX_D_TS_TEMPLATE, language_name)
})?;
missing_path(path.join("binding_test.js"), |path| {
generate_file(path, BINDING_TEST_JS_TEMPLATE, language_name)
})?;
missing_path_else(
path.join("binding.cc"),
|path| generate_file(path, JS_BINDING_CC_TEMPLATE, language_name),
|path| {
let binding_cc =
fs::read_to_string(path).with_context(|| "Failed to read binding.cc")?;
if binding_cc.contains("NAN_METHOD(New) {}") {
eprintln!("Replacing binding.cc with new binding API");
generate_file(path, JS_BINDING_CC_TEMPLATE, language_name)?;
}
Ok(())
},
)?;
// Create binding.gyp, or update it with new binding API.
missing_path_else(
repo_path.join("binding.gyp"),
|path| generate_file(path, BINDING_GYP_TEMPLATE, language_name),
|path| {
let binding_gyp =
fs::read_to_string(path).with_context(|| "Failed to read binding.gyp")?;
if binding_gyp.contains("require('nan')") {
eprintln!("Replacing binding.gyp with new binding API");
generate_file(path, BINDING_GYP_TEMPLATE, language_name)?;
}
Ok(())
},
)?;
Ok(())
})?;
// Generate C bindings
missing_path(bindings_dir.join("c"), create_dir)?.apply(|path| {
missing_path(
path.join(format!("tree-sitter-{language_name}.h")),
|path| generate_file(path, PARSER_NAME_H_TEMPLATE, language_name),
)?;
missing_path(
path.join(format!("tree-sitter-{language_name}.pc.in")),
|path| generate_file(path, PARSER_NAME_PC_IN_TEMPLATE, language_name),
)?;
missing_path(repo_path.join("Makefile"), |path| {
generate_file(path, MAKEFILE_TEMPLATE, language_name)
})?;
Ok(())
})?;
// Generate Go bindings
missing_path(bindings_dir.join("go"), create_dir)?.apply(|path| {
missing_path(path.join("binding.go"), |path| {
generate_file(path, BINDING_GO_TEMPLATE, language_name)
})?;
missing_path_else(
path.join("binding_test.go"),
|path| generate_file(path, BINDING_TEST_GO_TEMPLATE, language_name),
|path| {
let binding_test_go =
fs::read_to_string(path).with_context(|| "Failed to read binding_test.go")?;
if binding_test_go.contains("smacker") {
eprintln!("Replacing binding_test.go with new binding API");
generate_file(path, BINDING_TEST_GO_TEMPLATE, language_name)?;
}
Ok(())
},
)?;
// Delete the old go.mod file that lives inside bindings/go, it now lives in the root dir
let go_mod_path = path.join("go.mod");
if go_mod_path.exists() {
fs::remove_file(go_mod_path).with_context(|| "Failed to remove old go.mod file")?;
}
missing_path(repo_path.join("go.mod"), |path| {
generate_file(path, GO_MOD_TEMPLATE, language_name)
})?;
Ok(())
})?;
// Generate Python bindings
missing_path(bindings_dir.join("python"), create_dir)?.apply(|path| {
let lang_path = path.join(format!("tree_sitter_{}", language_name.to_snake_case()));
missing_path(&lang_path, create_dir)?;
missing_path_else(
lang_path.join("binding.c"),
|path| generate_file(path, PY_BINDING_C_TEMPLATE, language_name),
|path| {
let binding_c = fs::read_to_string(path)
.with_context(|| "Failed to read bindings/python/binding.c")?;
if !binding_c.contains("PyCapsule_New") {
eprintln!("Replacing bindings/python/binding.c with new binding API");
generate_file(path, PY_BINDING_C_TEMPLATE, language_name)?;
}
Ok(())
},
)?;
missing_path(lang_path.join("__init__.py"), |path| {
generate_file(path, INIT_PY_TEMPLATE, language_name)
})?;
missing_path(lang_path.join("__init__.pyi"), |path| {
generate_file(path, INIT_PYI_TEMPLATE, language_name)
})?;
missing_path(lang_path.join("py.typed"), |path| {
generate_file(path, "", language_name) // py.typed is empty
})?;
missing_path(path.join("tests"), create_dir)?.apply(|path| {
missing_path(path.join("test_binding.py"), |path| {
generate_file(path, TEST_BINDING_PY_TEMPLATE, language_name)
})?;
Ok(())
})?;
missing_path(repo_path.join("setup.py"), |path| {
generate_file(path, SETUP_PY_TEMPLATE, language_name)
})?;
missing_path(repo_path.join("pyproject.toml"), |path| {
generate_file(path, PYPROJECT_TOML_TEMPLATE, dashed_language_name.as_str())
})?;
Ok(())
})?;
// Generate Swift bindings
missing_path(bindings_dir.join("swift"), create_dir)?.apply(|path| {
let lang_path = path.join(format!("TreeSitter{}", language_name.to_upper_camel_case()));
missing_path(&lang_path, create_dir)?;
missing_path(lang_path.join(format!("{language_name}.h")), |path| {
generate_file(path, PARSER_NAME_H_TEMPLATE, language_name)
})?;
missing_path(
path.join(format!(
"TreeSitter{}Tests",
language_name.to_upper_camel_case()
)),
create_dir,
)?
.apply(|path| {
missing_path(
path.join(format!(
"TreeSitter{}Tests.swift",
language_name.to_upper_camel_case()
)),
|path| generate_file(path, TESTS_SWIFT_TEMPLATE, language_name),
)?;
Ok(())
})?;
missing_path(repo_path.join("Package.swift"), |path| {
generate_file(path, PACKAGE_SWIFT_TEMPLATE, language_name)
})?;
Ok(())
})?;
Ok(())
}
pub fn lookup_package_json_for_path(path: &Path) -> Result<(PathBuf, PackageJSON)> {
let mut pathbuf = path.to_owned();
loop {
let package_json = pathbuf
.exists()
.then(|| -> Result<PackageJSON> {
let file =
File::open(pathbuf.as_path()).with_context(|| "Failed to open package.json")?;
serde_json::from_reader(BufReader::new(file)).context(
"Failed to parse package.json, is the `tree-sitter` section malformed?",
)
})
.transpose()?;
if let Some(package_json) = package_json {
if package_json.tree_sitter.is_some() {
return Ok((pathbuf, package_json));
}
}
pathbuf.pop(); // package.json
if !pathbuf.pop() {
return Err(anyhow!(concat!(
"Failed to locate a package.json file that has a \"tree-sitter\" section,",
" please ensure you have one, and if you don't then consult the docs",
)));
}
pathbuf.push("package.json");
}
}
fn generate_file(path: &Path, template: &str, language_name: &str) -> Result<()> {
write_file(
path,
template
.replace(
CAMEL_PARSER_NAME_PLACEHOLDER,
&language_name.to_upper_camel_case(),
)
.replace(
UPPER_PARSER_NAME_PLACEHOLDER,
&language_name.to_shouty_snake_case(),
)
.replace(
LOWER_PARSER_NAME_PLACEHOLDER,
&language_name.to_snake_case(),
)
.replace(PARSER_NAME_PLACEHOLDER, language_name)
.replace(CLI_VERSION_PLACEHOLDER, CLI_VERSION)
.replace(RUST_BINDING_VERSION_PLACEHOLDER, RUST_BINDING_VERSION),
)
}
fn create_dir(path: &Path) -> Result<()> {
fs::create_dir_all(path)
.with_context(|| format!("Failed to create {:?}", path.to_string_lossy()))
}
#[derive(PartialEq, Eq, Debug)]
enum PathState<P>
where
P: AsRef<Path>,
{
Exists(P),
Missing(P),
}
#[allow(dead_code)]
impl<P> PathState<P>
where
P: AsRef<Path>,
{
fn exists(&self, mut action: impl FnMut(&Path) -> Result<()>) -> Result<&Self> {
if let Self::Exists(path) = self {
action(path.as_ref())?;
}
Ok(self)
}
fn missing(&self, mut action: impl FnMut(&Path) -> Result<()>) -> Result<&Self> {
if let Self::Missing(path) = self {
action(path.as_ref())?;
}
Ok(self)
}
fn apply(&self, mut action: impl FnMut(&Path) -> Result<()>) -> Result<&Self> {
action(self.as_path())?;
Ok(self)
}
fn apply_state(&self, mut action: impl FnMut(&Self) -> Result<()>) -> Result<&Self> {
action(self)?;
Ok(self)
}
fn as_path(&self) -> &Path {
match self {
Self::Exists(path) | Self::Missing(path) => path.as_ref(),
}
}
}
fn missing_path<P, F>(path: P, mut action: F) -> Result<PathState<P>>
where
P: AsRef<Path>,
F: FnMut(&Path) -> Result<()>,
{
let path_ref = path.as_ref();
if !path_ref.exists() {
action(path_ref)?;
Ok(PathState::Missing(path))
} else {
Ok(PathState::Exists(path))
}
}
fn missing_path_else<P, T, F>(path: P, mut action: T, mut else_action: F) -> Result<PathState<P>>
where
P: AsRef<Path>,
T: FnMut(&Path) -> Result<()>,
F: FnMut(&Path) -> Result<()>,
{
let path_ref = path.as_ref();
if !path_ref.exists() {
action(path_ref)?;
Ok(PathState::Missing(path))
} else {
else_action(path_ref)?;
Ok(PathState::Exists(path))
}
}
impl PackageJSON {
fn has_multiple_language_configs(&self) -> bool {
self.tree_sitter.as_ref().is_some_and(|c| c.len() > 1)
}
}

View file

@ -2,7 +2,7 @@ use std::{collections::HashMap, fmt};
use super::{ use super::{
nfa::Nfa, nfa::Nfa,
rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet}, rules::{Alias, Associativity, Precedence, Rule, Symbol},
}; };
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -39,13 +39,6 @@ pub struct InputGrammar {
pub variables_to_inline: Vec<String>, pub variables_to_inline: Vec<String>,
pub supertype_symbols: Vec<String>, pub supertype_symbols: Vec<String>,
pub word_token: Option<String>, pub word_token: Option<String>,
pub reserved_words: Vec<ReservedWordContext<Rule>>,
}
#[derive(Debug, Default, PartialEq, Eq)]
pub struct ReservedWordContext<T> {
pub name: String,
pub reserved_words: Vec<T>,
} }
// Extracted lexical grammar // Extracted lexical grammar
@ -73,20 +66,8 @@ pub struct ProductionStep {
pub associativity: Option<Associativity>, pub associativity: Option<Associativity>,
pub alias: Option<Alias>, pub alias: Option<Alias>,
pub field_name: Option<String>, pub field_name: Option<String>,
pub reserved_word_set_id: ReservedWordSetId,
} }
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReservedWordSetId(pub usize);
impl fmt::Display for ReservedWordSetId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
pub const NO_RESERVED_WORDS: ReservedWordSetId = ReservedWordSetId(usize::MAX);
#[derive(Clone, Debug, Default, PartialEq, Eq)] #[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Production { pub struct Production {
pub steps: Vec<ProductionStep>, pub steps: Vec<ProductionStep>,
@ -123,44 +104,50 @@ pub struct SyntaxGrammar {
pub variables_to_inline: Vec<Symbol>, pub variables_to_inline: Vec<Symbol>,
pub word_token: Option<Symbol>, pub word_token: Option<Symbol>,
pub precedence_orderings: Vec<Vec<PrecedenceEntry>>, pub precedence_orderings: Vec<Vec<PrecedenceEntry>>,
pub reserved_word_sets: Vec<TokenSet>,
} }
#[cfg(test)] #[cfg(test)]
impl ProductionStep { impl ProductionStep {
#[must_use] pub const fn new(symbol: Symbol) -> Self {
pub fn new(symbol: Symbol) -> Self {
Self { Self {
symbol, symbol,
precedence: Precedence::None, precedence: Precedence::None,
associativity: None, associativity: None,
alias: None, alias: None,
field_name: None, field_name: None,
reserved_word_set_id: ReservedWordSetId::default(),
} }
} }
pub fn with_prec( pub fn with_prec(self, precedence: Precedence, associativity: Option<Associativity>) -> Self {
mut self, Self {
precedence: Precedence, symbol: self.symbol,
associativity: Option<Associativity>, precedence,
) -> Self { associativity,
self.precedence = precedence; alias: self.alias,
self.associativity = associativity; field_name: self.field_name,
self }
} }
pub fn with_alias(mut self, value: &str, is_named: bool) -> Self { pub fn with_alias(self, value: &str, is_named: bool) -> Self {
self.alias = Some(Alias { Self {
value: value.to_string(), symbol: self.symbol,
is_named, precedence: self.precedence,
}); associativity: self.associativity,
self alias: Some(Alias {
value: value.to_string(),
is_named,
}),
field_name: self.field_name,
}
} }
pub fn with_field_name(self, name: &str) -> Self {
pub fn with_field_name(mut self, name: &str) -> Self { Self {
self.field_name = Some(name.to_string()); symbol: self.symbol,
self precedence: self.precedence,
associativity: self.associativity,
alias: self.alias,
field_name: Some(name.to_string()),
}
} }
} }
@ -253,7 +240,7 @@ impl InlinedProductionMap {
step_index: u32, step_index: u32,
) -> Option<impl Iterator<Item = &'a Production> + 'a> { ) -> Option<impl Iterator<Item = &'a Production> + 'a> {
self.production_map self.production_map
.get(&(std::ptr::from_ref::<Production>(production), step_index)) .get(&(production as *const Production, step_index))
.map(|production_indices| { .map(|production_indices| {
production_indices production_indices
.iter() .iter()

273 cli/src/generate/mod.rs Normal file
View file

@ -0,0 +1,273 @@
use std::{
env, fs,
io::Write,
path::{Path, PathBuf},
process::{Command, Stdio},
};
use anyhow::{anyhow, Context, Result};
use build_tables::build_tables;
use grammar_files::path_in_ignore;
use grammars::InputGrammar;
use lazy_static::lazy_static;
use parse_grammar::parse_grammar;
use prepare_grammar::prepare_grammar;
use regex::{Regex, RegexBuilder};
use render::render_c_code;
use semver::Version;
mod build_tables;
mod dedup;
mod grammar_files;
mod grammars;
mod nfa;
mod node_types;
pub mod parse_grammar;
mod prepare_grammar;
mod render;
mod rules;
mod tables;
pub use grammar_files::lookup_package_json_for_path;
lazy_static! {
static ref JSON_COMMENT_REGEX: Regex = RegexBuilder::new("^\\s*//.*")
.multi_line(true)
.build()
.unwrap();
}
struct GeneratedParser {
c_code: String,
node_types_json: String,
}
pub const ALLOC_HEADER: &str = include_str!("./templates/alloc.h");
pub fn generate_parser_in_directory(
repo_path: &Path,
grammar_path: Option<&str>,
abi_version: usize,
generate_bindings: bool,
report_symbol_name: Option<&str>,
js_runtime: Option<&str>,
) -> Result<()> {
let mut repo_path = repo_path.to_owned();
let mut grammar_path = grammar_path;
// Populate a new empty grammar directory.
if let Some(path) = grammar_path {
let path = PathBuf::from(path);
if !path
.try_exists()
.with_context(|| "Some error with specified path")?
{
fs::create_dir_all(&path)?;
grammar_path = None;
repo_path = path;
}
}
let grammar_path = grammar_path
.map(PathBuf::from)
.unwrap_or(repo_path.join("grammar.js"));
if repo_path.is_dir() && !grammar_path.exists() && !path_in_ignore(&repo_path) {
if let Some(dir_name) = repo_path
.file_name()
.map(|x| x.to_string_lossy().to_ascii_lowercase())
{
if let Some(language_name) = dir_name
.strip_prefix("tree-sitter-")
.or_else(|| Some(dir_name.as_ref()))
{
grammar_files::generate_grammar_files(&repo_path, language_name, false)?;
}
}
}
// Read the grammar file.
let grammar_json = load_grammar_file(&grammar_path, js_runtime)?;
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
// Ensure that the output directories exist.
fs::create_dir_all(&src_path)?;
fs::create_dir_all(&header_path)?;
if grammar_path.file_name().unwrap() != "grammar.json" {
fs::write(src_path.join("grammar.json"), &grammar_json)
.with_context(|| format!("Failed to write grammar.json to {src_path:?}"))?;
}
// Parse and preprocess the grammar.
let input_grammar = parse_grammar(&grammar_json)?;
// Generate the parser and related files.
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
write_file(&header_path.join("array.h"), tree_sitter::ARRAY_HEADER)?;
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
if !path_in_ignore(&repo_path) && grammar_path == repo_path.join("grammar.js") {
grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
}
Ok(())
}
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
let input_grammar = parse_grammar(&grammar_json)?;
let parser =
generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
Ok((input_grammar.name, parser.c_code))
}
fn generate_parser_for_grammar_with_opts(
input_grammar: &InputGrammar,
abi_version: usize,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(input_grammar)?;
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
);
let tables = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
&inlines,
report_symbol_name,
)?;
let c_code = render_c_code(
&input_grammar.name,
tables,
syntax_grammar,
lexical_grammar,
simple_aliases,
abi_version,
);
Ok(GeneratedParser {
c_code,
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
})
}
pub fn load_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
if grammar_path.is_dir() {
return Err(anyhow!(
"Path to a grammar file with `.js` or `.json` extension is required"
));
}
match grammar_path.extension().and_then(|e| e.to_str()) {
Some("js") => Ok(load_js_grammar_file(grammar_path, js_runtime)
.with_context(|| "Failed to load grammar.js")?),
Some("json") => {
Ok(fs::read_to_string(grammar_path).with_context(|| "Failed to load grammar.json")?)
}
_ => Err(anyhow!("Unknown grammar file extension: {grammar_path:?}",)),
}
}
fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
let grammar_path = fs::canonicalize(grammar_path)?;
#[cfg(windows)]
let grammar_path = url::Url::from_file_path(grammar_path)
.expect("Failed to convert path to URL")
.to_string();
let js_runtime = js_runtime.unwrap_or("node");
let mut js_command = Command::new(js_runtime);
match js_runtime {
"node" => {
js_command.args(["--input-type=module", "-"]);
}
"bun" => {
js_command.arg("-");
}
"deno" => {
js_command.args(["run", "--allow-all", "-"]);
}
_ => {}
}
let mut js_process = js_command
.env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.with_context(|| format!("Failed to run `{js_runtime}`"))?;
let mut js_stdin = js_process
.stdin
.take()
.with_context(|| format!("Failed to open stdin for {js_runtime}"))?;
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
.with_context(|| "Could not parse this package's version as semver.")?;
write!(
js_stdin,
"globalThis.TREE_SITTER_CLI_VERSION_MAJOR = {};
globalThis.TREE_SITTER_CLI_VERSION_MINOR = {};
globalThis.TREE_SITTER_CLI_VERSION_PATCH = {};",
cli_version.major, cli_version.minor, cli_version.patch,
)
.with_context(|| format!("Failed to write tree-sitter version to {js_runtime}'s stdin"))?;
js_stdin
.write(include_bytes!("./dsl.js"))
.with_context(|| format!("Failed to write grammar dsl to {js_runtime}'s stdin"))?;
drop(js_stdin);
let output = js_process
.wait_with_output()
.with_context(|| format!("Failed to read output from {js_runtime}"))?;
match output.status.code() {
None => panic!("{js_runtime} process was killed"),
Some(0) => {
let stdout = String::from_utf8(output.stdout)
.with_context(|| format!("Got invalid UTF8 from {js_runtime}"))?;
let mut grammar_json = &stdout[..];
if let Some(pos) = stdout.rfind('\n') {
// If there's a newline, split the last line from the rest of the output
let node_output = &stdout[..pos];
grammar_json = &stdout[pos + 1..];
let mut stdout = std::io::stdout().lock();
stdout.write_all(node_output.as_bytes())?;
stdout.write_all(b"\n")?;
stdout.flush()?;
}
Ok(serde_json::to_string_pretty(
&serde_json::from_str::<serde_json::Value>(grammar_json)
.with_context(|| "Failed to parse grammar JSON")?,
)
.with_context(|| "Failed to serialize grammar JSON")?
+ "\n")
}
Some(code) => Err(anyhow!("{js_runtime} process exited with status {code}")),
}
}
fn write_file(path: &Path, body: impl AsRef<[u8]>) -> Result<()> {
fs::write(path, body)
.with_context(|| format!("Failed to write {:?}", path.file_name().unwrap()))
}

View file

@ -58,8 +58,7 @@ impl CharacterSet {
/// Create a character set with a given *inclusive* range of characters. /// Create a character set with a given *inclusive* range of characters.
#[allow(clippy::single_range_in_vec_init)] #[allow(clippy::single_range_in_vec_init)]
#[cfg(test)] pub fn from_range(mut first: char, mut last: char) -> Self {
fn from_range(mut first: char, mut last: char) -> Self {
if first > last { if first > last {
swap(&mut first, &mut last); swap(&mut first, &mut last);
} }
@ -287,8 +286,7 @@ impl CharacterSet {
/// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or /// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
/// `other`, but is not present in both sets. /// `other`, but is not present in both sets.
#[cfg(test)] pub fn symmetric_difference(mut self, mut other: Self) -> Self {
fn symmetric_difference(mut self, mut other: Self) -> Self {
self.remove_intersection(&mut other); self.remove_intersection(&mut other);
self.add(&other) self.add(&other)
} }
@ -363,9 +361,9 @@ impl CharacterSet {
}) { }) {
Ok(ix) | Err(ix) => ix, Ok(ix) | Err(ix) => ix,
}; };
self.ranges self.ranges.get(ix).map_or(false, |range| {
.get(ix) range.start <= seek_range.start && range.end >= seek_range.end
.is_some_and(|range| range.start <= seek_range.start && range.end >= seek_range.end) })
} }
pub fn contains(&self, c: char) -> bool { pub fn contains(&self, c: char) -> bool {
@ -428,13 +426,11 @@ impl fmt::Debug for CharacterSet {
} }
impl Nfa { impl Nfa {
#[must_use]
pub const fn new() -> Self { pub const fn new() -> Self {
Self { states: Vec::new() } Self { states: Vec::new() }
} }
pub fn last_state_id(&self) -> u32 { pub fn last_state_id(&self) -> u32 {
assert!(!self.states.is_empty());
self.states.len() as u32 - 1 self.states.len() as u32 - 1
} }
} }
@ -950,19 +946,20 @@ mod tests {
assert_eq!( assert_eq!(
left.remove_intersection(&mut right), left.remove_intersection(&mut right),
row.intersection, row.intersection,
"row {i}a: {:?} && {:?}", "row {}a: {:?} && {:?}",
i,
row.left, row.left,
row.right row.right
); );
assert_eq!( assert_eq!(
left, row.left_only, left, row.left_only,
"row {i}a: {:?} - {:?}", "row {}a: {:?} - {:?}",
row.left, row.right i, row.left, row.right
); );
assert_eq!( assert_eq!(
right, row.right_only, right, row.right_only,
"row {i}a: {:?} - {:?}", "row {}a: {:?} - {:?}",
row.right, row.left i, row.right, row.left
); );
let mut left = row.left.clone(); let mut left = row.left.clone();
@ -970,25 +967,27 @@ mod tests {
assert_eq!( assert_eq!(
right.remove_intersection(&mut left), right.remove_intersection(&mut left),
row.intersection, row.intersection,
"row {i}b: {:?} && {:?}", "row {}b: {:?} && {:?}",
i,
row.left, row.left,
row.right row.right
); );
assert_eq!( assert_eq!(
left, row.left_only, left, row.left_only,
"row {i}b: {:?} - {:?}", "row {}b: {:?} - {:?}",
row.left, row.right i, row.left, row.right
); );
assert_eq!( assert_eq!(
right, row.right_only, right, row.right_only,
"row {i}b: {:?} - {:?}", "row {}b: {:?} - {:?}",
row.right, row.left i, row.right, row.left
); );
assert_eq!( assert_eq!(
row.left.clone().difference(row.right.clone()), row.left.clone().difference(row.right.clone()),
row.left_only, row.left_only,
"row {i}b: {:?} -- {:?}", "row {}b: {:?} -- {:?}",
i,
row.left, row.left,
row.right row.right
); );

View file

@ -1,7 +1,10 @@
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::{
cmp::Ordering,
collections::{BTreeMap, HashMap, HashSet},
};
use anyhow::{anyhow, Result};
use serde::Serialize; use serde::Serialize;
use thiserror::Error;
use super::{ use super::{
grammars::{LexicalGrammar, SyntaxGrammar, VariableType}, grammars::{LexicalGrammar, SyntaxGrammar, VariableType},
@ -29,15 +32,12 @@ pub struct VariableInfo {
} }
#[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)] #[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct NodeInfoJSON { pub struct NodeInfoJSON {
#[serde(rename = "type")] #[serde(rename = "type")]
kind: String, kind: String,
named: bool, named: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")] #[serde(skip_serializing_if = "std::ops::Not::not")]
root: bool, root: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
extra: bool,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
fields: Option<BTreeMap<String, FieldInfoJSON>>, fields: Option<BTreeMap<String, FieldInfoJSON>>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
@ -47,7 +47,6 @@ pub struct NodeInfoJSON {
} }
#[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg(feature = "load")]
pub struct NodeTypeJSON { pub struct NodeTypeJSON {
#[serde(rename = "type")] #[serde(rename = "type")]
kind: String, kind: String,
@ -55,7 +54,6 @@ pub struct NodeTypeJSON {
} }
#[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct FieldInfoJSON { pub struct FieldInfoJSON {
multiple: bool, multiple: bool,
required: bool, required: bool,
@ -69,7 +67,6 @@ pub struct ChildQuantity {
multiple: bool, multiple: bool,
} }
#[cfg(feature = "load")]
impl Default for FieldInfoJSON { impl Default for FieldInfoJSON {
fn default() -> Self { fn default() -> Self {
Self { Self {
@ -105,7 +102,7 @@ impl ChildQuantity {
} }
} }
const fn append(&mut self, other: Self) { fn append(&mut self, other: Self) {
if other.exists { if other.exists {
if self.exists || other.multiple { if self.exists || other.multiple {
self.multiple = true; self.multiple = true;
@ -117,7 +114,7 @@ impl ChildQuantity {
} }
} }
const fn union(&mut self, other: Self) -> bool { fn union(&mut self, other: Self) -> bool {
let mut result = false; let mut result = false;
if !self.exists && other.exists { if !self.exists && other.exists {
result = true; result = true;
@ -135,14 +132,6 @@ impl ChildQuantity {
} }
} }
pub type VariableInfoResult<T> = Result<T, VariableInfoError>;
#[derive(Debug, Error, Serialize)]
pub enum VariableInfoError {
#[error("Grammar error: Supertype symbols must always have a single visible child, but `{0}` can have multiple")]
InvalidSupertype(String),
}
/// Compute a summary of the public-facing structure of each variable in the /// Compute a summary of the public-facing structure of each variable in the
/// grammar. Each variable in the grammar corresponds to a distinct public-facing /// grammar. Each variable in the grammar corresponds to a distinct public-facing
/// node type. /// node type.
@ -168,7 +157,7 @@ pub fn get_variable_info(
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
default_aliases: &AliasMap, default_aliases: &AliasMap,
) -> VariableInfoResult<Vec<VariableInfo>> { ) -> Result<Vec<VariableInfo>> {
let child_type_is_visible = |t: &ChildType| { let child_type_is_visible = |t: &ChildType| {
variable_type_for_child_type(t, syntax_grammar, lexical_grammar) >= VariableType::Anonymous variable_type_for_child_type(t, syntax_grammar, lexical_grammar) >= VariableType::Anonymous
}; };
@ -349,7 +338,13 @@ pub fn get_variable_info(
for supertype_symbol in &syntax_grammar.supertype_symbols { for supertype_symbol in &syntax_grammar.supertype_symbols {
if result[supertype_symbol.index].has_multi_step_production { if result[supertype_symbol.index].has_multi_step_production {
let variable = &syntax_grammar.variables[supertype_symbol.index]; let variable = &syntax_grammar.variables[supertype_symbol.index];
Err(VariableInfoError::InvalidSupertype(variable.name.clone()))?; return Err(anyhow!(
concat!(
"Grammar error: Supertype symbols must always ",
"have a single visible child, but `{}` can have multiple"
),
variable.name
));
} }
} }
@ -374,105 +369,12 @@ pub fn get_variable_info(
Ok(result) Ok(result)
} }
fn get_aliases_by_symbol(
syntax_grammar: &SyntaxGrammar,
default_aliases: &AliasMap,
) -> HashMap<Symbol, BTreeSet<Option<Alias>>> {
let mut aliases_by_symbol = HashMap::new();
for (symbol, alias) in default_aliases {
aliases_by_symbol.insert(*symbol, {
let mut aliases = BTreeSet::new();
aliases.insert(Some(alias.clone()));
aliases
});
}
for extra_symbol in &syntax_grammar.extra_symbols {
if !default_aliases.contains_key(extra_symbol) {
aliases_by_symbol
.entry(*extra_symbol)
.or_insert_with(BTreeSet::new)
.insert(None);
}
}
for variable in &syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
aliases_by_symbol
.entry(step.symbol)
.or_insert_with(BTreeSet::new)
.insert(
step.alias
.as_ref()
.or_else(|| default_aliases.get(&step.symbol))
.cloned(),
);
}
}
}
aliases_by_symbol.insert(
Symbol::non_terminal(0),
std::iter::once(&None).cloned().collect(),
);
aliases_by_symbol
}
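The `entry(...).or_insert_with(...)` chain used above is the usual way to build a multimap incrementally: the per-symbol set is created lazily the first time a symbol is seen. A tiny stand-alone sketch of the same idiom with plain string keys (illustrative names only):

use std::collections::{BTreeSet, HashMap};

fn main() {
    let mut aliases_by_symbol: HashMap<&str, BTreeSet<Option<&str>>> = HashMap::new();

    // Each (symbol, alias) pair observed in a production adds to that symbol's set.
    for (symbol, alias) in [
        ("identifier", None),
        ("identifier", Some("field_name")),
        ("string", None),
    ] {
        aliases_by_symbol
            .entry(symbol)
            .or_insert_with(BTreeSet::new)
            .insert(alias);
    }

    assert_eq!(aliases_by_symbol["identifier"].len(), 2);
    assert_eq!(aliases_by_symbol["string"].len(), 1);
}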
pub fn get_supertype_symbol_map(
syntax_grammar: &SyntaxGrammar,
default_aliases: &AliasMap,
variable_info: &[VariableInfo],
) -> BTreeMap<Symbol, Vec<ChildType>> {
let aliases_by_symbol = get_aliases_by_symbol(syntax_grammar, default_aliases);
let mut supertype_symbol_map = BTreeMap::new();
let mut symbols_by_alias = HashMap::new();
for (symbol, aliases) in &aliases_by_symbol {
for alias in aliases.iter().flatten() {
symbols_by_alias
.entry(alias)
.or_insert_with(Vec::new)
.push(*symbol);
}
}
for (i, info) in variable_info.iter().enumerate() {
let symbol = Symbol::non_terminal(i);
if syntax_grammar.supertype_symbols.contains(&symbol) {
let subtypes = info.children.types.clone();
supertype_symbol_map.insert(symbol, subtypes);
}
}
supertype_symbol_map
}
#[cfg(feature = "load")]
pub type SuperTypeCycleResult<T> = Result<T, SuperTypeCycleError>;
#[derive(Debug, Error, Serialize)]
pub struct SuperTypeCycleError {
items: Vec<String>,
}
impl std::fmt::Display for SuperTypeCycleError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Dependency cycle detected in node types:")?;
for (i, item) in self.items.iter().enumerate() {
write!(f, " {item}")?;
if i < self.items.len() - 1 {
write!(f, ",")?;
}
}
Ok(())
}
}
#[cfg(feature = "load")]
pub fn generate_node_types_json( pub fn generate_node_types_json(
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
default_aliases: &AliasMap, default_aliases: &AliasMap,
variable_info: &[VariableInfo], variable_info: &[VariableInfo],
) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> { ) -> Vec<NodeInfoJSON> {
let mut node_types_json = BTreeMap::new(); let mut node_types_json = BTreeMap::new();
let child_type_to_node_type = |child_type: &ChildType| match child_type { let child_type_to_node_type = |child_type: &ChildType| match child_type {
@ -528,32 +430,41 @@ pub fn generate_node_types_json(
} }
}; };
let aliases_by_symbol = get_aliases_by_symbol(syntax_grammar, default_aliases); let mut aliases_by_symbol = HashMap::new();
for (symbol, alias) in default_aliases {
let empty = BTreeSet::new(); aliases_by_symbol.insert(*symbol, {
let extra_names = syntax_grammar let mut aliases = HashSet::new();
.extra_symbols aliases.insert(Some(alias.clone()));
.iter() aliases
.flat_map(|symbol| { });
}
for extra_symbol in &syntax_grammar.extra_symbols {
if !default_aliases.contains_key(extra_symbol) {
aliases_by_symbol aliases_by_symbol
.get(symbol) .entry(*extra_symbol)
.unwrap_or(&empty) .or_insert_with(HashSet::new)
.iter() .insert(None);
.map(|alias| { }
alias.as_ref().map_or( }
match symbol.kind { for variable in &syntax_grammar.variables {
SymbolType::NonTerminal => &syntax_grammar.variables[symbol.index].name, for production in &variable.productions {
SymbolType::Terminal => &lexical_grammar.variables[symbol.index].name, for step in &production.steps {
SymbolType::External => { aliases_by_symbol
&syntax_grammar.external_tokens[symbol.index].name .entry(step.symbol)
} .or_insert_with(HashSet::new)
_ => unreachable!(), .insert(
}, step.alias
|alias| &alias.value, .as_ref()
) .or_else(|| default_aliases.get(&step.symbol))
}) .cloned(),
}) );
.collect::<HashSet<_>>(); }
}
}
aliases_by_symbol.insert(
Symbol::non_terminal(0),
std::iter::once(&None).cloned().collect(),
);
let mut subtype_map = Vec::new(); let mut subtype_map = Vec::new();
for (i, info) in variable_info.iter().enumerate() { for (i, info) in variable_info.iter().enumerate() {
@ -567,7 +478,6 @@ pub fn generate_node_types_json(
kind: variable.name.clone(), kind: variable.name.clone(),
named: true, named: true,
root: false, root: false,
extra: extra_names.contains(&variable.name),
fields: None, fields: None,
children: None, children: None,
subtypes: None, subtypes: None,
@ -589,7 +499,10 @@ pub fn generate_node_types_json(
} else if !syntax_grammar.variables_to_inline.contains(&symbol) { } else if !syntax_grammar.variables_to_inline.contains(&symbol) {
// If a rule is aliased under multiple names, then its information // If a rule is aliased under multiple names, then its information
// contributes to multiple entries in the final JSON. // contributes to multiple entries in the final JSON.
for alias in aliases_by_symbol.get(&symbol).unwrap_or(&BTreeSet::new()) { for alias in aliases_by_symbol
.get(&Symbol::non_terminal(i))
.unwrap_or(&HashSet::new())
{
let kind; let kind;
let is_named; let is_named;
if let Some(alias) = alias { if let Some(alias) = alias {
@ -611,7 +524,6 @@ pub fn generate_node_types_json(
kind: kind.clone(), kind: kind.clone(),
named: is_named, named: is_named,
root: i == 0, root: i == 0,
extra: extra_names.contains(&kind),
fields: Some(BTreeMap::new()), fields: Some(BTreeMap::new()),
children: None, children: None,
subtypes: None, subtypes: None,
@ -650,40 +562,22 @@ pub fn generate_node_types_json(
} }
} }
// Sort the subtype map topologically so that subtypes are listed before their supertypes. // Sort the subtype map so that subtypes are listed before their supertypes.
let mut sorted_kinds = Vec::with_capacity(subtype_map.len());
let mut top_sort = topological_sort::TopologicalSort::<String>::new();
for (supertype, subtypes) in &subtype_map {
for subtype in subtypes {
top_sort.add_dependency(subtype.kind.clone(), supertype.kind.clone());
}
}
loop {
let mut next_kinds = top_sort.pop_all();
match (next_kinds.is_empty(), top_sort.is_empty()) {
(true, true) => break,
(true, false) => {
let mut items = top_sort.collect::<Vec<String>>();
items.sort();
return Err(SuperTypeCycleError { items });
}
(false, _) => {
next_kinds.sort();
sorted_kinds.extend(next_kinds);
}
}
}
subtype_map.sort_by(|a, b| { subtype_map.sort_by(|a, b| {
let a_idx = sorted_kinds.iter().position(|n| n.eq(&a.0.kind)).unwrap(); if b.1.contains(&a.0) {
let b_idx = sorted_kinds.iter().position(|n| n.eq(&b.0.kind)).unwrap(); Ordering::Less
a_idx.cmp(&b_idx) } else if a.1.contains(&b.0) {
Ordering::Greater
} else {
Ordering::Equal
}
}); });
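The topological ordering above uses the `topological-sort` crate: every subtype is registered as a prerequisite of its supertype, `pop_all` repeatedly yields the kinds with no remaining prerequisites, and an empty pop while items remain signals a dependency cycle. A self-contained sketch of that usage (assuming the `topological_sort` crate; the kind names are made up):

use topological_sort::TopologicalSort;

fn main() {
    let mut top_sort = TopologicalSort::<String>::new();

    // "expression" is a supertype of "identifier" and "call";
    // each subtype must be listed before its supertype.
    top_sort.add_dependency("identifier".to_string(), "expression".to_string());
    top_sort.add_dependency("call".to_string(), "expression".to_string());

    let mut sorted = Vec::new();
    loop {
        let mut next = top_sort.pop_all();
        if next.is_empty() {
            // A non-empty remainder here would mean a cycle between kinds.
            assert!(top_sort.is_empty());
            break;
        }
        next.sort();
        sorted.extend(next);
    }

    assert_eq!(sorted, ["call", "identifier", "expression"]);
}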
for node_type_json in node_types_json.values_mut() { for node_type_json in node_types_json.values_mut() {
if node_type_json if node_type_json
.children .children
.as_ref() .as_ref()
.is_some_and(|c| c.types.is_empty()) .map_or(false, |c| c.types.is_empty())
{ {
node_type_json.children = None; node_type_json.children = None;
} }
@ -700,6 +594,7 @@ pub fn generate_node_types_json(
let mut anonymous_node_types = Vec::new(); let mut anonymous_node_types = Vec::new();
let empty = HashSet::new();
let regular_tokens = lexical_grammar let regular_tokens = lexical_grammar
.variables .variables
.iter() .iter()
@ -744,7 +639,6 @@ pub fn generate_node_types_json(
kind: name.clone(), kind: name.clone(),
named: true, named: true,
root: false, root: false,
extra: extra_names.contains(&name),
fields: None, fields: None,
children: None, children: None,
subtypes: None, subtypes: None,
@ -762,7 +656,6 @@ pub fn generate_node_types_json(
kind: name.clone(), kind: name.clone(),
named: false, named: false,
root: false, root: false,
extra: extra_names.contains(&name),
fields: None, fields: None,
children: None, children: None,
subtypes: None, subtypes: None,
@ -783,15 +676,11 @@ pub fn generate_node_types_json(
a_is_leaf.cmp(&b_is_leaf) a_is_leaf.cmp(&b_is_leaf)
}) })
.then_with(|| a.kind.cmp(&b.kind)) .then_with(|| a.kind.cmp(&b.kind))
.then_with(|| a.named.cmp(&b.named))
.then_with(|| a.root.cmp(&b.root))
.then_with(|| a.extra.cmp(&b.extra))
}); });
result.dedup(); result.dedup();
Ok(result) result
} }
#[cfg(feature = "load")]
fn process_supertypes(info: &mut FieldInfoJSON, subtype_map: &[(NodeTypeJSON, Vec<NodeTypeJSON>)]) { fn process_supertypes(info: &mut FieldInfoJSON, subtype_map: &[(NodeTypeJSON, Vec<NodeTypeJSON>)]) {
for (supertype, subtypes) in subtype_map { for (supertype, subtypes) in subtype_map {
if info.types.contains(supertype) { if info.types.contains(supertype) {
@ -828,20 +717,20 @@ fn extend_sorted<'a, T>(vec: &mut Vec<T>, values: impl IntoIterator<Item = &'a T
where where
T: 'a + Clone + Eq + Ord, T: 'a + Clone + Eq + Ord,
{ {
values.into_iter().fold(false, |acc, value| { values.into_iter().any(|value| {
if let Err(i) = vec.binary_search(value) { if let Err(i) = vec.binary_search(value) {
vec.insert(i, value.clone()); vec.insert(i, value.clone());
true true
} else { } else {
acc false
} }
}) })
} }
#[cfg(all(test, feature = "load"))] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::{
grammars::{ grammars::{
InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable, InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable,
}, },
@ -875,8 +764,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.len(), 3); assert_eq!(node_types.len(), 3);
@ -886,7 +774,6 @@ mod tests {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -925,7 +812,6 @@ mod tests {
kind: ";".to_string(), kind: ";".to_string(),
named: false, named: false,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -937,7 +823,6 @@ mod tests {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -965,9 +850,7 @@ mod tests {
}, },
// This rule is not reachable from the start symbol, but // This rule is not reachable from the start symbol, but
// it is reachable from the 'extra_symbols' so it // it is reachable from the 'extra_symbols' so it
// should be present in the node_types. // should be present in the node_types
// But because it's only a literal, it will get replaced by
// a lexical variable.
Variable { Variable {
name: "v3".to_string(), name: "v3".to_string(),
kind: VariableType::Named, kind: VariableType::Named,
@ -975,8 +858,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.len(), 4); assert_eq!(node_types.len(), 4);
@ -986,7 +868,6 @@ mod tests {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1025,7 +906,6 @@ mod tests {
kind: ";".to_string(), kind: ";".to_string(),
named: false, named: false,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1037,7 +917,6 @@ mod tests {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1049,119 +928,6 @@ mod tests {
kind: "v3".to_string(), kind: "v3".to_string(),
named: true, named: true,
root: false, root: false,
extra: true,
subtypes: None,
children: None,
fields: None
}
);
}
#[test]
fn test_node_types_deeper_extras() {
let node_types = get_node_types(&InputGrammar {
extra_symbols: vec![Rule::named("v3")],
variables: vec![
Variable {
name: "v1".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::field("f1".to_string(), Rule::named("v2")),
Rule::field("f2".to_string(), Rule::string(";")),
]),
},
Variable {
name: "v2".to_string(),
kind: VariableType::Named,
rule: Rule::string("x"),
},
// This rule is not reachable from the start symbol, but
// it is reachable from the 'extra_symbols' so it
// should be present in the node_types.
// Because it is not just a literal, it won't get replaced
// by a lexical variable.
Variable {
name: "v3".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![Rule::string("y"), Rule::repeat(Rule::string("z"))]),
},
],
..Default::default()
})
.unwrap();
assert_eq!(node_types.len(), 6);
assert_eq!(
node_types[0],
NodeInfoJSON {
kind: "v1".to_string(),
named: true,
root: true,
extra: false,
subtypes: None,
children: None,
fields: Some(
vec![
(
"f1".to_string(),
FieldInfoJSON {
multiple: false,
required: true,
types: vec![NodeTypeJSON {
kind: "v2".to_string(),
named: true,
}]
}
),
(
"f2".to_string(),
FieldInfoJSON {
multiple: false,
required: true,
types: vec![NodeTypeJSON {
kind: ";".to_string(),
named: false,
}]
}
),
]
.into_iter()
.collect()
)
}
);
assert_eq!(
node_types[1],
NodeInfoJSON {
kind: "v3".to_string(),
named: true,
root: false,
extra: true,
subtypes: None,
children: None,
fields: Some(BTreeMap::default())
}
);
assert_eq!(
node_types[2],
NodeInfoJSON {
kind: ";".to_string(),
named: false,
root: false,
extra: false,
subtypes: None,
children: None,
fields: None
}
);
assert_eq!(
node_types[3],
NodeInfoJSON {
kind: "v2".to_string(),
named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1200,8 +966,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
@ -1209,7 +974,6 @@ mod tests {
kind: "_v2".to_string(), kind: "_v2".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
fields: None, fields: None,
children: None, children: None,
subtypes: Some(vec![ subtypes: Some(vec![
@ -1234,7 +998,6 @@ mod tests {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1290,8 +1053,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
@ -1299,7 +1061,6 @@ mod tests {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1338,7 +1099,6 @@ mod tests {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: false, multiple: false,
@ -1376,8 +1136,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
@ -1385,7 +1144,6 @@ mod tests {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1451,8 +1209,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.iter().find(|t| t.kind == "foo_identifier"), None); assert_eq!(node_types.iter().find(|t| t.kind == "foo_identifier"), None);
assert_eq!( assert_eq!(
@ -1461,7 +1218,6 @@ mod tests {
kind: "identifier".to_string(), kind: "identifier".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None, fields: None,
@ -1473,7 +1229,6 @@ mod tests {
kind: "type_identifier".to_string(), kind: "type_identifier".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None, fields: None,
@ -1508,8 +1263,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
@ -1517,7 +1271,6 @@ mod tests {
kind: "a".to_string(), kind: "a".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1558,8 +1311,7 @@ mod tests {
]), ]),
}], }],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types, node_types,
@ -1567,7 +1319,6 @@ mod tests {
kind: "script".to_string(), kind: "script".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
fields: Some(BTreeMap::new()), fields: Some(BTreeMap::new()),
children: None, children: None,
subtypes: None subtypes: None
@ -1607,8 +1358,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
&node_types &node_types
@ -1626,7 +1376,6 @@ mod tests {
kind: "a".to_string(), kind: "a".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1683,7 +1432,6 @@ mod tests {
kind: "script".to_string(), kind: "script".to_string(),
named: true, named: true,
root: true, root: true,
extra: false,
subtypes: None, subtypes: None,
// Only one node // Only one node
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
@ -1727,8 +1475,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types.iter().map(|n| &n.kind).collect::<Vec<_>>(), node_types.iter().map(|n| &n.kind).collect::<Vec<_>>(),
@ -1740,7 +1487,6 @@ mod tests {
kind: "b".to_string(), kind: "b".to_string(),
named: true, named: true,
root: false, root: false,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -2055,7 +1801,7 @@ mod tests {
); );
} }
fn get_node_types(grammar: &InputGrammar) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> { fn get_node_types(grammar: &InputGrammar) -> Vec<NodeInfoJSON> {
let (syntax_grammar, lexical_grammar, _, default_aliases) = let (syntax_grammar, lexical_grammar, _, default_aliases) =
prepare_grammar(grammar).unwrap(); prepare_grammar(grammar).unwrap();
let variable_info = let variable_info =

View file

@ -0,0 +1,258 @@
use anyhow::{anyhow, Result};
use serde::Deserialize;
use serde_json::{Map, Value};
use super::{
grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
rules::{Precedence, Rule},
};
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum RuleJSON {
ALIAS {
content: Box<RuleJSON>,
named: bool,
value: String,
},
BLANK,
STRING {
value: String,
},
PATTERN {
value: String,
flags: Option<String>,
},
SYMBOL {
name: String,
},
CHOICE {
members: Vec<RuleJSON>,
},
FIELD {
name: String,
content: Box<RuleJSON>,
},
SEQ {
members: Vec<RuleJSON>,
},
REPEAT {
content: Box<RuleJSON>,
},
REPEAT1 {
content: Box<RuleJSON>,
},
PREC_DYNAMIC {
value: i32,
content: Box<RuleJSON>,
},
PREC_LEFT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC_RIGHT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
TOKEN {
content: Box<RuleJSON>,
},
IMMEDIATE_TOKEN {
content: Box<RuleJSON>,
},
}
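The `#[serde(tag = "type")]` attribute above makes serde pick the variant from the JSON object's "type" field, which is exactly how rules are written in grammar.json. A small stand-alone sketch of that internally-tagged representation (stand-in variants only; assumes `serde` and `serde_json`):

use serde::Deserialize;

// Stand-in for a few RuleJSON variants; the "type" field selects the variant.
#[derive(Deserialize, Debug)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
enum RuleJson {
    STRING { value: String },
    SYMBOL { name: String },
    REPEAT1 { content: Box<RuleJson> },
}

fn main() {
    let json = r#"{ "type": "REPEAT1", "content": { "type": "SYMBOL", "name": "statement" } }"#;
    let rule: RuleJson = serde_json::from_str(json).unwrap();
    match rule {
        RuleJson::REPEAT1 { content } => match *content {
            RuleJson::SYMBOL { name } => assert_eq!(name, "statement"),
            other => panic!("unexpected inner rule: {other:?}"),
        },
        other => panic!("unexpected rule: {other:?}"),
    }
}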
#[derive(Deserialize)]
#[serde(untagged)]
enum PrecedenceValueJSON {
Integer(i32),
Name(String),
}
#[derive(Deserialize)]
pub(crate) struct GrammarJSON {
pub(crate) name: String,
rules: Map<String, Value>,
#[serde(default)]
precedences: Vec<Vec<RuleJSON>>,
#[serde(default)]
conflicts: Vec<Vec<String>>,
#[serde(default)]
externals: Vec<RuleJSON>,
#[serde(default)]
extras: Vec<RuleJSON>,
#[serde(default)]
inline: Vec<String>,
#[serde(default)]
supertypes: Vec<String>,
word: Option<String>,
}
pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
let grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
let mut variables = Vec::with_capacity(grammar_json.rules.len());
for (name, value) in grammar_json.rules {
variables.push(Variable {
name: name.clone(),
kind: VariableType::Named,
rule: parse_rule(serde_json::from_value(value)?),
});
}
let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
for list in grammar_json.precedences {
let mut ordering = Vec::with_capacity(list.len());
for entry in list {
ordering.push(match entry {
RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
_ => {
return Err(anyhow!(
"Invalid rule in precedences array. Only strings and symbols are allowed"
))
}
});
}
precedence_orderings.push(ordering);
}
let extra_symbols = grammar_json
.extras
.into_iter()
.try_fold(Vec::new(), |mut acc, item| {
let rule = parse_rule(item);
if let Rule::String(ref value) = rule {
if value.is_empty() {
return Err(anyhow!(
"Rules in the `extras` array must not contain empty strings"
));
}
}
acc.push(rule);
Ok(acc)
})?;
let external_tokens = grammar_json.externals.into_iter().map(parse_rule).collect();
Ok(InputGrammar {
name: grammar_json.name,
word_token: grammar_json.word,
expected_conflicts: grammar_json.conflicts,
supertype_symbols: grammar_json.supertypes,
variables_to_inline: grammar_json.inline,
precedence_orderings,
variables,
extra_symbols,
external_tokens,
})
}
fn parse_rule(json: RuleJSON) -> Rule {
match json {
RuleJSON::ALIAS {
content,
value,
named,
} => Rule::alias(parse_rule(*content), value, named),
RuleJSON::BLANK => Rule::Blank,
RuleJSON::STRING { value } => Rule::String(value),
RuleJSON::PATTERN { value, flags } => Rule::Pattern(
value,
flags.map_or(String::new(), |f| {
f.matches(|c| {
if c == 'i' {
true
} else {
// silently ignore unicode flags
if c != 'u' && c != 'v' {
eprintln!("Warning: unsupported flag {c}");
}
false
}
})
.collect()
}),
),
RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
RuleJSON::FIELD { content, name } => Rule::field(name, parse_rule(*content)),
RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()),
RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)),
RuleJSON::REPEAT { content } => {
Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank])
}
RuleJSON::PREC { value, content } => Rule::prec(value.into(), parse_rule(*content)),
RuleJSON::PREC_LEFT { value, content } => {
Rule::prec_left(value.into(), parse_rule(*content))
}
RuleJSON::PREC_RIGHT { value, content } => {
Rule::prec_right(value.into(), parse_rule(*content))
}
RuleJSON::PREC_DYNAMIC { value, content } => {
Rule::prec_dynamic(value, parse_rule(*content))
}
RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
}
}
impl From<PrecedenceValueJSON> for Precedence {
fn from(val: PrecedenceValueJSON) -> Self {
match val {
PrecedenceValueJSON::Integer(i) => Self::Integer(i),
PrecedenceValueJSON::Name(i) => Self::Name(i),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_grammar() {
let grammar = parse_grammar(
r#"{
"name": "my_lang",
"rules": {
"file": {
"type": "REPEAT1",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "STRING",
"value": "foo"
}
}
}"#,
)
.unwrap();
assert_eq!(grammar.name, "my_lang");
assert_eq!(
grammar.variables,
vec![
Variable {
name: "file".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
},
Variable {
name: "statement".to_string(),
kind: VariableType::Named,
rule: Rule::String("foo".to_string())
},
]
);
}
}

View file

@ -1,7 +1,7 @@
use std::{collections::HashMap, mem}; use std::{collections::HashMap, mem};
use super::ExtractedSyntaxGrammar; use super::ExtractedSyntaxGrammar;
use crate::{ use crate::generate::{
grammars::{Variable, VariableType}, grammars::{Variable, VariableType},
rules::{Rule, Symbol}, rules::{Rule, Symbol},
}; };

View file

@ -1,57 +1,41 @@
use regex_syntax::{ use std::collections::HashMap;
hir::{Class, Hir, HirKind},
ParserBuilder, use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex_syntax::ast::{
parse, Ast, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem, ClassUnicodeKind,
RepetitionKind, RepetitionRange,
}; };
use serde::Serialize;
use thiserror::Error;
use super::ExtractedLexicalGrammar; use super::ExtractedLexicalGrammar;
use crate::{ use crate::generate::{
grammars::{LexicalGrammar, LexicalVariable}, grammars::{LexicalGrammar, LexicalVariable},
nfa::{CharacterSet, Nfa, NfaState}, nfa::{CharacterSet, Nfa, NfaState},
rules::{Precedence, Rule}, rules::{Precedence, Rule},
}; };
lazy_static! {
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap();
static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap();
}
const UNICODE_CATEGORIES_JSON: &str = include_str!("./unicode-categories.json");
const UNICODE_PROPERTIES_JSON: &str = include_str!("./unicode-properties.json");
const UNICODE_CATEGORY_ALIASES_JSON: &str = include_str!("./unicode-category-aliases.json");
const UNICODE_PROPERTY_ALIASES_JSON: &str = include_str!("./unicode-property-aliases.json");
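The right-hand side above loads its bundled Unicode tables once, on first use, by pairing `lazy_static!` with `serde_json::from_str`. A minimal sketch of the same pattern with an inline JSON literal in place of `include_str!` (assumes the `lazy_static` and `serde_json` crates; the table contents are made up):

use std::collections::HashMap;

use lazy_static::lazy_static;

// In the real code the JSON comes from include_str!("./unicode-categories.json").
const CATEGORIES_JSON: &str = r#"{ "Lu": [65, 66, 67], "Ll": [97, 98, 99] }"#;

lazy_static! {
    static ref CATEGORIES: HashMap<&'static str, Vec<u32>> =
        serde_json::from_str(CATEGORIES_JSON).unwrap();
}

fn main() {
    // The JSON is parsed exactly once, the first time CATEGORIES is touched.
    assert_eq!(CATEGORIES.get("Lu"), Some(&vec![65, 66, 67]));
    assert!(CATEGORIES.get("Zs").is_none());
}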
struct NfaBuilder { struct NfaBuilder {
nfa: Nfa, nfa: Nfa,
is_sep: bool, is_sep: bool,
precedence_stack: Vec<i32>, precedence_stack: Vec<i32>,
} }
pub type ExpandTokensResult<T> = Result<T, ExpandTokensError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandTokensError {
#[error(
"The rule `{0}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error(transparent)]
Processing(ExpandTokensProcessingError),
#[error(transparent)]
ExpandRule(ExpandRuleError),
}
#[derive(Debug, Error, Serialize)]
pub struct ExpandTokensProcessingError {
rule: String,
error: ExpandRuleError,
}
impl std::fmt::Display for ExpandTokensProcessingError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(
f,
"Error processing rule {}: Grammar error: Unexpected rule {:?}",
self.rule, self.error
)?;
Ok(())
}
}
fn get_implicit_precedence(rule: &Rule) -> i32 { fn get_implicit_precedence(rule: &Rule) -> i32 {
match rule { match rule {
Rule::String(_) => 2, Rule::String(_) => 2,
@ -75,7 +59,7 @@ const fn get_completion_precedence(rule: &Rule) -> i32 {
0 0
} }
pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult<LexicalGrammar> { pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder { let mut builder = NfaBuilder {
nfa: Nfa::new(), nfa: Nfa::new(),
is_sep: true, is_sep: true,
@ -89,12 +73,8 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
Rule::repeat(Rule::choice(grammar.separators)) Rule::repeat(Rule::choice(grammar.separators))
}; };
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() { for (i, variable) in grammar.variables.into_iter().enumerate() {
if variable.rule.is_empty() {
Err(ExpandTokensError::EmptyString(variable.name.clone()))?;
}
let is_immediate_token = match &variable.rule { let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token, Rule::Metadata { params, .. } => params.is_main_token,
_ => false, _ => false,
@ -108,19 +88,12 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
let last_state_id = builder.nfa.last_state_id(); let last_state_id = builder.nfa.last_state_id();
builder builder
.expand_rule(&variable.rule, last_state_id) .expand_rule(&variable.rule, last_state_id)
.map_err(|e| { .with_context(|| format!("Error processing rule {}", variable.name))?;
ExpandTokensError::Processing(ExpandTokensProcessingError {
rule: variable.name.clone(),
error: e,
})
})?;
if !is_immediate_token { if !is_immediate_token {
builder.is_sep = true; builder.is_sep = true;
let last_state_id = builder.nfa.last_state_id(); let last_state_id = builder.nfa.last_state_id();
builder builder.expand_rule(&separator_rule, last_state_id)?;
.expand_rule(&separator_rule, last_state_id)
.map_err(ExpandTokensError::ExpandRule)?;
} }
variables.push(LexicalVariable { variables.push(LexicalVariable {
@ -137,64 +110,22 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
}) })
} }
pub type ExpandRuleResult<T> = Result<T, ExpandRuleError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandRuleError {
#[error("Grammar error: Unexpected rule {0:?}")]
UnexpectedRule(Rule),
#[error("{0}")]
Parse(String),
#[error(transparent)]
ExpandRegex(ExpandRegexError),
}
pub type ExpandRegexResult<T> = Result<T, ExpandRegexError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandRegexError {
#[error("{0}")]
Utf8(String),
#[error("Regex error: Assertions are not supported")]
Assertion,
}
impl NfaBuilder { impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> ExpandRuleResult<bool> { fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule { match rule {
Rule::Pattern(s, f) => { Rule::Pattern(s, f) => {
// With unicode enabled, `\w`, `\s` and `\d` expand to character sets that are much let ast = parse::Parser::new().parse(s)?;
// larger than intended, so we replace them with the actual self.expand_regex(&ast, next_state_id, f.contains('i'))
// character sets they should represent. If the full unicode range
// of `\w`, `\s` or `\d` are needed then `\p{L}`, `\p{Z}` and `\p{N}` should be
// used.
let s = s
.replace(r"\w", r"[0-9A-Za-z_]")
.replace(r"\s", r"[\t-\r ]")
.replace(r"\d", r"[0-9]")
.replace(r"\W", r"[^0-9A-Za-z_]")
.replace(r"\S", r"[^\t-\r ]")
.replace(r"\D", r"[^0-9]");
let mut parser = ParserBuilder::new()
.case_insensitive(f.contains('i'))
.unicode(true)
.utf8(false)
.build();
let hir = parser
.parse(&s)
.map_err(|e| ExpandRuleError::Parse(e.to_string()))?;
self.expand_regex(&hir, next_state_id)
.map_err(ExpandRuleError::ExpandRegex)
} }
Rule::String(s) => { Rule::String(s) => {
for c in s.chars().rev() { for c in s.chars().rev() {
self.push_advance(CharacterSet::from_char(c), next_state_id); self.push_advance(CharacterSet::empty().add_char(c), next_state_id);
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
Ok(!s.is_empty()) Ok(!s.is_empty())
} }
Rule::Choice(elements) => { Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::with_capacity(elements.len()); let mut alternative_state_ids = Vec::new();
for element in elements { for element in elements {
if self.expand_rule(element, next_state_id)? { if self.expand_rule(element, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id()); alternative_state_ids.push(self.nfa.last_state_id());
@ -248,98 +179,129 @@ impl NfaBuilder {
result result
} }
Rule::Blank => Ok(false), Rule::Blank => Ok(false),
_ => Err(ExpandRuleError::UnexpectedRule(rule.clone()))?, _ => Err(anyhow!("Grammar error: Unexpected rule {rule:?}")),
} }
} }
fn expand_regex(&mut self, hir: &Hir, mut next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_regex(
match hir.kind() { &mut self,
HirKind::Empty => Ok(false), ast: &Ast,
HirKind::Literal(literal) => { mut next_state_id: u32,
for character in std::str::from_utf8(&literal.0) case_insensitive: bool,
.map_err(|e| ExpandRegexError::Utf8(e.to_string()))? ) -> Result<bool> {
.chars() const fn inverse_char(c: char) -> char {
.rev() match c {
{ 'a'..='z' => (c as u8 - b'a' + b'A') as char,
let char_set = CharacterSet::from_char(character); 'A'..='Z' => (c as u8 - b'A' + b'a') as char,
self.push_advance(char_set, next_state_id); c => c,
next_state_id = self.nfa.last_state_id(); }
} }
fn with_inverse_char(mut chars: CharacterSet) -> CharacterSet {
for char in chars.clone().chars() {
let inverted = inverse_char(char);
if char != inverted {
chars = chars.add_char(inverted);
}
}
chars
}
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(anyhow!("Regex error: Flags are not supported")),
Ast::Literal(literal) => {
let mut char_set = CharacterSet::from_char(literal.c);
if case_insensitive {
let inverted = inverse_char(literal.c);
if literal.c != inverted {
char_set = char_set.add_char(inverted);
}
}
self.push_advance(char_set, next_state_id);
Ok(true) Ok(true)
} }
HirKind::Class(class) => match class { Ast::Dot(_) => {
Class::Unicode(class) => { self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
let mut chars = CharacterSet::default(); Ok(true)
for c in class.ranges() { }
chars = chars.add_range(c.start(), c.end()); Ast::Assertion(_) => Err(anyhow!("Regex error: Assertions are not supported")),
} Ast::ClassUnicode(class) => {
let mut chars = self.expand_unicode_character_class(&class.kind)?;
// For some reason, the long s `ſ` is included if the letter `s` is in a if class.negated {
// pattern, so we remove it. chars = chars.negate();
if chars.range_count() == 3
&& chars
.ranges()
// exact check to ensure that `ſ` wasn't intentionally added.
.all(|r| ['s'..='s', 'S'..='S', 'ſ'..='ſ'].contains(&r))
{
chars = chars.difference(CharacterSet::from_char('ſ'));
}
self.push_advance(chars, next_state_id);
Ok(true)
} }
Class::Bytes(bytes_class) => { if case_insensitive {
let mut chars = CharacterSet::default(); chars = with_inverse_char(chars);
for c in bytes_class.ranges() {
chars = chars.add_range(c.start().into(), c.end().into());
}
self.push_advance(chars, next_state_id);
Ok(true)
} }
}, self.push_advance(chars, next_state_id);
HirKind::Look(_) => Err(ExpandRegexError::Assertion)?, Ok(true)
HirKind::Repetition(repetition) => match (repetition.min, repetition.max) { }
(0, Some(1)) => self.expand_zero_or_one(&repetition.sub, next_state_id), Ast::ClassPerl(class) => {
(1, None) => self.expand_one_or_more(&repetition.sub, next_state_id), let mut chars = self.expand_perl_character_class(&class.kind);
(0, None) => self.expand_zero_or_more(&repetition.sub, next_state_id), if class.negated {
(min, Some(max)) if min == max => { chars = chars.negate();
self.expand_count(&repetition.sub, min, next_state_id)
} }
(min, None) => { if case_insensitive {
if self.expand_zero_or_more(&repetition.sub, next_state_id)? { chars = with_inverse_char(chars);
self.expand_count(&repetition.sub, min, next_state_id) }
self.push_advance(chars, next_state_id);
Ok(true)
}
Ast::ClassBracketed(class) => {
let mut chars = self.translate_class_set(&class.kind)?;
if class.negated {
chars = chars.negate();
}
if case_insensitive {
chars = with_inverse_char(chars);
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)? {
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)
} else { } else {
Ok(false) Ok(false)
} }
} }
(min, Some(max)) => { RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.sub, min, next_state_id)?; let mut result =
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)?;
for _ in min..max { for _ in min..max {
if result { if result {
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
if self.expand_zero_or_one(&repetition.sub, next_state_id)? { if self.expand_zero_or_one(
&repetition.ast,
next_state_id,
case_insensitive,
)? {
result = true; result = true;
} }
} }
Ok(result) Ok(result)
} }
}, },
HirKind::Capture(capture) => self.expand_regex(&capture.sub, next_state_id), Ast::Group(group) => self.expand_regex(&group.ast, next_state_id, case_insensitive),
HirKind::Concat(concat) => { Ast::Alternation(alternation) => {
let mut result = false; let mut alternative_state_ids = Vec::new();
for hir in concat.iter().rev() { for ast in &alternation.asts {
if self.expand_regex(hir, next_state_id)? { if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
HirKind::Alternation(alternations) => {
let mut alternative_state_ids = Vec::with_capacity(alternations.len());
for hir in alternations {
if self.expand_regex(hir, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id()); alternative_state_ids.push(self.nfa.last_state_id());
} else { } else {
alternative_state_ids.push(next_state_id); alternative_state_ids.push(next_state_id);
@ -348,21 +310,58 @@ impl NfaBuilder {
alternative_state_ids.sort_unstable(); alternative_state_ids.sort_unstable();
alternative_state_ids.dedup(); alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids { for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id); self.push_split(alternative_state_id);
} }
Ok(true) Ok(true)
} }
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
} }
} }
fn expand_one_or_more(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
match &class_set {
ClassSet::Item(item) => self.expand_character_class(item),
ClassSet::BinaryOp(binary_op) => {
let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
match binary_op.kind {
ClassSetBinaryOpKind::Intersection => {
Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
}
ClassSetBinaryOpKind::Difference => {
Ok(lhs_char_class.difference(rhs_char_class))
}
ClassSetBinaryOpKind::SymmetricDifference => {
Ok(lhs_char_class.symmetric_difference(rhs_char_class))
}
}
}
}
}
fn expand_one_or_more(
&mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
self.nfa.states.push(NfaState::Accept { self.nfa.states.push(NfaState::Accept {
variable_index: 0, variable_index: 0,
precedence: 0, precedence: 0,
}); // Placeholder for split }); // Placeholder for split
let split_state_id = self.nfa.last_state_id(); let split_state_id = self.nfa.last_state_id();
if self.expand_regex(hir, split_state_id)? { if self.expand_regex(ast, split_state_id, case_insensitive)? {
self.nfa.states[split_state_id as usize] = self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id); NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true) Ok(true)
@ -372,8 +371,13 @@ impl NfaBuilder {
} }
} }
fn expand_zero_or_one(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_zero_or_one(
if self.expand_regex(hir, next_state_id)? { &mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id); self.push_split(next_state_id);
Ok(true) Ok(true)
} else { } else {
@ -381,8 +385,13 @@ impl NfaBuilder {
} }
} }
fn expand_zero_or_more(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_zero_or_more(
if self.expand_one_or_more(hir, next_state_id)? { &mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_one_or_more(ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id); self.push_split(next_state_id);
Ok(true) Ok(true)
} else { } else {
@ -392,13 +401,14 @@ impl NfaBuilder {
fn expand_count( fn expand_count(
&mut self, &mut self,
hir: &Hir, ast: &Ast,
count: u32, count: u32,
mut next_state_id: u32, mut next_state_id: u32,
) -> ExpandRegexResult<bool> { case_insensitive: bool,
) -> Result<bool> {
let mut result = false; let mut result = false;
for _ in 0..count { for _ in 0..count {
if self.expand_regex(hir, next_state_id)? { if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true; result = true;
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
@ -406,6 +416,111 @@ impl NfaBuilder {
Ok(result) Ok(result)
} }
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::empty()),
ClassSetItem::Literal(literal) => Ok(CharacterSet::from_char(literal.c)),
ClassSetItem::Range(range) => Ok(CharacterSet::from_range(range.start.c, range.end.c)),
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
ClassSetItem::Unicode(class) => {
let mut set = self.expand_unicode_character_class(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
ClassSetItem::Bracketed(class) => {
let mut set = self.translate_class_set(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
ClassSetItem::Ascii(_) => Err(anyhow!(
"Regex error: Unsupported character class syntax {item:?}",
)),
}
}
fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
let mut chars = CharacterSet::empty();
let category_letter;
match class {
ClassUnicodeKind::OneLetter(le) => {
category_letter = le.to_string();
}
ClassUnicodeKind::Named(class_name) => {
let actual_class_name = UNICODE_CATEGORY_ALIASES
.get(class_name.as_str())
.or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str()))
.unwrap_or(class_name);
if actual_class_name.len() == 1 {
category_letter = actual_class_name.clone();
} else {
let code_points =
UNICODE_CATEGORIES
.get(actual_class_name.as_str())
.or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str()))
.ok_or_else(|| {
anyhow!(
"Regex error: Unsupported unicode character class {class_name}",
)
})?;
for c in code_points {
if let Some(c) = char::from_u32(*c) {
chars = chars.add_char(c);
}
}
return Ok(chars);
}
}
ClassUnicodeKind::NamedValue { .. } => {
return Err(anyhow!(
"Regex error: Key-value unicode properties are not supported"
))
}
}
for (category, code_points) in UNICODE_CATEGORIES.iter() {
if category.starts_with(&category_letter) {
for c in code_points {
if let Some(c) = char::from_u32(*c) {
chars = chars.add_char(c);
}
}
}
}
Ok(chars)
}
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n')
.add_char('\x0B')
.add_char('\x0C'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
}
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
let precedence = *self.precedence_stack.last().unwrap(); let precedence = *self.precedence_stack.last().unwrap();
self.nfa.states.push(NfaState::Advance { self.nfa.states.push(NfaState::Advance {
@ -427,7 +542,7 @@ impl NfaBuilder {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::{
grammars::Variable, grammars::Variable,
nfa::{NfaCursor, NfaTransition}, nfa::{NfaCursor, NfaTransition},
}; };
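The left-hand side of this file parses patterns into `regex_syntax`'s high-level `Hir` via `ParserBuilder`, while the right-hand side walks the lower-level `Ast`. A small sketch of the `Hir` route, mirroring the builder options used above (assumes the `regex-syntax` crate; the pattern is arbitrary):

use regex_syntax::{hir::HirKind, ParserBuilder};

fn main() {
    // Same builder configuration as in expand_rule: case-insensitive when the
    // pattern carried an `i` flag, Unicode classes on, no UTF-8 guarantee.
    let mut parser = ParserBuilder::new()
        .case_insensitive(true)
        .unicode(true)
        .utf8(false)
        .build();

    let hir = parser.parse("[a-c]+").unwrap();

    // A repetition of a character class; the translator has already applied
    // case folding, so the class covers a-c and A-C.
    match hir.kind() {
        HirKind::Repetition(rep) => println!("repeats: {:?}", rep.sub.kind()),
        other => println!("unexpected HIR: {other:?}"),
    }
}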

View file

@ -1,4 +1,4 @@
use crate::{ use crate::generate::{
grammars::{LexicalGrammar, SyntaxGrammar}, grammars::{LexicalGrammar, SyntaxGrammar},
rules::{Alias, AliasMap, Symbol, SymbolType}, rules::{Alias, AliasMap, Symbol, SymbolType},
}; };
@ -69,7 +69,9 @@ pub(super) fn extract_default_aliases(
SymbolType::External => &mut external_status_list[symbol.index], SymbolType::External => &mut external_status_list[symbol.index],
SymbolType::NonTerminal => &mut non_terminal_status_list[symbol.index], SymbolType::NonTerminal => &mut non_terminal_status_list[symbol.index],
SymbolType::Terminal => &mut terminal_status_list[symbol.index], SymbolType::Terminal => &mut terminal_status_list[symbol.index],
SymbolType::End | SymbolType::EndOfNonTerminalExtra => panic!("Unexpected end token"), SymbolType::End | SymbolType::EndOfNonTerminalExtra => {
panic!("Unexpected end token")
}
}; };
status.appears_unaliased = true; status.appears_unaliased = true;
} }
@ -162,7 +164,7 @@ pub(super) fn extract_default_aliases(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::{
grammars::{LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType}, grammars::{LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType},
nfa::Nfa, nfa::Nfa,
}; };

View file

@ -1,63 +1,16 @@
use std::collections::HashMap; use std::{collections::HashMap, mem};
use serde::Serialize; use anyhow::{anyhow, Result};
use thiserror::Error;
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::{ use crate::generate::{
grammars::{ExternalToken, ReservedWordContext, Variable, VariableType}, grammars::{ExternalToken, Variable, VariableType},
rules::{MetadataParams, Rule, Symbol, SymbolType}, rules::{MetadataParams, Rule, Symbol, SymbolType},
}; };
pub type ExtractTokensResult<T> = Result<T, ExtractTokensError>;
#[derive(Debug, Error, Serialize)]
pub enum ExtractTokensError {
#[error(
"The rule `{0}` contains an empty string.
Tree-sitter does not support syntactic rules that contain an empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error("Rule '{0}' cannot be used as both an external token and a non-terminal rule")]
ExternalTokenNonTerminal(String),
#[error("Non-symbol rules cannot be used as external tokens")]
NonSymbolExternalToken,
#[error(transparent)]
WordToken(NonTerminalWordTokenError),
#[error("Reserved word '{0}' must be a token")]
NonTokenReservedWord(String),
}
#[derive(Debug, Error, Serialize)]
pub struct NonTerminalWordTokenError {
pub symbol_name: String,
pub conflicting_symbol_name: Option<String>,
}
impl std::fmt::Display for NonTerminalWordTokenError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Non-terminal symbol '{}' cannot be used as the word token",
self.symbol_name
)?;
if let Some(conflicting_name) = &self.conflicting_symbol_name {
writeln!(
f,
", because its rule is duplicated in '{conflicting_name}'",
)
} else {
writeln!(f)
}
}
}
pub(super) fn extract_tokens( pub(super) fn extract_tokens(
mut grammar: InternedGrammar, mut grammar: InternedGrammar,
) -> ExtractTokensResult<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { ) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> {
let mut extractor = TokenExtractor { let mut extractor = TokenExtractor {
current_variable_name: String::new(), current_variable_name: String::new(),
current_variable_token_count: 0, current_variable_token_count: 0,
@ -85,7 +38,7 @@ pub(super) fn extract_tokens(
// that pointed to that variable will need to be updated to point to the // that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables // variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented. // will need to have their indices decremented.
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::new();
let mut symbol_replacer = SymbolReplacer { let mut symbol_replacer = SymbolReplacer {
replacements: HashMap::new(), replacements: HashMap::new(),
}; };
@ -152,14 +105,15 @@ pub(super) fn extract_tokens(
} }
} }
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len()); let mut external_tokens = Vec::new();
for external_token in grammar.external_tokens { for external_token in grammar.external_tokens {
let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule);
if let Rule::Symbol(symbol) = rule { if let Rule::Symbol(symbol) = rule {
if symbol.is_non_terminal() { if symbol.is_non_terminal() {
Err(ExtractTokensError::ExternalTokenNonTerminal( return Err(anyhow!(
variables[symbol.index].name.clone(), "Rule '{}' cannot be used as both an external token and a non-terminal rule",
))?; &variables[symbol.index].name,
));
} }
if symbol.is_external() { if symbol.is_external() {
@ -176,59 +130,22 @@ pub(super) fn extract_tokens(
}); });
} }
} else { } else {
Err(ExtractTokensError::NonSymbolExternalToken)?; return Err(anyhow!(
"Non-symbol rules cannot be used as external tokens"
));
} }
} }
let word_token = if let Some(token) = grammar.word_token { let mut word_token = None;
if let Some(token) = grammar.word_token {
let token = symbol_replacer.replace_symbol(token); let token = symbol_replacer.replace_symbol(token);
if token.is_non_terminal() { if token.is_non_terminal() {
let word_token_variable = &variables[token.index]; return Err(anyhow!(
let conflicting_symbol_name = variables "Non-terminal symbol '{}' cannot be used as the word token",
.iter() &variables[token.index].name
.enumerate() ));
.find(|(i, v)| *i != token.index && v.rule == word_token_variable.rule)
.map(|(_, v)| v.name.clone());
Err(ExtractTokensError::WordToken(NonTerminalWordTokenError {
symbol_name: word_token_variable.name.clone(),
conflicting_symbol_name,
}))?;
} }
Some(token) word_token = Some(token);
} else {
None
};
let mut reserved_word_contexts = Vec::with_capacity(grammar.reserved_word_sets.len());
for reserved_word_context in grammar.reserved_word_sets {
let mut reserved_words = Vec::with_capacity(reserved_word_contexts.len());
for reserved_rule in reserved_word_context.reserved_words {
if let Rule::Symbol(symbol) = reserved_rule {
reserved_words.push(symbol_replacer.replace_symbol(symbol));
} else if let Some(index) = lexical_variables
.iter()
.position(|v| v.rule == reserved_rule)
{
reserved_words.push(Symbol::terminal(index));
} else {
let rule = if let Rule::Metadata { rule, .. } = &reserved_rule {
rule.as_ref()
} else {
&reserved_rule
};
let token_name = match rule {
Rule::String(s) => s.clone(),
Rule::Pattern(p, _) => p.clone(),
_ => "unknown".to_string(),
};
Err(ExtractTokensError::NonTokenReservedWord(token_name))?;
}
}
reserved_word_contexts.push(ReservedWordContext {
name: reserved_word_context.name,
reserved_words,
});
} }
Ok(( Ok((
@ -241,7 +158,6 @@ pub(super) fn extract_tokens(
external_tokens, external_tokens,
word_token, word_token,
precedence_orderings: grammar.precedence_orderings, precedence_orderings: grammar.precedence_orderings,
reserved_word_sets: reserved_word_contexts,
}, },
ExtractedLexicalGrammar { ExtractedLexicalGrammar {
variables: lexical_variables, variables: lexical_variables,
@ -267,16 +183,18 @@ impl TokenExtractor {
&mut self, &mut self,
is_first: bool, is_first: bool,
variable: &mut Variable, variable: &mut Variable,
) -> ExtractTokensResult<()> { ) -> Result<()> {
self.current_variable_name.clear(); self.current_variable_name.clear();
self.current_variable_name.push_str(&variable.name); self.current_variable_name.push_str(&variable.name);
self.current_variable_token_count = 0; self.current_variable_token_count = 0;
self.is_first_rule = is_first; self.is_first_rule = is_first;
variable.rule = self.extract_tokens_in_rule(&variable.rule)?; let mut rule = Rule::Blank;
mem::swap(&mut rule, &mut variable.rule);
variable.rule = self.extract_tokens_in_rule(&rule)?;
Ok(()) Ok(())
} }
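The older right-hand version above swaps `variable.rule` out through a `Rule::Blank` placeholder before rebuilding it, so the rule can be moved out of the field and replaced with the rewritten one. A tiny stand-alone sketch of that move-out-and-replace pattern with `std::mem::swap` (stand-in types, not the crate's):

use std::mem;

#[derive(Debug, PartialEq)]
enum Rule {
    Blank,
    Str(String),
    Upper(String),
}

fn rewrite(rule: Rule) -> Rule {
    match rule {
        Rule::Str(s) => Rule::Upper(s.to_uppercase()),
        other => other,
    }
}

fn main() {
    let mut variable_rule = Rule::Str("foo".to_string());

    // Swap a cheap placeholder in so the old rule can be moved out of the
    // field, then store the rewritten rule back.
    let mut rule = Rule::Blank;
    mem::swap(&mut rule, &mut variable_rule);
    variable_rule = rewrite(rule);

    assert_eq!(variable_rule, Rule::Upper("FOO".to_string()));
}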
fn extract_tokens_in_rule(&mut self, input: &Rule) -> ExtractTokensResult<Rule> { fn extract_tokens_in_rule(&mut self, input: &Rule) -> Result<Rule> {
match input { match input {
Rule::String(name) => Ok(self.extract_token(input, Some(name))?.into()), Rule::String(name) => Ok(self.extract_token(input, Some(name))?.into()),
Rule::Pattern(..) => Ok(self.extract_token(input, None)?.into()), Rule::Pattern(..) => Ok(self.extract_token(input, None)?.into()),
@ -285,11 +203,10 @@ impl TokenExtractor {
let mut params = params.clone(); let mut params = params.clone();
params.is_token = false; params.is_token = false;
let string_value = if let Rule::String(value) = rule.as_ref() { let mut string_value = None;
Some(value) if let Rule::String(value) = rule.as_ref() {
} else { string_value = Some(value);
None }
};
let rule_to_extract = if params == MetadataParams::default() { let rule_to_extract = if params == MetadataParams::default() {
rule.as_ref() rule.as_ref()
@ -312,27 +229,19 @@ impl TokenExtractor {
elements elements
.iter() .iter()
.map(|e| self.extract_tokens_in_rule(e)) .map(|e| self.extract_tokens_in_rule(e))
.collect::<ExtractTokensResult<Vec<_>>>()?, .collect::<Result<Vec<_>>>()?,
)), )),
Rule::Choice(elements) => Ok(Rule::Choice( Rule::Choice(elements) => Ok(Rule::Choice(
elements elements
.iter() .iter()
.map(|e| self.extract_tokens_in_rule(e)) .map(|e| self.extract_tokens_in_rule(e))
.collect::<ExtractTokensResult<Vec<_>>>()?, .collect::<Result<Vec<_>>>()?,
)), )),
Rule::Reserved { rule, context_name } => Ok(Rule::Reserved {
rule: Box::new(self.extract_tokens_in_rule(rule)?),
context_name: context_name.clone(),
}),
_ => Ok(input.clone()), _ => Ok(input.clone()),
} }
} }
fn extract_token( fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Result<Symbol> {
&mut self,
rule: &Rule,
string_value: Option<&String>,
) -> ExtractTokensResult<Symbol> {
for (i, variable) in self.extracted_variables.iter_mut().enumerate() { for (i, variable) in self.extracted_variables.iter_mut().enumerate() {
if variable.rule == *rule { if variable.rule == *rule {
self.extracted_usage_counts[i] += 1; self.extracted_usage_counts[i] += 1;
@ -343,9 +252,14 @@ impl TokenExtractor {
let index = self.extracted_variables.len(); let index = self.extracted_variables.len();
let variable = if let Some(string_value) = string_value { let variable = if let Some(string_value) = string_value {
if string_value.is_empty() && !self.is_first_rule { if string_value.is_empty() && !self.is_first_rule {
Err(ExtractTokensError::EmptyString( return Err(anyhow!(
self.current_variable_name.clone(), "The rule `{}` contains an empty string.
))?;
Tree-sitter does not support syntactic rules that contain an empty string
unless they are used only as the grammar's start rule.
",
self.current_variable_name
));
} }
Variable { Variable {
name: string_value.clone(), name: string_value.clone(),
@ -357,7 +271,7 @@ impl TokenExtractor {
Variable { Variable {
name: format!( name: format!(
"{}_token{}", "{}_token{}",
self.current_variable_name, self.current_variable_token_count &self.current_variable_name, self.current_variable_token_count
), ),
kind: VariableType::Auxiliary, kind: VariableType::Auxiliary,
rule: rule.clone(), rule: rule.clone(),
@ -391,10 +305,6 @@ impl SymbolReplacer {
params: params.clone(), params: params.clone(),
rule: Box::new(self.replace_symbols_in_rule(rule)), rule: Box::new(self.replace_symbols_in_rule(rule)),
}, },
Rule::Reserved { rule, context_name } => Rule::Reserved {
rule: Box::new(self.replace_symbols_in_rule(rule)),
context_name: context_name.clone(),
},
_ => rule.clone(), _ => rule.clone(),
} }
} }
@ -590,13 +500,14 @@ mod test {
]); ]);
grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))];
let result = extract_tokens(grammar); match extract_tokens(grammar) {
assert!(result.is_err(), "Expected an error but got no error"); Err(e) => {
let err = result.err().unwrap(); assert_eq!(e.to_string(), "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule");
assert_eq!( }
err.to_string(), _ => {
"Rule 'rule_1' cannot be used as both an external token and a non-terminal rule" panic!("Expected an error but got no error");
); }
}
} }
#[test] #[test]

View file

@ -1,96 +1,47 @@
use std::collections::HashMap; use anyhow::{anyhow, Result};
use serde::Serialize;
use thiserror::Error;
use super::ExtractedSyntaxGrammar; use super::ExtractedSyntaxGrammar;
use crate::{ use crate::generate::{
grammars::{ grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable},
Production, ProductionStep, ReservedWordSetId, SyntaxGrammar, SyntaxVariable, Variable, rules::{Alias, Associativity, Precedence, Rule, Symbol},
},
rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet},
}; };
pub type FlattenGrammarResult<T> = Result<T, FlattenGrammarError>;
#[derive(Debug, Error, Serialize)]
pub enum FlattenGrammarError {
#[error("No such reserved word set: {0}")]
NoReservedWordSet(String),
#[error(
"The rule `{0}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error("Rule `{0}` cannot be inlined because it contains a reference to itself")]
RecursiveInline(String),
}
struct RuleFlattener { struct RuleFlattener {
production: Production, production: Production,
reserved_word_set_ids: HashMap<String, ReservedWordSetId>,
precedence_stack: Vec<Precedence>, precedence_stack: Vec<Precedence>,
associativity_stack: Vec<Associativity>, associativity_stack: Vec<Associativity>,
reserved_word_stack: Vec<ReservedWordSetId>,
alias_stack: Vec<Alias>, alias_stack: Vec<Alias>,
field_name_stack: Vec<String>, field_name_stack: Vec<String>,
} }
impl RuleFlattener { impl RuleFlattener {
const fn new(reserved_word_set_ids: HashMap<String, ReservedWordSetId>) -> Self { const fn new() -> Self {
Self { Self {
production: Production { production: Production {
steps: Vec::new(), steps: Vec::new(),
dynamic_precedence: 0, dynamic_precedence: 0,
}, },
reserved_word_set_ids,
precedence_stack: Vec::new(), precedence_stack: Vec::new(),
associativity_stack: Vec::new(), associativity_stack: Vec::new(),
reserved_word_stack: Vec::new(),
alias_stack: Vec::new(), alias_stack: Vec::new(),
field_name_stack: Vec::new(), field_name_stack: Vec::new(),
} }
} }
fn flatten_variable(&mut self, variable: Variable) -> FlattenGrammarResult<SyntaxVariable> { fn flatten(mut self, rule: Rule) -> Production {
let choices = extract_choices(variable.rule); self.apply(rule, true);
let mut productions = Vec::with_capacity(choices.len()); self.production
for rule in choices {
let production = self.flatten_rule(rule)?;
if !productions.contains(&production) {
productions.push(production);
}
}
Ok(SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
})
} }
fn flatten_rule(&mut self, rule: Rule) -> FlattenGrammarResult<Production> { fn apply(&mut self, rule: Rule, at_end: bool) -> bool {
self.production = Production::default();
self.alias_stack.clear();
self.reserved_word_stack.clear();
self.precedence_stack.clear();
self.associativity_stack.clear();
self.field_name_stack.clear();
self.apply(rule, true)?;
Ok(self.production.clone())
}
fn apply(&mut self, rule: Rule, at_end: bool) -> FlattenGrammarResult<bool> {
match rule { match rule {
Rule::Seq(members) => { Rule::Seq(members) => {
let mut result = false; let mut result = false;
let last_index = members.len() - 1; let last_index = members.len() - 1;
for (i, member) in members.into_iter().enumerate() { for (i, member) in members.into_iter().enumerate() {
result |= self.apply(member, i == last_index && at_end)?; result |= self.apply(member, i == last_index && at_end);
} }
Ok(result) result
} }
Rule::Metadata { rule, params } => { Rule::Metadata { rule, params } => {
let mut has_precedence = false; let mut has_precedence = false;
@ -121,7 +72,7 @@ impl RuleFlattener {
self.production.dynamic_precedence = params.dynamic_precedence; self.production.dynamic_precedence = params.dynamic_precedence;
} }
let did_push = self.apply(*rule, at_end)?; let did_push = self.apply(*rule, at_end);
if has_precedence { if has_precedence {
self.precedence_stack.pop(); self.precedence_stack.pop();
@ -150,20 +101,7 @@ impl RuleFlattener {
self.field_name_stack.pop(); self.field_name_stack.pop();
} }
Ok(did_push) did_push
}
Rule::Reserved { rule, context_name } => {
self.reserved_word_stack.push(
self.reserved_word_set_ids
.get(&context_name)
.copied()
.ok_or_else(|| {
FlattenGrammarError::NoReservedWordSet(context_name.clone())
})?,
);
let did_push = self.apply(*rule, at_end)?;
self.reserved_word_stack.pop();
Ok(did_push)
} }
Rule::Symbol(symbol) => { Rule::Symbol(symbol) => {
self.production.steps.push(ProductionStep { self.production.steps.push(ProductionStep {
@ -174,17 +112,12 @@ impl RuleFlattener {
.cloned() .cloned()
.unwrap_or(Precedence::None), .unwrap_or(Precedence::None),
associativity: self.associativity_stack.last().copied(), associativity: self.associativity_stack.last().copied(),
reserved_word_set_id: self
.reserved_word_stack
.last()
.copied()
.unwrap_or(ReservedWordSetId::default()),
alias: self.alias_stack.last().cloned(), alias: self.alias_stack.last().cloned(),
field_name: self.field_name_stack.last().cloned(), field_name: self.field_name_stack.last().cloned(),
}); });
Ok(true) true
} }
_ => Ok(false), _ => false,
} }
} }
} }
@ -195,7 +128,7 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
let mut result = vec![Rule::Blank]; let mut result = vec![Rule::Blank];
for element in elements { for element in elements {
let extraction = extract_choices(element); let extraction = extract_choices(element);
let mut next_result = Vec::with_capacity(result.len()); let mut next_result = Vec::new();
for entry in result { for entry in result {
for extraction_entry in &extraction { for extraction_entry in &extraction {
next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()])); next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()]));
@ -206,7 +139,7 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
result result
} }
Rule::Choice(elements) => { Rule::Choice(elements) => {
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::new();
for element in elements { for element in elements {
for rule in extract_choices(element) { for rule in extract_choices(element) {
result.push(rule); result.push(rule);
@ -221,18 +154,26 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
params: params.clone(), params: params.clone(),
}) })
.collect(), .collect(),
Rule::Reserved { rule, context_name } => extract_choices(*rule)
.into_iter()
.map(|rule| Rule::Reserved {
rule: Box::new(rule),
context_name: context_name.clone(),
})
.collect(),
_ => vec![rule], _ => vec![rule],
} }
} }
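Aside: `extract_choices` distributes every `Choice` across the surrounding `Seq`, so a rule shaped like `seq(choice(A, B), C)` expands into the two productions `A C` and `B C` before flattening. A minimal, self-contained sketch of that cartesian expansion, using a toy rule type and names rather than the crate's actual `Rule`:

```rust
// Toy model of a rule: either a single symbol or a choice between symbols.
enum ToyRule {
    Symbol(&'static str),
    Choice(Vec<&'static str>),
}

// Expand a sequence of toy rules into every concrete production,
// mirroring how `extract_choices` multiplies choices across a `Seq`.
fn expand(seq: &[ToyRule]) -> Vec<Vec<&'static str>> {
    let mut result = vec![Vec::new()];
    for rule in seq {
        let options: Vec<&'static str> = match rule {
            ToyRule::Symbol(s) => vec![*s],
            ToyRule::Choice(symbols) => symbols.clone(),
        };
        let mut next = Vec::with_capacity(result.len() * options.len());
        for prefix in &result {
            for option in &options {
                let mut production = prefix.clone();
                production.push(*option);
                next.push(production);
            }
        }
        result = next;
    }
    result
}

fn main() {
    // seq(choice(A, B), C) expands to the productions [A, C] and [B, C].
    let rule = [ToyRule::Choice(vec!["A", "B"]), ToyRule::Symbol("C")];
    assert_eq!(expand(&rule), vec![vec!["A", "C"], vec!["B", "C"]]);
}
```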
fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool { fn flatten_variable(variable: Variable) -> SyntaxVariable {
let mut productions = Vec::new();
for rule in extract_choices(variable.rule) {
let production = RuleFlattener::new().flatten(rule);
if !productions.contains(&production) {
productions.push(production);
}
}
SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
}
}
pub fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
for variable in variables { for variable in variables {
for production in &variable.productions { for production in &variable.productions {
for step in &production.steps { for step in &production.steps {
@ -245,48 +186,36 @@ fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
false false
} }
pub(super) fn flatten_grammar( pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
grammar: ExtractedSyntaxGrammar, let mut variables = Vec::new();
) -> FlattenGrammarResult<SyntaxGrammar> { for variable in grammar.variables {
let mut reserved_word_set_ids_by_name = HashMap::new(); variables.push(flatten_variable(variable));
for (ix, set) in grammar.reserved_word_sets.iter().enumerate() {
reserved_word_set_ids_by_name.insert(set.name.clone(), ReservedWordSetId(ix));
} }
let mut flattener = RuleFlattener::new(reserved_word_set_ids_by_name);
let variables = grammar
.variables
.into_iter()
.map(|variable| flattener.flatten_variable(variable))
.collect::<FlattenGrammarResult<Vec<_>>>()?;
for (i, variable) in variables.iter().enumerate() { for (i, variable) in variables.iter().enumerate() {
let symbol = Symbol::non_terminal(i); let symbol = Symbol::non_terminal(i);
let used = symbol_is_used(&variables, symbol);
for production in &variable.productions { for production in &variable.productions {
if used && production.steps.is_empty() { if production.steps.is_empty() && symbol_is_used(&variables, symbol) {
Err(FlattenGrammarError::EmptyString(variable.name.clone()))?; return Err(anyhow!(
"The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
",
variable.name
));
} }
if grammar.variables_to_inline.contains(&symbol) if grammar.variables_to_inline.contains(&symbol)
&& production.steps.iter().any(|step| step.symbol == symbol) && production.steps.iter().any(|step| step.symbol == symbol)
{ {
Err(FlattenGrammarError::RecursiveInline(variable.name.clone()))?; return Err(anyhow!(
"Rule `{}` cannot be inlined because it contains a reference to itself.",
variable.name,
));
} }
} }
} }
let mut reserved_word_sets = grammar
.reserved_word_sets
.into_iter()
.map(|set| set.reserved_words.into_iter().collect())
.collect::<Vec<_>>();
// If no default reserved word set is specified, there are no reserved words.
if reserved_word_sets.is_empty() {
reserved_word_sets.push(TokenSet::default());
}
Ok(SyntaxGrammar { Ok(SyntaxGrammar {
extra_symbols: grammar.extra_symbols, extra_symbols: grammar.extra_symbols,
expected_conflicts: grammar.expected_conflicts, expected_conflicts: grammar.expected_conflicts,
@ -295,7 +224,6 @@ pub(super) fn flatten_grammar(
external_tokens: grammar.external_tokens, external_tokens: grammar.external_tokens,
supertype_symbols: grammar.supertype_symbols, supertype_symbols: grammar.supertype_symbols,
word_token: grammar.word_token, word_token: grammar.word_token,
reserved_word_sets,
variables, variables,
}) })
} }
@ -303,35 +231,32 @@ pub(super) fn flatten_grammar(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::grammars::VariableType; use crate::generate::grammars::VariableType;
#[test] #[test]
fn test_flatten_grammar() { fn test_flatten_grammar() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::non_terminal(1),
rule: Rule::seq(vec![ Rule::prec_left(
Rule::non_terminal(1), Precedence::Integer(101),
Rule::prec_left( Rule::seq(vec![
Precedence::Integer(101), Rule::non_terminal(2),
Rule::seq(vec![ Rule::choice(vec![
Rule::non_terminal(2), Rule::prec_right(
Rule::choice(vec![ Precedence::Integer(102),
Rule::prec_right( Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
Precedence::Integer(102), ),
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), Rule::non_terminal(5),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]), ]),
), Rule::non_terminal(6),
Rule::non_terminal(7), ]),
]), ),
}) Rule::non_terminal(7),
.unwrap(); ]),
});
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -368,31 +293,28 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_maximum_dynamic_precedence() { fn test_flatten_grammar_with_maximum_dynamic_precedence() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::non_terminal(1),
rule: Rule::seq(vec![ Rule::prec_dynamic(
Rule::non_terminal(1), 101,
Rule::prec_dynamic( Rule::seq(vec![
101, Rule::non_terminal(2),
Rule::seq(vec![ Rule::choice(vec![
Rule::non_terminal(2), Rule::prec_dynamic(
Rule::choice(vec![ 102,
Rule::prec_dynamic( Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
102, ),
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), Rule::non_terminal(5),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]), ]),
), Rule::non_terminal(6),
Rule::non_terminal(7), ]),
]), ),
}) Rule::non_terminal(7),
.unwrap(); ]),
});
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -424,17 +346,14 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_final_precedence() { fn test_flatten_grammar_with_final_precedence() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::prec_left(
kind: VariableType::Named, Precedence::Integer(101),
rule: Rule::prec_left( Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
Precedence::Integer(101), ),
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), });
),
})
.unwrap();
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -449,16 +368,14 @@ mod tests {
}] }]
); );
let result = flattener let result = flatten_variable(Variable {
.flatten_variable(Variable { name: "test".to_string(),
name: "test".to_string(), kind: VariableType::Named,
kind: VariableType::Named, rule: Rule::prec_left(
rule: Rule::prec_left( Precedence::Integer(101),
Precedence::Integer(101), Rule::seq(vec![Rule::non_terminal(1)]),
Rule::seq(vec![Rule::non_terminal(1)]), ),
), });
})
.unwrap();
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -472,21 +389,18 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_field_names() { fn test_flatten_grammar_with_field_names() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::field("first-thing".to_string(), Rule::terminal(1)),
rule: Rule::seq(vec![ Rule::terminal(2),
Rule::field("first-thing".to_string(), Rule::terminal(1)), Rule::choice(vec![
Rule::terminal(2), Rule::Blank,
Rule::choice(vec![ Rule::field("second-thing".to_string(), Rule::terminal(3)),
Rule::Blank,
Rule::field("second-thing".to_string(), Rule::terminal(3)),
]),
]), ]),
}) ]),
.unwrap(); });
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -520,7 +434,6 @@ mod tests {
external_tokens: Vec::new(), external_tokens: Vec::new(),
supertype_symbols: Vec::new(), supertype_symbols: Vec::new(),
word_token: None, word_token: None,
reserved_word_sets: Vec::new(),
variables: vec![Variable { variables: vec![Variable {
name: "test".to_string(), name: "test".to_string(),
kind: VariableType::Named, kind: VariableType::Named,
@ -534,7 +447,7 @@ mod tests {
assert_eq!( assert_eq!(
result.unwrap_err().to_string(), result.unwrap_err().to_string(),
"Rule `test` cannot be inlined because it contains a reference to itself", "Rule `test` cannot be inlined because it contains a reference to itself.",
); );
} }
} }

View file

@ -1,34 +1,16 @@
use log::warn; use anyhow::{anyhow, Result};
use serde::Serialize;
use thiserror::Error;
use super::InternedGrammar; use super::InternedGrammar;
use crate::{ use crate::generate::{
grammars::{InputGrammar, ReservedWordContext, Variable, VariableType}, grammars::{InputGrammar, Variable, VariableType},
rules::{Rule, Symbol}, rules::{Rule, Symbol},
}; };
pub type InternSymbolsResult<T> = Result<T, InternSymbolsError>; pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar> {
#[derive(Debug, Error, Serialize)]
pub enum InternSymbolsError {
#[error("A grammar's start rule must be visible.")]
HiddenStartRule,
#[error("Undefined symbol `{0}`")]
Undefined(String),
#[error("Undefined symbol `{0}` in grammar's supertypes array")]
UndefinedSupertype(String),
#[error("Undefined symbol `{0}` in grammar's conflicts array")]
UndefinedConflict(String),
#[error("Undefined symbol `{0}` as grammar's word token")]
UndefinedWordToken(String),
}
pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<InternedGrammar> {
let interner = Interner { grammar }; let interner = Interner { grammar };
if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden {
Err(InternSymbolsError::HiddenStartRule)?; return Err(anyhow!("A grammar's start rule must be visible."));
} }
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::with_capacity(grammar.variables.len());
@ -58,31 +40,21 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len()); let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
for supertype_symbol_name in &grammar.supertype_symbols { for supertype_symbol_name in &grammar.supertype_symbols {
supertype_symbols.push(interner.intern_name(supertype_symbol_name).ok_or_else(|| { supertype_symbols.push(
InternSymbolsError::UndefinedSupertype(supertype_symbol_name.clone()) interner
})?); .intern_name(supertype_symbol_name)
.ok_or_else(|| anyhow!("Undefined symbol `{supertype_symbol_name}`"))?,
);
} }
let mut reserved_words = Vec::with_capacity(grammar.reserved_words.len()); let mut expected_conflicts = Vec::new();
for reserved_word_set in &grammar.reserved_words {
let mut interned_set = Vec::with_capacity(reserved_word_set.reserved_words.len());
for rule in &reserved_word_set.reserved_words {
interned_set.push(interner.intern_rule(rule, None)?);
}
reserved_words.push(ReservedWordContext {
name: reserved_word_set.name.clone(),
reserved_words: interned_set,
});
}
let mut expected_conflicts = Vec::with_capacity(grammar.expected_conflicts.len());
for conflict in &grammar.expected_conflicts { for conflict in &grammar.expected_conflicts {
let mut interned_conflict = Vec::with_capacity(conflict.len()); let mut interned_conflict = Vec::with_capacity(conflict.len());
for name in conflict { for name in conflict {
interned_conflict.push( interned_conflict.push(
interner interner
.intern_name(name) .intern_name(name)
.ok_or_else(|| InternSymbolsError::UndefinedConflict(name.clone()))?, .ok_or_else(|| anyhow!("Undefined symbol `{name}`"))?,
); );
} }
expected_conflicts.push(interned_conflict); expected_conflicts.push(interned_conflict);
@ -95,15 +67,14 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
} }
} }
let word_token = if let Some(name) = grammar.word_token.as_ref() { let mut word_token = None;
Some( if let Some(name) = grammar.word_token.as_ref() {
word_token = Some(
interner interner
.intern_name(name) .intern_name(name)
.ok_or_else(|| InternSymbolsError::UndefinedWordToken(name.clone()))?, .ok_or_else(|| anyhow!("Undefined symbol `{name}`"))?,
) );
} else { }
None
};
for (i, variable) in variables.iter_mut().enumerate() { for (i, variable) in variables.iter_mut().enumerate() {
if supertype_symbols.contains(&Symbol::non_terminal(i)) { if supertype_symbols.contains(&Symbol::non_terminal(i)) {
@ -120,7 +91,6 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
supertype_symbols, supertype_symbols,
word_token, word_token,
precedence_orderings: grammar.precedence_orderings.clone(), precedence_orderings: grammar.precedence_orderings.clone(),
reserved_word_sets: reserved_words,
}) })
} }
@ -128,11 +98,11 @@ struct Interner<'a> {
grammar: &'a InputGrammar, grammar: &'a InputGrammar,
} }
impl Interner<'_> { impl<'a> Interner<'a> {
fn intern_rule(&self, rule: &Rule, name: Option<&str>) -> InternSymbolsResult<Rule> { fn intern_rule(&self, rule: &Rule, name: Option<&str>) -> Result<Rule> {
match rule { match rule {
Rule::Choice(elements) => { Rule::Choice(elements) => {
self.check_single(elements, name, "choice"); self.check_single(elements, name);
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::with_capacity(elements.len());
for element in elements { for element in elements {
result.push(self.intern_rule(element, name)?); result.push(self.intern_rule(element, name)?);
@ -140,7 +110,7 @@ impl Interner<'_> {
Ok(Rule::Choice(result)) Ok(Rule::Choice(result))
} }
Rule::Seq(elements) => { Rule::Seq(elements) => {
self.check_single(elements, name, "seq"); self.check_single(elements, name);
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::with_capacity(elements.len());
for element in elements { for element in elements {
result.push(self.intern_rule(element, name)?); result.push(self.intern_rule(element, name)?);
@ -152,12 +122,8 @@ impl Interner<'_> {
rule: Box::new(self.intern_rule(rule, name)?), rule: Box::new(self.intern_rule(rule, name)?),
params: params.clone(), params: params.clone(),
}), }),
Rule::Reserved { rule, context_name } => Ok(Rule::Reserved {
rule: Box::new(self.intern_rule(rule, name)?),
context_name: context_name.clone(),
}),
Rule::NamedSymbol(name) => self.intern_name(name).map_or_else( Rule::NamedSymbol(name) => self.intern_name(name).map_or_else(
|| Err(InternSymbolsError::Undefined(name.clone())), || Err(anyhow!("Undefined symbol `{name}`")),
|symbol| Ok(Rule::Symbol(symbol)), |symbol| Ok(Rule::Symbol(symbol)),
), ),
_ => Ok(rule.clone()), _ => Ok(rule.clone()),
@ -184,10 +150,10 @@ impl Interner<'_> {
// In the case of a seq or choice rule of 1 element in a hidden rule, weird // In the case of a seq or choice rule of 1 element in a hidden rule, weird
// inconsistent behavior with queries can occur. So we should warn the user about it. // inconsistent behavior with queries can occur. So we should warn the user about it.
fn check_single(&self, elements: &[Rule], name: Option<&str>, kind: &str) { fn check_single(&self, elements: &[Rule], name: Option<&str>) {
if elements.len() == 1 && matches!(elements[0], Rule::String(_) | Rule::Pattern(_, _)) { if elements.len() == 1 && matches!(elements[0], Rule::String(_) | Rule::Pattern(_, _)) {
warn!( eprintln!(
"rule {} contains a `{kind}` rule with a single element. This is unnecessary.", "Warning: rule {} is just a `seq` or `choice` rule with a single element. This is unnecessary.",
name.unwrap_or_default() name.unwrap_or_default()
); );
} }
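Aside: the single-element case that `check_single` warns about is a wrapper such as `choice("x")` or `seq(/x/)` around one bare string or pattern inside a hidden rule. A small sketch of the same length-and-variant check, using a stand-in enum rather than the crate's `Rule`:

```rust
// Minimal stand-ins for the `Rule` variants involved in the warning.
#[allow(dead_code)]
enum ToyRule {
    String(String),
    Pattern(String, String),
    Symbol(&'static str),
}

// Mirrors the condition in `check_single`: warn only when a `seq` or `choice`
// wraps exactly one bare string or pattern.
fn is_redundant_wrapper(elements: &[ToyRule]) -> bool {
    elements.len() == 1 && matches!(elements[0], ToyRule::String(_) | ToyRule::Pattern(_, _))
}

fn main() {
    let single = vec![ToyRule::String("if".to_string())];
    let pair = vec![ToyRule::String("if".to_string()), ToyRule::Symbol("expr")];
    assert!(is_redundant_wrapper(&single)); // choice("if") -> warned about
    assert!(!is_redundant_wrapper(&pair)); // choice("if", $.expr) -> fine
}
```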
@ -278,9 +244,10 @@ mod tests {
fn test_grammar_with_undefined_symbols() { fn test_grammar_with_undefined_symbols() {
let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))]));
assert!(result.is_err(), "Expected an error but got none"); match result {
let e = result.err().unwrap(); Err(e) => assert_eq!(e.to_string(), "Undefined symbol `y`"),
assert_eq!(e.to_string(), "Undefined symbol `y`"); _ => panic!("Expected an error but got none"),
}
} }
fn build_grammar(variables: Vec<Variable>) -> InputGrammar { fn build_grammar(variables: Vec<Variable>) -> InputGrammar {

View file

@ -8,18 +8,12 @@ mod process_inlines;
use std::{ use std::{
cmp::Ordering, cmp::Ordering,
collections::{hash_map, BTreeSet, HashMap, HashSet}, collections::{hash_map, HashMap, HashSet},
mem, mem,
}; };
pub use expand_tokens::ExpandTokensError; use anyhow::{anyhow, Result};
pub use extract_tokens::ExtractTokensError; pub(super) use flatten_grammar::symbol_is_used;
pub use flatten_grammar::FlattenGrammarError;
use indexmap::IndexMap;
pub use intern_symbols::InternSymbolsError;
pub use process_inlines::ProcessInlinesError;
use serde::Serialize;
use thiserror::Error;
pub use self::expand_tokens::expand_tokens; pub use self::expand_tokens::expand_tokens;
use self::{ use self::{
@ -34,7 +28,6 @@ use super::{
}, },
rules::{AliasMap, Precedence, Rule, Symbol}, rules::{AliasMap, Precedence, Rule, Symbol},
}; };
use crate::grammars::ReservedWordContext;
pub struct IntermediateGrammar<T, U> { pub struct IntermediateGrammar<T, U> {
variables: Vec<Variable>, variables: Vec<Variable>,
@ -45,7 +38,6 @@ pub struct IntermediateGrammar<T, U> {
variables_to_inline: Vec<Symbol>, variables_to_inline: Vec<Symbol>,
supertype_symbols: Vec<Symbol>, supertype_symbols: Vec<Symbol>,
word_token: Option<Symbol>, word_token: Option<Symbol>,
reserved_word_sets: Vec<ReservedWordContext<T>>,
} }
pub type InternedGrammar = IntermediateGrammar<Rule, Variable>; pub type InternedGrammar = IntermediateGrammar<Rule, Variable>;
@ -69,96 +61,21 @@ impl<T, U> Default for IntermediateGrammar<T, U> {
variables_to_inline: Vec::default(), variables_to_inline: Vec::default(),
supertype_symbols: Vec::default(), supertype_symbols: Vec::default(),
word_token: Option::default(), word_token: Option::default(),
reserved_word_sets: Vec::default(),
} }
} }
} }
pub type PrepareGrammarResult<T> = Result<T, PrepareGrammarError>;
#[derive(Debug, Error, Serialize)]
#[error(transparent)]
pub enum PrepareGrammarError {
ValidatePrecedences(#[from] ValidatePrecedenceError),
ValidateIndirectRecursion(#[from] IndirectRecursionError),
InternSymbols(#[from] InternSymbolsError),
ExtractTokens(#[from] ExtractTokensError),
FlattenGrammar(#[from] FlattenGrammarError),
ExpandTokens(#[from] ExpandTokensError),
ProcessInlines(#[from] ProcessInlinesError),
}
pub type ValidatePrecedenceResult<T> = Result<T, ValidatePrecedenceError>;
#[derive(Debug, Error, Serialize)]
#[error(transparent)]
pub enum ValidatePrecedenceError {
Undeclared(#[from] UndeclaredPrecedenceError),
Ordering(#[from] ConflictingPrecedenceOrderingError),
}
#[derive(Debug, Error, Serialize)]
pub struct IndirectRecursionError(pub Vec<String>);
impl std::fmt::Display for IndirectRecursionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Grammar contains an indirectly recursive rule: ")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, " -> ")?;
}
write!(f, "{symbol}")?;
}
Ok(())
}
}
#[derive(Debug, Error, Serialize)]
pub struct UndeclaredPrecedenceError {
pub precedence: String,
pub rule: String,
}
impl std::fmt::Display for UndeclaredPrecedenceError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Undeclared precedence '{}' in rule '{}'",
self.precedence, self.rule
)?;
Ok(())
}
}
#[derive(Debug, Error, Serialize)]
pub struct ConflictingPrecedenceOrderingError {
pub precedence_1: String,
pub precedence_2: String,
}
impl std::fmt::Display for ConflictingPrecedenceOrderingError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Conflicting orderings for precedences {} and {}",
self.precedence_1, self.precedence_2
)?;
Ok(())
}
}
/// Transform an input grammar into separate components that are ready /// Transform an input grammar into separate components that are ready
/// for parse table construction. /// for parse table construction.
pub fn prepare_grammar( pub fn prepare_grammar(
input_grammar: &InputGrammar, input_grammar: &InputGrammar,
) -> PrepareGrammarResult<( ) -> Result<(
SyntaxGrammar, SyntaxGrammar,
LexicalGrammar, LexicalGrammar,
InlinedProductionMap, InlinedProductionMap,
AliasMap, AliasMap,
)> { )> {
validate_precedences(input_grammar)?; validate_precedences(input_grammar)?;
validate_indirect_recursion(input_grammar)?;
let interned_grammar = intern_symbols(input_grammar)?; let interned_grammar = intern_symbols(input_grammar)?;
let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
@ -170,94 +87,13 @@ pub fn prepare_grammar(
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases)) Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
} }
/// Check for indirect recursion cycles in the grammar that can cause infinite loops while
/// parsing. An indirect recursion cycle occurs when a non-terminal can derive itself through
/// a chain of single-symbol productions (e.g., A -> B, B -> A).
fn validate_indirect_recursion(grammar: &InputGrammar) -> Result<(), IndirectRecursionError> {
let mut epsilon_transitions: IndexMap<&str, BTreeSet<String>> = IndexMap::new();
for variable in &grammar.variables {
let productions = get_single_symbol_productions(&variable.rule);
// Filter out rules that *directly* reference themselves, as this doesn't
// cause a parsing loop.
let filtered: BTreeSet<String> = productions
.into_iter()
.filter(|s| s != &variable.name)
.collect();
epsilon_transitions.insert(variable.name.as_str(), filtered);
}
for start_symbol in epsilon_transitions.keys() {
let mut visited = BTreeSet::new();
let mut path = Vec::new();
if let Some((start_idx, end_idx)) =
get_cycle(start_symbol, &epsilon_transitions, &mut visited, &mut path)
{
let cycle_symbols = path[start_idx..=end_idx]
.iter()
.map(|s| (*s).to_string())
.collect();
return Err(IndirectRecursionError(cycle_symbols));
}
}
Ok(())
}
fn get_single_symbol_productions(rule: &Rule) -> BTreeSet<String> {
match rule {
Rule::NamedSymbol(name) => BTreeSet::from([name.clone()]),
Rule::Choice(choices) => choices
.iter()
.flat_map(get_single_symbol_productions)
.collect(),
Rule::Metadata { rule, .. } => get_single_symbol_productions(rule),
_ => BTreeSet::new(),
}
}
/// Perform a depth-first search to detect cycles in single state transitions.
fn get_cycle<'a>(
current: &'a str,
transitions: &'a IndexMap<&'a str, BTreeSet<String>>,
visited: &mut BTreeSet<&'a str>,
path: &mut Vec<&'a str>,
) -> Option<(usize, usize)> {
if let Some(first_idx) = path.iter().position(|s| *s == current) {
path.push(current);
return Some((first_idx, path.len() - 1));
}
if visited.contains(current) {
return None;
}
path.push(current);
visited.insert(current);
if let Some(next_symbols) = transitions.get(current) {
for next in next_symbols {
if let Some(cycle) = get_cycle(next, transitions, visited, path) {
return Some(cycle);
}
}
}
path.pop();
None
}
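Aside: `get_cycle` is a plain depth-first search over the single-symbol transition map. A self-contained sketch of the same idea on a toy graph, slightly simplified to return the cycle's symbols directly; the map type and symbol names here are illustrative (the real code walks an `IndexMap` of `BTreeSet`s):

```rust
use std::collections::{HashMap, HashSet};

// Depth-first search that reports the first cycle found as the slice of the
// current path that forms it, mirroring `get_cycle` above minus the index
// bookkeeping.
fn find_cycle<'a>(
    current: &'a str,
    transitions: &'a HashMap<&'a str, Vec<&'a str>>,
    visited: &mut HashSet<&'a str>,
    path: &mut Vec<&'a str>,
) -> Option<Vec<&'a str>> {
    if let Some(first) = path.iter().position(|s| *s == current) {
        return Some(path[first..].to_vec());
    }
    if !visited.insert(current) {
        return None;
    }
    path.push(current);
    for &next in transitions.get(current).into_iter().flatten() {
        if let Some(cycle) = find_cycle(next, transitions, visited, path) {
            return Some(cycle);
        }
    }
    path.pop();
    None
}

fn main() {
    // a -> b, b -> c, c -> a is an indirect recursion cycle.
    let transitions = HashMap::from([("a", vec!["b"]), ("b", vec!["c"]), ("c", vec!["a"])]);
    let cycle = find_cycle("a", &transitions, &mut HashSet::new(), &mut Vec::new());
    assert_eq!(cycle, Some(vec!["a", "b", "c"]));
}
```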
/// Check that all of the named precedences used in the grammar are declared /// Check that all of the named precedences used in the grammar are declared
/// within the `precedences` lists, and also that there are no conflicting /// within the `precedences` lists, and also that there are no conflicting
/// precedence orderings declared in those lists. /// precedence orderings declared in those lists.
fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()> { fn validate_precedences(grammar: &InputGrammar) -> Result<()> {
// Check that no rule contains a named precedence that is not present in // Check that no rule contains a named precedence that is not present in
// any of the `precedences` lists. // any of the `precedences` lists.
fn validate( fn validate(rule_name: &str, rule: &Rule, names: &HashSet<&String>) -> Result<()> {
rule_name: &str,
rule: &Rule,
names: &HashSet<&String>,
) -> ValidatePrecedenceResult<()> {
match rule { match rule {
Rule::Repeat(rule) => validate(rule_name, rule, names), Rule::Repeat(rule) => validate(rule_name, rule, names),
Rule::Seq(elements) | Rule::Choice(elements) => elements Rule::Seq(elements) | Rule::Choice(elements) => elements
@ -266,10 +102,7 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
Rule::Metadata { rule, params } => { Rule::Metadata { rule, params } => {
if let Precedence::Name(n) = &params.precedence { if let Precedence::Name(n) = &params.precedence {
if !names.contains(n) { if !names.contains(n) {
Err(UndeclaredPrecedenceError { return Err(anyhow!("Undeclared precedence '{n}' in rule '{rule_name}'"));
precedence: n.clone(),
rule: rule_name.to_string(),
})?;
} }
} }
validate(rule_name, rule, names)?; validate(rule_name, rule, names)?;
@ -299,10 +132,9 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
} }
hash_map::Entry::Occupied(e) => { hash_map::Entry::Occupied(e) => {
if e.get() != &ordering { if e.get() != &ordering {
Err(ConflictingPrecedenceOrderingError { return Err(anyhow!(
precedence_1: entry1.to_string(), "Conflicting orderings for precedences {entry1} and {entry2}",
precedence_2: entry2.to_string(), ));
})?;
} }
} }
} }
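Aside: the `Occupied` arm above is what turns two `precedences` lists that disagree into an error: each pair of names drawn from a list is recorded with its relative ordering, and a later list that records the opposite ordering for the same pair conflicts. A small sketch of that pairwise check, with hypothetical precedence names and plain `Ordering` values instead of the crate's types:

```rust
use std::cmp::Ordering;
use std::collections::HashMap;

// Record the relative ordering of every pair of names in each precedence list
// and report the first pair that two lists disagree about.
fn find_conflict<'a>(lists: &'a [Vec<&'a str>]) -> Option<(&'a str, &'a str)> {
    let mut orderings: HashMap<(&str, &str), Ordering> = HashMap::new();
    for list in lists {
        for (i, a) in list.iter().enumerate() {
            for b in &list[i + 1..] {
                // `a` appears before `b`, so it is declared higher within this list.
                for (pair, ordering) in [((*a, *b), Ordering::Greater), ((*b, *a), Ordering::Less)] {
                    match orderings.insert(pair, ordering) {
                        Some(previous) if previous != ordering => return Some((pair.0, pair.1)),
                        _ => {}
                    }
                }
            }
        }
    }
    None
}

fn main() {
    // The two lists disagree about whether `unary` outranks `binary`.
    let lists = vec![vec!["unary", "binary"], vec!["binary", "unary"]];
    assert_eq!(find_conflict(&lists), Some(("binary", "unary")));
}
```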
@ -332,7 +164,7 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::grammars::VariableType; use crate::generate::grammars::VariableType;
#[test] #[test]
fn test_validate_precedences_with_undeclared_precedence() { fn test_validate_precedences_with_undeclared_precedence() {

View file

@ -1,9 +1,8 @@
use std::collections::HashMap; use std::collections::HashMap;
use serde::Serialize; use anyhow::{anyhow, Result};
use thiserror::Error;
use crate::{ use crate::generate::{
grammars::{InlinedProductionMap, LexicalGrammar, Production, ProductionStep, SyntaxGrammar}, grammars::{InlinedProductionMap, LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
rules::SymbolType, rules::SymbolType,
}; };
@ -70,13 +69,12 @@ impl InlinedProductionMapBuilder {
let production_map = production_indices_by_step_id let production_map = production_indices_by_step_id
.into_iter() .into_iter()
.map(|(step_id, production_indices)| { .map(|(step_id, production_indices)| {
let production = let production = step_id.variable_index.map_or_else(
core::ptr::from_ref::<Production>(step_id.variable_index.map_or_else( || &productions[step_id.production_index],
|| &productions[step_id.production_index], |variable_index| {
|variable_index| { &grammar.variables[variable_index].productions[step_id.production_index]
&grammar.variables[variable_index].productions[step_id.production_index] },
}, ) as *const Production;
));
((production, step_id.step_index as u32), production_indices) ((production, step_id.step_index as u32), production_indices)
}) })
.collect(); .collect();
@ -189,38 +187,29 @@ impl InlinedProductionMapBuilder {
} }
} }
pub type ProcessInlinesResult<T> = Result<T, ProcessInlinesError>;
#[derive(Debug, Error, Serialize)]
pub enum ProcessInlinesError {
#[error("External token `{0}` cannot be inlined")]
ExternalToken(String),
#[error("Token `{0}` cannot be inlined")]
Token(String),
#[error("Rule `{0}` cannot be inlined because it is the first rule")]
FirstRule(String),
}
pub(super) fn process_inlines( pub(super) fn process_inlines(
grammar: &SyntaxGrammar, grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
) -> ProcessInlinesResult<InlinedProductionMap> { ) -> Result<InlinedProductionMap> {
for symbol in &grammar.variables_to_inline { for symbol in &grammar.variables_to_inline {
match symbol.kind { match symbol.kind {
SymbolType::External => { SymbolType::External => {
Err(ProcessInlinesError::ExternalToken( return Err(anyhow!(
grammar.external_tokens[symbol.index].name.clone(), "External token `{}` cannot be inlined",
))?; grammar.external_tokens[symbol.index].name
))
} }
SymbolType::Terminal => { SymbolType::Terminal => {
Err(ProcessInlinesError::Token( return Err(anyhow!(
lexical_grammar.variables[symbol.index].name.clone(), "Token `{}` cannot be inlined",
))?; lexical_grammar.variables[symbol.index].name,
))
} }
SymbolType::NonTerminal if symbol.index == 0 => { SymbolType::NonTerminal if symbol.index == 0 => {
Err(ProcessInlinesError::FirstRule( return Err(anyhow!(
grammar.variables[symbol.index].name.clone(), "Rule `{}` cannot be inlined because it is the first rule",
))?; grammar.variables[symbol.index].name,
))
} }
_ => {} _ => {}
} }
@ -236,7 +225,7 @@ pub(super) fn process_inlines(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::{
grammars::{LexicalVariable, SyntaxVariable, VariableType}, grammars::{LexicalVariable, SyntaxVariable, VariableType},
rules::{Associativity, Precedence, Symbol}, rules::{Associativity, Precedence, Symbol},
}; };
@ -549,9 +538,10 @@ mod tests {
..Default::default() ..Default::default()
}; };
let result = process_inlines(&grammar, &lexical_grammar); if let Err(error) = process_inlines(&grammar, &lexical_grammar) {
assert!(result.is_err(), "expected an error, but got none"); assert_eq!(error.to_string(), "Token `something` cannot be inlined");
let err = result.err().unwrap(); } else {
assert_eq!(err.to_string(), "Token `something` cannot be inlined",); panic!("expected an error, but got none");
}
} }
} }

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","EqUIdeo":"Equivalent_Unified_Ideograph","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","NFKC_SCF":"NFKC_Simple_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InCB":"Indic_Conjunct_Break","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","ID_Compat_Math_Continue":"ID_Compat_Math_Continue","ID_Compat_Math_Start":"ID_Compat_Math_Start","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","IDSU":"IDS_Unary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selecto
r","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"}

View file

@ -1,19 +1,15 @@
use std::{ use std::{
cmp, cmp,
collections::{BTreeMap, BTreeSet, HashMap, HashSet}, collections::{HashMap, HashSet},
fmt::Write, fmt::Write,
mem::swap, mem::swap,
}; };
use crate::LANGUAGE_VERSION;
use indoc::indoc;
use super::{ use super::{
build_tables::Tables, build_tables::Tables,
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}, grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
nfa::CharacterSet, nfa::CharacterSet,
node_types::ChildType, rules::{Alias, AliasMap, Symbol, SymbolType},
rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet},
tables::{ tables::{
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable, AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
ParseTableEntry, ParseTableEntry,
@ -21,11 +17,10 @@ use super::{
}; };
const SMALL_STATE_THRESHOLD: usize = 64; const SMALL_STATE_THRESHOLD: usize = 64;
pub const ABI_VERSION_MIN: usize = 14; const ABI_VERSION_MIN: usize = 13;
pub const ABI_VERSION_MAX: usize = LANGUAGE_VERSION; const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
const ABI_VERSION_WITH_RESERVED_WORDS: usize = 15; const ABI_VERSION_WITH_PRIMARY_STATES: usize = 14;
#[clippy::format_args]
macro_rules! add { macro_rules! add {
($this: tt, $($arg: tt)*) => {{ ($this: tt, $($arg: tt)*) => {{
$this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); $this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
@ -34,15 +29,12 @@ macro_rules! add {
macro_rules! add_whitespace { macro_rules! add_whitespace {
($this:tt) => {{ ($this:tt) => {{
// 4 bytes per char, 2 spaces per indent level
$this.buffer.reserve(4 * 2 * $this.indent_level);
for _ in 0..$this.indent_level { for _ in 0..$this.indent_level {
write!(&mut $this.buffer, " ").unwrap(); write!(&mut $this.buffer, " ").unwrap();
} }
}}; }};
} }
#[clippy::format_args]
macro_rules! add_line { macro_rules! add_line {
($this: tt, $($arg: tt)*) => { ($this: tt, $($arg: tt)*) => {
add_whitespace!($this); add_whitespace!($this);
@ -64,7 +56,6 @@ macro_rules! dedent {
}; };
} }
#[derive(Default)]
struct Generator { struct Generator {
buffer: String, buffer: String,
indent_level: usize, indent_level: usize,
@ -75,6 +66,7 @@ struct Generator {
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>, large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
large_character_set_info: Vec<LargeCharacterSetInfo>, large_character_set_info: Vec<LargeCharacterSetInfo>,
large_state_count: usize, large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar, syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar, lexical_grammar: LexicalGrammar,
default_aliases: AliasMap, default_aliases: AliasMap,
@ -83,13 +75,10 @@ struct Generator {
alias_ids: HashMap<Alias, String>, alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<Alias>, unique_aliases: Vec<Alias>,
symbol_map: HashMap<Symbol, Symbol>, symbol_map: HashMap<Symbol, Symbol>,
reserved_word_sets: Vec<TokenSet>,
reserved_word_set_ids_by_parse_state: Vec<usize>,
field_names: Vec<String>, field_names: Vec<String>,
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
supertype_map: BTreeMap<String, Vec<ChildType>>, #[allow(unused)]
abi_version: usize, abi_version: usize,
metadata: Option<Metadata>,
} }
struct LargeCharacterSetInfo { struct LargeCharacterSetInfo {
@ -97,16 +86,9 @@ struct LargeCharacterSetInfo {
is_used: bool, is_used: bool,
} }
struct Metadata {
major_version: u8,
minor_version: u8,
patch_version: u8,
}
impl Generator { impl Generator {
fn generate(mut self) -> String { fn generate(mut self) -> String {
self.init(); self.init();
self.add_header();
self.add_includes(); self.add_includes();
self.add_pragmas(); self.add_pragmas();
self.add_stats(); self.add_stats();
@ -126,10 +108,9 @@ impl Generator {
} }
self.add_non_terminal_alias_map(); self.add_non_terminal_alias_map();
self.add_primary_state_id_list();
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS && !self.supertype_map.is_empty() { if self.abi_version >= ABI_VERSION_WITH_PRIMARY_STATES {
self.add_supertype_map(); self.add_primary_state_id_list();
} }
let buffer_offset_before_lex_functions = self.buffer.len(); let buffer_offset_before_lex_functions = self.buffer.len();
@ -138,7 +119,7 @@ impl Generator {
swap(&mut main_lex_table, &mut self.main_lex_table); swap(&mut main_lex_table, &mut self.main_lex_table);
self.add_lex_function("ts_lex", main_lex_table); self.add_lex_function("ts_lex", main_lex_table);
if self.syntax_grammar.word_token.is_some() { if self.keyword_capture_token.is_some() {
let mut keyword_lex_table = LexTable::default(); let mut keyword_lex_table = LexTable::default();
swap(&mut keyword_lex_table, &mut self.keyword_lex_table); swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
self.add_lex_function("ts_lex_keywords", keyword_lex_table); self.add_lex_function("ts_lex_keywords", keyword_lex_table);
@ -154,13 +135,7 @@ impl Generator {
} }
self.buffer.push_str(&lex_functions); self.buffer.push_str(&lex_functions);
self.add_lex_modes(); self.add_lex_modes_list();
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS && self.reserved_word_sets.len() > 1
{
self.add_reserved_word_sets();
}
self.add_parse_table(); self.add_parse_table();
if !self.syntax_grammar.external_tokens.is_empty() { if !self.syntax_grammar.external_tokens.is_empty() {
@ -241,24 +216,33 @@ impl Generator {
for alias in &production_info.alias_sequence { for alias in &production_info.alias_sequence {
// Generate a mapping from aliases to C identifiers. // Generate a mapping from aliases to C identifiers.
if let Some(alias) = &alias { if let Some(alias) = &alias {
// Some aliases match an existing symbol in the grammar. let existing_symbol = self.parse_table.symbols.iter().copied().find(|symbol| {
let alias_id = self.default_aliases.get(symbol).map_or_else(
if let Some(existing_symbol) = self.symbols_for_alias(alias).first() { || {
self.symbol_ids[&self.symbol_map[existing_symbol]].clone() let (name, kind) = self.metadata_for_symbol(*symbol);
} name == alias.value && kind == alias.kind()
// Other aliases don't match any existing symbol, and need their own },
// identifiers. |default_alias| default_alias == alias,
else { )
if let Err(i) = self.unique_aliases.binary_search(alias) { });
self.unique_aliases.insert(i, alias.clone());
}
if alias.is_named { // Some aliases match an existing symbol in the grammar.
format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) let alias_id = if let Some(existing_symbol) = existing_symbol {
} else { self.symbol_ids[&self.symbol_map[&existing_symbol]].clone()
format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) }
} // Other aliases don't match any existing symbol, and need their own
}; // identifiers.
else {
if let Err(i) = self.unique_aliases.binary_search(alias) {
self.unique_aliases.insert(i, alias.clone());
}
if alias.is_named {
format!("alias_sym_{}", self.sanitize_identifier(&alias.value))
} else {
format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value))
}
};
self.alias_ids.entry(alias.clone()).or_insert(alias_id); self.alias_ids.entry(alias.clone()).or_insert(alias_id);
} }
@ -282,34 +266,6 @@ impl Generator {
}); });
} }
// Assign an id to each unique reserved word set
self.reserved_word_sets.push(TokenSet::new());
for state in &self.parse_table.states {
let id = if let Some(ix) = self
.reserved_word_sets
.iter()
.position(|set| *set == state.reserved_words)
{
ix
} else {
self.reserved_word_sets.push(state.reserved_words.clone());
self.reserved_word_sets.len() - 1
};
self.reserved_word_set_ids_by_parse_state.push(id);
}
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
for (supertype, subtypes) in &self.supertype_symbol_map {
if let Some(supertype) = self.symbol_ids.get(supertype) {
self.supertype_map
.entry(supertype.clone())
.or_insert_with(|| subtypes.clone());
}
}
self.supertype_symbol_map.clear();
}
// Determine which states should use the "small state" representation, and which should // Determine which states should use the "small state" representation, and which should
// use the normal array representation. // use the normal array representation.
let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2); let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2);
@ -324,11 +280,6 @@ impl Generator {
.count(); .count();
} }
fn add_header(&mut self) {
add_line!(self, "/* Automatically @generated by tree-sitter */",);
add_line!(self, "");
}
fn add_includes(&mut self) { fn add_includes(&mut self) {
add_line!(self, "#include \"tree_sitter/parser.h\""); add_line!(self, "#include \"tree_sitter/parser.h\"");
add_line!(self, ""); add_line!(self, "");
@ -390,7 +341,7 @@ impl Generator {
self.parse_table.symbols.len() self.parse_table.symbols.len()
); );
add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len()); add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len());
add_line!(self, "#define TOKEN_COUNT {token_count}"); add_line!(self, "#define TOKEN_COUNT {}", token_count);
add_line!( add_line!(
self, self,
"#define EXTERNAL_TOKEN_COUNT {}", "#define EXTERNAL_TOKEN_COUNT {}",
@ -402,22 +353,11 @@ impl Generator {
"#define MAX_ALIAS_SEQUENCE_LENGTH {}", "#define MAX_ALIAS_SEQUENCE_LENGTH {}",
self.parse_table.max_aliased_production_length self.parse_table.max_aliased_production_length
); );
add_line!(
self,
"#define MAX_RESERVED_WORD_SET_SIZE {}",
self.reserved_word_sets
.iter()
.map(TokenSet::len)
.max()
.unwrap()
);
add_line!( add_line!(
self, self,
"#define PRODUCTION_ID_COUNT {}", "#define PRODUCTION_ID_COUNT {}",
self.parse_table.production_infos.len() self.parse_table.production_infos.len()
); );
add_line!(self, "#define SUPERTYPE_COUNT {}", self.supertype_map.len());
add_line!(self, ""); add_line!(self, "");
} }
@ -679,32 +619,31 @@ impl Generator {
&mut next_flat_field_map_index, &mut next_flat_field_map_index,
); );
let mut field_map_ids = Vec::with_capacity(self.parse_table.production_infos.len()); let mut field_map_ids = Vec::new();
for production_info in &self.parse_table.production_infos { for production_info in &self.parse_table.production_infos {
if production_info.field_map.is_empty() { if production_info.field_map.is_empty() {
field_map_ids.push((0, 0)); field_map_ids.push((0, 0));
} else { } else {
let mut flat_field_map = Vec::with_capacity(production_info.field_map.len()); let mut flat_field_map = Vec::new();
for (field_name, locations) in &production_info.field_map { for (field_name, locations) in &production_info.field_map {
for location in locations { for location in locations {
flat_field_map.push((field_name.clone(), *location)); flat_field_map.push((field_name.clone(), *location));
} }
} }
let field_map_len = flat_field_map.len();
field_map_ids.push(( field_map_ids.push((
self.get_field_map_id( self.get_field_map_id(
flat_field_map, flat_field_map.clone(),
&mut flat_field_maps, &mut flat_field_maps,
&mut next_flat_field_map_index, &mut next_flat_field_map_index,
), ),
field_map_len, flat_field_map.len(),
)); ));
} }
} }
add_line!( add_line!(
self, self,
"static const TSMapSlice ts_field_map_slices[PRODUCTION_ID_COUNT] = {{", "static const TSFieldMapSlice ts_field_map_slices[PRODUCTION_ID_COUNT] = {{",
); );
indent!(self); indent!(self);
for (production_id, (row_id, length)) in field_map_ids.into_iter().enumerate() { for (production_id, (row_id, length)) in field_map_ids.into_iter().enumerate() {
@ -743,83 +682,6 @@ impl Generator {
add_line!(self, ""); add_line!(self, "");
} }
fn add_supertype_map(&mut self) {
add_line!(
self,
"static const TSSymbol ts_supertype_symbols[SUPERTYPE_COUNT] = {{"
);
indent!(self);
for supertype in self.supertype_map.keys() {
add_line!(self, "{supertype},");
}
dedent!(self);
add_line!(self, "}};\n");
add_line!(
self,
"static const TSMapSlice ts_supertype_map_slices[] = {{",
);
indent!(self);
let mut row_id = 0;
let mut supertype_ids = vec![0];
let mut supertype_string_map = BTreeMap::new();
for (supertype, subtypes) in &self.supertype_map {
supertype_string_map.insert(
supertype,
subtypes
.iter()
.flat_map(|s| match s {
ChildType::Normal(symbol) => vec![self.symbol_ids.get(symbol).cloned()],
ChildType::Aliased(alias) => {
self.alias_ids.get(alias).cloned().map_or_else(
|| {
self.symbols_for_alias(alias)
.into_iter()
.map(|s| self.symbol_ids.get(&s).cloned())
.collect()
},
|a| vec![Some(a)],
)
}
})
.flatten()
.collect::<BTreeSet<String>>(),
);
}
for (supertype, subtypes) in &supertype_string_map {
let length = subtypes.len();
add_line!(
self,
"[{supertype}] = {{.index = {row_id}, .length = {length}}},",
);
row_id += length;
supertype_ids.push(row_id);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
add_line!(
self,
"static const TSSymbol ts_supertype_map_entries[] = {{",
);
indent!(self);
for (i, (_, subtypes)) in supertype_string_map.iter().enumerate() {
let row_index = supertype_ids[i];
add_line!(self, "[{row_index}] =");
indent!(self);
for subtype in subtypes {
add_whitespace!(self);
add!(self, "{subtype},\n");
}
dedent!(self);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
add_line!( add_line!(
self, self,
@ -877,7 +739,7 @@ impl Generator {
&& chars.ranges().all(|r| { && chars.ranges().all(|r| {
let start = *r.start() as u32; let start = *r.start() as u32;
let end = *r.end() as u32; let end = *r.end() as u32;
end <= start + 1 && u16::try_from(end).is_ok() end <= start + 1 && end <= u16::MAX as u32
}) })
{ {
leading_simple_transition_count += 1; leading_simple_transition_count += 1;
@ -965,7 +827,10 @@ impl Generator {
large_char_set_ix = Some(char_set_ix); large_char_set_ix = Some(char_set_ix);
} }
let line_break = format!("\n{}", " ".repeat(self.indent_level + 2)); let mut line_break = "\n".to_string();
for _ in 0..self.indent_level + 2 {
line_break.push_str(" ");
}
let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty(); let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty();
let has_negative_condition = !negated_chars.is_empty(); let has_negative_condition = !negated_chars.is_empty();
@ -992,7 +857,7 @@ impl Generator {
add!( add!(
self, self,
"set_contains({}, {}, lookahead)", "set_contains({}, {}, lookahead)",
char_set_info.constant_name, &char_set_info.constant_name,
large_set.range_count(), large_set.range_count(),
); );
if check_eof { if check_eof {
@ -1057,6 +922,7 @@ impl Generator {
} }
self.add_character(end); self.add_character(end);
add!(self, ")"); add!(self, ")");
continue;
} else if end == start { } else if end == start {
add!(self, "lookahead == "); add!(self, "lookahead == ");
self.add_character(start); self.add_character(start);
@ -1107,7 +973,7 @@ impl Generator {
add_line!( add_line!(
self, self,
"static const TSCharacterRange {}[] = {{", "static TSCharacterRange {}[] = {{",
info.constant_name info.constant_name
); );
@ -1142,66 +1008,25 @@ impl Generator {
} }
} }
fn add_lex_modes(&mut self) { fn add_lex_modes_list(&mut self) {
add_line!( add_line!(
self, self,
"static const {} ts_lex_modes[STATE_COUNT] = {{", "static const TSLexMode ts_lex_modes[STATE_COUNT] = {{"
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
"TSLexerMode"
} else {
"TSLexMode"
}
); );
indent!(self); indent!(self);
for (i, state) in self.parse_table.states.iter().enumerate() { for (i, state) in self.parse_table.states.iter().enumerate() {
add_whitespace!(self);
add!(self, "[{i}] = {{");
if state.is_end_of_non_terminal_extra() { if state.is_end_of_non_terminal_extra() {
add!(self, "(TSStateId)(-1),"); add_line!(self, "[{i}] = {{(TSStateId)(-1)}},");
} else if state.external_lex_state_id > 0 {
add_line!(
self,
"[{i}] = {{.lex_state = {}, .external_lex_state = {}}},",
state.lex_state_id,
state.external_lex_state_id
);
} else { } else {
add!(self, ".lex_state = {}", state.lex_state_id); add_line!(self, "[{i}] = {{.lex_state = {}}},", state.lex_state_id);
if state.external_lex_state_id > 0 {
add!(
self,
", .external_lex_state = {}",
state.external_lex_state_id
);
}
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
let reserved_word_set_id = self.reserved_word_set_ids_by_parse_state[i];
if reserved_word_set_id != 0 {
add!(self, ", .reserved_word_set_id = {reserved_word_set_id}");
}
}
} }
add!(self, "}},\n");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_reserved_word_sets(&mut self) {
add_line!(
self,
"static const TSSymbol ts_reserved_words[{}][MAX_RESERVED_WORD_SET_SIZE] = {{",
self.reserved_word_sets.len(),
);
indent!(self);
for (id, set) in self.reserved_word_sets.iter().enumerate() {
if id == 0 {
continue;
}
add_line!(self, "[{id}] = {{");
indent!(self);
for token in set.iter() {
add_line!(self, "{},", self.symbol_ids[&token]);
}
dedent!(self);
add_line!(self, "}},");
} }
dedent!(self); dedent!(self);
add_line!(self, "}};"); add_line!(self, "}};");
@ -1255,7 +1080,7 @@ impl Generator {
indent!(self); indent!(self);
for i in 0..self.parse_table.external_lex_states.len() { for i in 0..self.parse_table.external_lex_states.len() {
if !self.parse_table.external_lex_states[i].is_empty() { if !self.parse_table.external_lex_states[i].is_empty() {
add_line!(self, "[{i}] = {{"); add_line!(self, "[{}] = {{", i);
indent!(self); indent!(self);
for token in self.parse_table.external_lex_states[i].iter() { for token in self.parse_table.external_lex_states[i].iter() {
add_line!( add_line!(
@ -1277,7 +1102,6 @@ impl Generator {
let mut parse_table_entries = HashMap::new(); let mut parse_table_entries = HashMap::new();
let mut next_parse_action_list_index = 0; let mut next_parse_action_list_index = 0;
// Parse action list zero is for the default value, used when a symbol is not valid.
self.get_parse_action_list_id( self.get_parse_action_list_id(
&ParseTableEntry { &ParseTableEntry {
actions: Vec::new(), actions: Vec::new(),
@ -1303,7 +1127,7 @@ impl Generator {
.enumerate() .enumerate()
.take(self.large_state_count) .take(self.large_state_count)
{ {
add_line!(self, "[STATE({i})] = {{"); add_line!(self, "[{i}] = {{");
indent!(self); indent!(self);
// Ensure the entries are in a deterministic order, since they are // Ensure the entries are in a deterministic order, since they are
@ -1335,11 +1159,9 @@ impl Generator {
); );
add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]); add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]);
} }
dedent!(self); dedent!(self);
add_line!(self, "}},"); add_line!(self, "}},");
} }
dedent!(self); dedent!(self);
add_line!(self, "}};"); add_line!(self, "}};");
add_line!(self, ""); add_line!(self, "");
@ -1348,16 +1170,11 @@ impl Generator {
add_line!(self, "static const uint16_t ts_small_parse_table[] = {{"); add_line!(self, "static const uint16_t ts_small_parse_table[] = {{");
indent!(self); indent!(self);
let mut next_table_index = 0; let mut index = 0;
let mut small_state_indices = Vec::with_capacity( let mut small_state_indices = Vec::new();
self.parse_table
.states
.len()
.saturating_sub(self.large_state_count),
);
let mut symbols_by_value = HashMap::<(usize, SymbolType), Vec<Symbol>>::new(); let mut symbols_by_value = HashMap::<(usize, SymbolType), Vec<Symbol>>::new();
for state in self.parse_table.states.iter().skip(self.large_state_count) { for state in self.parse_table.states.iter().skip(self.large_state_count) {
small_state_indices.push(next_table_index); small_state_indices.push(index);
symbols_by_value.clear(); symbols_by_value.clear();
terminal_entries.clear(); terminal_entries.clear();
@ -1396,16 +1213,10 @@ impl Generator {
(symbols.len(), *kind, *value, symbols[0]) (symbols.len(), *kind, *value, symbols[0])
}); });
add_line!( add_line!(self, "[{index}] = {},", values_with_symbols.len());
self,
"[{next_table_index}] = {},",
values_with_symbols.len()
);
indent!(self); indent!(self);
next_table_index += 1;
for ((value, kind), symbols) in &mut values_with_symbols { for ((value, kind), symbols) in &mut values_with_symbols {
next_table_index += 2 + symbols.len();
if *kind == SymbolType::NonTerminal { if *kind == SymbolType::NonTerminal {
add_line!(self, "STATE({value}), {},", symbols.len()); add_line!(self, "STATE({value}), {},", symbols.len());
} else { } else {
@ -1421,6 +1232,11 @@ impl Generator {
} }
dedent!(self); dedent!(self);
index += 1 + values_with_symbols
.iter()
.map(|(_, symbols)| 2 + symbols.len())
.sum::<usize>();
} }
dedent!(self); dedent!(self);
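The index bookkeeping above reflects how each small state is laid out in ts_small_parse_table: one entry holding the number of (value, symbols) groups, then, per group, the value, a symbol count, and the symbol ids themselves, i.e. 1 + Σ(2 + symbols.len()) slots. A hypothetical sketch of that layout (not the generator's code):

```rust
/// One group in a small-state row: a parse value (a goto state id or an
/// index into the parse-actions table) shared by several lookahead symbols.
struct SmallStateGroup {
    value: u16,
    symbols: Vec<u16>,
}

/// Slots a small state occupies: one group count, then per group a value,
/// a symbol count, and the symbol ids themselves.
fn small_state_len(groups: &[SmallStateGroup]) -> usize {
    1 + groups.iter().map(|g| 2 + g.symbols.len()).sum::<usize>()
}

/// Flatten one small state into the u16 layout described above.
fn flatten_small_state(groups: &[SmallStateGroup]) -> Vec<u16> {
    let mut out = vec![groups.len() as u16];
    for group in groups {
        out.push(group.value);
        out.push(group.symbols.len() as u16);
        out.extend(&group.symbols);
    }
    out
}

fn main() {
    let groups = vec![
        SmallStateGroup { value: 7, symbols: vec![1, 2, 3] },
        SmallStateGroup { value: 9, symbols: vec![4] },
    ];
    // 1 (count) + (2 + 3) + (2 + 1) = 9 slots.
    assert_eq!(small_state_len(&groups), 9);
    assert_eq!(flatten_small_state(&groups).len(), small_state_len(&groups));
}
```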
@ -1549,7 +1365,7 @@ impl Generator {
indent!(self); indent!(self);
add_line!(self, "static const TSLanguage language = {{"); add_line!(self, "static const TSLanguage language = {{");
indent!(self); indent!(self);
add_line!(self, ".abi_version = LANGUAGE_VERSION,"); add_line!(self, ".version = LANGUAGE_VERSION,");
// Quantities // Quantities
add_line!(self, ".symbol_count = SYMBOL_COUNT,"); add_line!(self, ".symbol_count = SYMBOL_COUNT,");
@ -1559,9 +1375,6 @@ impl Generator {
add_line!(self, ".state_count = STATE_COUNT,"); add_line!(self, ".state_count = STATE_COUNT,");
add_line!(self, ".large_state_count = LARGE_STATE_COUNT,"); add_line!(self, ".large_state_count = LARGE_STATE_COUNT,");
add_line!(self, ".production_id_count = PRODUCTION_ID_COUNT,"); add_line!(self, ".production_id_count = PRODUCTION_ID_COUNT,");
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
add_line!(self, ".supertype_count = SUPERTYPE_COUNT,");
}
add_line!(self, ".field_count = FIELD_COUNT,"); add_line!(self, ".field_count = FIELD_COUNT,");
add_line!( add_line!(
self, self,
@ -1583,11 +1396,6 @@ impl Generator {
add_line!(self, ".field_map_slices = ts_field_map_slices,"); add_line!(self, ".field_map_slices = ts_field_map_slices,");
add_line!(self, ".field_map_entries = ts_field_map_entries,"); add_line!(self, ".field_map_entries = ts_field_map_entries,");
} }
if !self.supertype_map.is_empty() && self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
add_line!(self, ".supertype_map_slices = ts_supertype_map_slices,");
add_line!(self, ".supertype_map_entries = ts_supertype_map_entries,");
add_line!(self, ".supertype_symbols = ts_supertype_symbols,");
}
add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); add_line!(self, ".symbol_metadata = ts_symbol_metadata,");
add_line!(self, ".public_symbol_map = ts_symbol_map,"); add_line!(self, ".public_symbol_map = ts_symbol_map,");
add_line!(self, ".alias_map = ts_non_terminal_alias_map,"); add_line!(self, ".alias_map = ts_non_terminal_alias_map,");
@ -1596,9 +1404,9 @@ impl Generator {
} }
// Lexing // Lexing
add_line!(self, ".lex_modes = (const void*)ts_lex_modes,"); add_line!(self, ".lex_modes = ts_lex_modes,");
add_line!(self, ".lex_fn = ts_lex,"); add_line!(self, ".lex_fn = ts_lex,");
if let Some(keyword_capture_token) = self.syntax_grammar.word_token { if let Some(keyword_capture_token) = self.keyword_capture_token {
add_line!(self, ".keyword_lex_fn = ts_lex_keywords,"); add_line!(self, ".keyword_lex_fn = ts_lex_keywords,");
add_line!( add_line!(
self, self,
@ -1621,42 +1429,8 @@ impl Generator {
add_line!(self, "}},"); add_line!(self, "}},");
} }
add_line!(self, ".primary_state_ids = ts_primary_state_ids,"); if self.abi_version >= ABI_VERSION_WITH_PRIMARY_STATES {
add_line!(self, ".primary_state_ids = ts_primary_state_ids,");
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
add_line!(self, ".name = \"{}\",", self.language_name);
if self.reserved_word_sets.len() > 1 {
add_line!(self, ".reserved_words = &ts_reserved_words[0][0],");
}
add_line!(
self,
".max_reserved_word_set_size = {},",
self.reserved_word_sets
.iter()
.map(TokenSet::len)
.max()
.unwrap()
);
let Some(metadata) = &self.metadata else {
panic!(
indoc! {"
Metadata is required to generate ABI version {}.
This means that your grammar doesn't have a tree-sitter.json config file with an appropriate version field in the metadata table.
"},
self.abi_version
);
};
add_line!(self, ".metadata = {{");
indent!(self);
add_line!(self, ".major_version = {},", metadata.major_version);
add_line!(self, ".minor_version = {},", metadata.minor_version);
add_line!(self, ".patch_version = {},", metadata.patch_version);
dedent!(self);
add_line!(self, "}},");
} }
dedent!(self); dedent!(self);
@ -1758,23 +1532,6 @@ impl Generator {
} }
} }
fn symbols_for_alias(&self, alias: &Alias) -> Vec<Symbol> {
self.parse_table
.symbols
.iter()
.copied()
.filter(move |symbol| {
self.default_aliases.get(symbol).map_or_else(
|| {
let (name, kind) = self.metadata_for_symbol(*symbol);
name == alias.value && kind == alias.kind()
},
|default_alias| default_alias == alias,
)
})
.collect()
}
fn sanitize_identifier(&self, name: &str) -> String { fn sanitize_identifier(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len()); let mut result = String::with_capacity(name.len());
for c in name.chars() { for c in name.chars() {
@ -1850,11 +1607,11 @@ impl Generator {
'\u{007F}' => "DEL", '\u{007F}' => "DEL",
'\u{FEFF}' => "BOM", '\u{FEFF}' => "BOM",
'\u{0080}'..='\u{FFFF}' => { '\u{0080}'..='\u{FFFF}' => {
write!(result, "u{:04x}", c as u32).unwrap(); result.push_str(&format!("u{:04x}", c as u32));
break 'special_chars; break 'special_chars;
} }
'\u{10000}'..='\u{10FFFF}' => { '\u{10000}'..='\u{10FFFF}' => {
write!(result, "U{:08x}", c as u32).unwrap(); result.push_str(&format!("U{:08x}", c as u32));
break 'special_chars; break 'special_chars;
} }
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(),
@ -1885,9 +1642,11 @@ impl Generator {
'\r' => result += "\\r", '\r' => result += "\\r",
'\t' => result += "\\t", '\t' => result += "\\t",
'\0' => result += "\\0", '\0' => result += "\\0",
'\u{0001}'..='\u{001f}' => write!(result, "\\x{:02x}", c as u32).unwrap(), '\u{0001}'..='\u{001f}' => result += &format!("\\x{:02x}", c as u32),
'\u{007F}'..='\u{FFFF}' => write!(result, "\\u{:04x}", c as u32).unwrap(), '\u{007F}'..='\u{FFFF}' => result += &format!("\\u{:04x}", c as u32),
'\u{10000}'..='\u{10FFFF}' => write!(result, "\\U{:08x}", c as u32).unwrap(), '\u{10000}'..='\u{10FFFF}' => {
result.push_str(&format!("\\U{:08x}", c as u32));
}
_ => result.push(c), _ => result.push(c),
} }
} }
@ -1940,8 +1699,6 @@ pub fn render_c_code(
lexical_grammar: LexicalGrammar, lexical_grammar: LexicalGrammar,
default_aliases: AliasMap, default_aliases: AliasMap,
abi_version: usize, abi_version: usize,
semantic_version: Option<(u8, u8, u8)>,
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
) -> String { ) -> String {
assert!( assert!(
(ABI_VERSION_MIN..=ABI_VERSION_MAX).contains(&abi_version), (ABI_VERSION_MIN..=ABI_VERSION_MAX).contains(&abi_version),
@ -1949,23 +1706,26 @@ pub fn render_c_code(
); );
Generator { Generator {
buffer: String::new(),
indent_level: 0,
language_name: name.to_string(), language_name: name.to_string(),
large_state_count: 0,
parse_table: tables.parse_table, parse_table: tables.parse_table,
main_lex_table: tables.main_lex_table, main_lex_table: tables.main_lex_table,
keyword_lex_table: tables.keyword_lex_table, keyword_lex_table: tables.keyword_lex_table,
keyword_capture_token: tables.word_token,
large_character_sets: tables.large_character_sets, large_character_sets: tables.large_character_sets,
large_character_set_info: Vec::new(), large_character_set_info: Vec::new(),
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
default_aliases, default_aliases,
symbol_ids: HashMap::new(),
symbol_order: HashMap::new(),
alias_ids: HashMap::new(),
symbol_map: HashMap::new(),
unique_aliases: Vec::new(),
field_names: Vec::new(),
abi_version, abi_version,
metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata {
major_version,
minor_version,
patch_version,
}),
supertype_symbol_map,
..Default::default()
} }
.generate() .generate()
} }

View file

@ -1,11 +1,10 @@
use std::{collections::BTreeMap, fmt}; use std::{collections::HashMap, fmt};
use serde::Serialize;
use smallbitvec::SmallBitVec; use smallbitvec::SmallBitVec;
use super::grammars::VariableType; use super::grammars::VariableType;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum SymbolType { pub enum SymbolType {
External, External,
End, End,
@ -14,19 +13,19 @@ pub enum SymbolType {
NonTerminal, NonTerminal,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum Associativity { pub enum Associativity {
Left, Left,
Right, Right,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Alias { pub struct Alias {
pub value: String, pub value: String,
pub is_named: bool, pub is_named: bool,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub enum Precedence { pub enum Precedence {
#[default] #[default]
None, None,
@ -34,50 +33,48 @@ pub enum Precedence {
Name(String), Name(String),
} }
pub type AliasMap = BTreeMap<Symbol, Alias>; pub type AliasMap = HashMap<Symbol, Alias>;
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize)] #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
pub struct MetadataParams { pub struct MetadataParams {
pub precedence: Precedence, pub precedence: Precedence,
pub dynamic_precedence: i32, pub dynamic_precedence: i32,
pub associativity: Option<Associativity>, pub associativity: Option<Associativity>,
pub is_token: bool, pub is_token: bool,
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool, pub is_main_token: bool,
pub alias: Option<Alias>, pub alias: Option<Alias>,
pub field_name: Option<String>, pub field_name: Option<String>,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Symbol { pub struct Symbol {
pub kind: SymbolType, pub kind: SymbolType,
pub index: usize, pub index: usize,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Rule { pub enum Rule {
Blank, Blank,
String(String), String(String),
Pattern(String, String), Pattern(String, String),
NamedSymbol(String), NamedSymbol(String),
Symbol(Symbol), Symbol(Symbol),
Choice(Vec<Self>), Choice(Vec<Rule>),
Metadata { Metadata {
params: MetadataParams, params: MetadataParams,
rule: Box<Self>, rule: Box<Rule>,
},
Repeat(Box<Self>),
Seq(Vec<Self>),
Reserved {
rule: Box<Self>,
context_name: String,
}, },
Repeat(Box<Rule>),
Seq(Vec<Rule>),
} }
// Because tokens are represented as small (~400 max) unsigned integers, // Because tokens are represented as small (~400 max) unsigned integers,
// sets of tokens can be efficiently represented as bit vectors with each // sets of tokens can be efficiently represented as bit vectors with each
// index corresponding to a token, and each value representing whether or not // index corresponding to a token, and each value representing whether or not
// the token is present in the set. // the token is present in the set.
#[derive(Default, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenSet { pub struct TokenSet {
terminal_bits: SmallBitVec, terminal_bits: SmallBitVec,
external_bits: SmallBitVec, external_bits: SmallBitVec,
@ -85,32 +82,6 @@ pub struct TokenSet {
end_of_nonterminal_extra: bool, end_of_nonterminal_extra: bool,
} }
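As the comment above notes, token indices are small, so a token set can be a bit vector indexed by token. A minimal sketch of that idea, using a plain Vec<bool> instead of the SmallBitVec the real TokenSet wraps:

```rust
/// Minimal bit-vector set over small token indices (illustrative only).
#[derive(Default)]
struct TinyTokenSet {
    bits: Vec<bool>,
}

impl TinyTokenSet {
    /// Insert a token index, growing the vector on demand.
    /// Returns true if the token was not already present.
    fn insert(&mut self, index: usize) -> bool {
        if index >= self.bits.len() {
            self.bits.resize(index + 1, false);
        }
        let was_present = self.bits[index];
        self.bits[index] = true;
        !was_present
    }

    fn contains(&self, index: usize) -> bool {
        self.bits.get(index).copied().unwrap_or(false)
    }
}

fn main() {
    let mut set = TinyTokenSet::default();
    assert!(set.insert(3));
    assert!(!set.insert(3)); // already present
    assert!(set.contains(3));
    assert!(!set.contains(7));
}
```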
impl fmt::Debug for TokenSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list().entries(self.iter()).finish()
}
}
impl PartialOrd for TokenSet {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for TokenSet {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.terminal_bits
.iter()
.cmp(other.terminal_bits.iter())
.then_with(|| self.external_bits.iter().cmp(other.external_bits.iter()))
.then_with(|| self.eof.cmp(&other.eof))
.then_with(|| {
self.end_of_nonterminal_extra
.cmp(&other.end_of_nonterminal_extra)
})
}
}
impl Rule { impl Rule {
pub fn field(name: String, content: Self) -> Self { pub fn field(name: String, content: Self) -> Self {
add_metadata(content, move |params| { add_metadata(content, move |params| {
@ -178,18 +149,6 @@ impl Rule {
pub const fn seq(rules: Vec<Self>) -> Self { pub const fn seq(rules: Vec<Self>) -> Self {
Self::Seq(rules) Self::Seq(rules)
} }
pub fn is_empty(&self) -> bool {
match self {
Self::Blank | Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
Self::String(string) => string.is_empty(),
Self::Metadata { rule, .. } | Self::Repeat(rule) | Self::Reserved { rule, .. } => {
rule.is_empty()
}
Self::Choice(rules) => rules.iter().any(Self::is_empty),
Self::Seq(rules) => rules.iter().all(Self::is_empty),
}
}
} }
impl Alias { impl Alias {
@ -306,13 +265,13 @@ impl Symbol {
} }
impl From<Symbol> for Rule { impl From<Symbol> for Rule {
#[must_use]
fn from(symbol: Symbol) -> Self { fn from(symbol: Symbol) -> Self {
Self::Symbol(symbol) Self::Symbol(symbol)
} }
} }
impl TokenSet { impl TokenSet {
#[must_use]
pub const fn new() -> Self { pub const fn new() -> Self {
Self { Self {
terminal_bits: SmallBitVec::new(), terminal_bits: SmallBitVec::new(),
@ -424,9 +383,6 @@ impl TokenSet {
}; };
if other.index < vec.len() && vec[other.index] { if other.index < vec.len() && vec[other.index] {
vec.set(other.index, false); vec.set(other.index, false);
while vec.last() == Some(false) {
vec.pop();
}
return true; return true;
} }
false false
@ -439,13 +395,6 @@ impl TokenSet {
&& !self.external_bits.iter().any(|a| a) && !self.external_bits.iter().any(|a| a)
} }
pub fn len(&self) -> usize {
self.eof as usize
+ self.end_of_nonterminal_extra as usize
+ self.terminal_bits.iter().filter(|b| *b).count()
+ self.external_bits.iter().filter(|b| *b).count()
}
pub fn insert_all_terminals(&mut self, other: &Self) -> bool { pub fn insert_all_terminals(&mut self, other: &Self) -> bool {
let mut result = false; let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() { if other.terminal_bits.len() > self.terminal_bits.len() {

View file

@ -47,7 +47,6 @@ pub struct ParseState {
pub id: ParseStateId, pub id: ParseStateId,
pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>, pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>,
pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>, pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>,
pub reserved_words: TokenSet,
pub lex_state_id: usize, pub lex_state_id: usize,
pub external_lex_state_id: usize, pub external_lex_state_id: usize,
pub core_id: usize, pub core_id: usize,
@ -65,7 +64,7 @@ pub struct ProductionInfo {
pub field_map: BTreeMap<String, Vec<FieldLocation>>, pub field_map: BTreeMap<String, Vec<FieldLocation>>,
} }
#[derive(Debug, Default, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct ParseTable { pub struct ParseTable {
pub states: Vec<ParseState>, pub states: Vec<ParseState>,
pub symbols: Vec<Symbol>, pub symbols: Vec<Symbol>,
@ -93,7 +92,6 @@ pub struct LexTable {
} }
impl ParseTableEntry { impl ParseTableEntry {
#[must_use]
pub const fn new() -> Self { pub const fn new() -> Self {
Self { Self {
reusable: true, reusable: true,

View file

@ -3,15 +3,11 @@ root = true
[*] [*]
charset = utf-8 charset = utf-8
[*.{json,toml,yml,gyp,xml}] [*.{json,toml,yml,gyp}]
indent_style = space indent_style = space
indent_size = 2 indent_size = 2
[*.{js,ts}] [*.js]
indent_style = space
indent_size = 2
[*.scm]
indent_style = space indent_style = space
indent_size = 2 indent_size = 2
@ -31,10 +27,6 @@ indent_size = 4
indent_style = space indent_style = space
indent_size = 4 indent_size = 4
[*.java]
indent_style = space
indent_size = 4
[*.go] [*.go]
indent_style = tab indent_style = tab
indent_size = 8 indent_size = 8
@ -45,6 +37,3 @@ indent_size = 8
[parser.c] [parser.c]
indent_size = 2 indent_size = 2
[{alloc,array,parser}.h]
indent_size = 2

View file

@ -0,0 +1,11 @@
prefix=@PREFIX@
libdir=@LIBDIR@
includedir=@INCLUDEDIR@
Name: tree-sitter-PARSER_NAME
Description: CAMEL_PARSER_NAME grammar for tree-sitter
URL: @URL@
Version: @VERSION@
Requires: @REQUIRES@
Libs: -L${libdir} @ADDITIONAL_LIBS@ -ltree-sitter-PARSER_NAME
Cflags: -I${includedir}

Some files were not shown because too many files have changed in this diff.