Compare commits

..

2 commits

583 changed files with 29263 additions and 58393 deletions

View file

@ -1,2 +0,0 @@
[alias]
xtask = "run --package xtask --"

View file

@ -1,2 +0,0 @@
target
.git

View file

@ -10,9 +10,6 @@ insert_final_newline = true
[*.rs] [*.rs]
indent_size = 4 indent_size = 4
[*.{zig,zon}]
indent_size = 4
[Makefile] [Makefile]
indent_style = tab indent_style = tab
indent_size = 8 indent_size = 8

1
.envrc
View file

@ -1 +0,0 @@
use flake

1
.gitattributes vendored
View file

@ -3,4 +3,5 @@
/lib/src/unicode/*.h linguist-vendored /lib/src/unicode/*.h linguist-vendored
/lib/src/unicode/LICENSE linguist-vendored /lib/src/unicode/LICENSE linguist-vendored
/cli/src/generate/prepare_grammar/*.json -diff
Cargo.lock -diff Cargo.lock -diff

15
.github/FUNDING.yml vendored
View file

@ -1,15 +0,0 @@
# These are supported funding model platforms
github: tree-sitter
patreon: # Replace with a single Patreon username
open_collective: tree-sitter # Replace with a single Open Collective username
ko_fi: amaanq
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
thanks_dev: # Replace with a single thanks.dev username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

View file

@ -1,6 +1,6 @@
name: Bug Report name: Bug Report
description: Report a problem description: Report a problem
type: Bug labels: [bug]
body: body:
- type: textarea - type: textarea
attributes: attributes:
@ -13,11 +13,9 @@ body:
attributes: attributes:
label: "Steps to reproduce" label: "Steps to reproduce"
placeholder: | placeholder: |
```sh
git clone --depth=1 https://github.com/tree-sitter/tree-sitter-ruby git clone --depth=1 https://github.com/tree-sitter/tree-sitter-ruby
cd tree-sitter-ruby cd tree-sitter-ruby
tree-sitter generate tree-sitter generate
```
validations: validations:
required: true required: true

View file

@ -1,6 +1,6 @@
name: Feature request name: Feature request
description: Request an enhancement description: Request an enhancement
type: Feature labels: [enhancement]
body: body:
- type: markdown - type: markdown
attributes: attributes:

View file

@ -1,25 +0,0 @@
name: Cache
description: This action caches fixtures
outputs:
cache-hit:
description: Cache hit
value: ${{ steps.cache.outputs.cache-hit }}
runs:
using: composite
steps:
- uses: actions/cache@v4
id: cache
with:
path: |
test/fixtures/grammars
target/release/tree-sitter-*.wasm
key: fixtures-${{ join(matrix.*, '_') }}-${{ hashFiles(
'crates/generate/src/**',
'lib/src/parser.h',
'lib/src/array.h',
'lib/src/alloc.h',
'test/fixtures/grammars/*/**/src/*.c',
'.github/actions/cache/action.yml') }}

72
.github/cliff.toml vendored
View file

@ -1,72 +0,0 @@
[changelog]
# changelog header
header = """
# Changelog\n
"""
# template for the changelog body
# https://tera.netlify.app/docs/#introduction
body = """
{% if version %}\
## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }}
{% else %}\
## [unreleased]
{% endif %}\
{% for group, commits in commits | group_by(attribute="group") %}
### {{ group | striptags | upper_first }}
{% for commit in commits%}\
{% if not commit.scope %}\
- {{ commit.message | upper_first }}\
{% if commit.remote.pr_number %} (<https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}/pull/{{ commit.remote.pr_number }}>){%- endif %}
{% endif %}\
{% endfor %}\
{% for group, commits in commits | group_by(attribute="scope") %}\
{% for commit in commits %}\
- **{{commit.scope}}**: {{ commit.message | upper_first }}\
{% if commit.remote.pr_number %} (<https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}/pull/{{ commit.remote.pr_number }}>){%- endif %}
{% endfor %}\
{% endfor %}
{% endfor %}
"""
# remove the leading and trailing whitespace from the template
trim = true
[git]
# parse the commits based on https://www.conventionalcommits.org
conventional_commits = true
# filter out the commits that are not conventional
filter_unconventional = false
# process each line of a commit as an individual commit
split_commits = false
# regex for preprocessing the commit messages
commit_preprocessors = [
# { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](https://github.com/neovim/neovim/issues/${2}))"},
]
# regex for parsing and grouping commits
commit_parsers = [
{ message = "!:", group = "<!-- 0 -->Breaking" },
{ message = "^feat", group = "<!-- 1 -->Features" },
{ message = "^fix", group = "<!-- 2 -->Bug Fixes" },
{ message = "^perf", group = "<!-- 3 -->Performance" },
{ message = "^doc", group = "<!-- 4 -->Documentation" },
{ message = "^refactor", group = "<!-- 5 -->Refactor" },
{ message = "^test", group = "<!-- 6 -->Testing" },
{ message = "^build", group = "<!-- 7 -->Build System and CI" },
{ message = "^ci", group = "<!-- 7 -->Build System and CI" },
{ message = ".*", group = "<!-- 8 -->Other" },
]
# filter out the commits that are not matched by commit parsers
filter_commits = false
# glob pattern for matching git tags
tag_pattern = "v[0-9]*"
# regex for skipping tags
skip_tags = "v0.1.0-beta.1"
# regex for ignoring tags
ignore_tags = ""
# sort the tags chronologically
date_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "oldest"
[remote.github]
owner = "tree-sitter"
repo = "tree-sitter"

View file

@ -4,50 +4,15 @@ updates:
directory: "/" directory: "/"
schedule: schedule:
interval: "weekly" interval: "weekly"
cooldown:
default-days: 3
commit-message: commit-message:
prefix: "build(deps)" prefix: "build(deps)"
labels:
- "dependencies"
- "cargo"
groups:
cargo:
patterns: ["*"]
ignore: ignore:
- dependency-name: "*" - dependency-name: "*"
update-types: ["version-update:semver-major", "version-update:semver-minor"] update-types: ["version-update:semver-patch"]
- package-ecosystem: "github-actions" - package-ecosystem: "github-actions"
directory: "/" directory: "/"
schedule: schedule:
interval: "weekly" interval: "weekly"
cooldown:
default-days: 3
commit-message: commit-message:
prefix: "ci" prefix: "ci"
labels:
- "dependencies"
- "github-actions"
groups:
actions:
patterns: ["*"]
- package-ecosystem: "npm"
versioning-strategy: increase
directories:
- "/crates/npm"
- "/crates/eslint"
- "/lib/binding_web"
schedule:
interval: "weekly"
cooldown:
default-days: 3
commit-message:
prefix: "build(deps)"
labels:
- "dependencies"
- "npm"
groups:
npm:
patterns: ["*"]

View file

@ -1,29 +0,0 @@
module.exports = async ({ github, context }) => {
let target = context.payload.issue;
if (target) {
await github.rest.issues.update({
...context.repo,
issue_number: target.number,
state: "closed",
state_reason: "not_planned",
title: "[spam]",
body: "",
type: null,
});
} else {
target = context.payload.pull_request;
await github.rest.pulls.update({
...context.repo,
pull_number: target.number,
state: "closed",
title: "[spam]",
body: "",
});
}
await github.rest.issues.lock({
...context.repo,
issue_number: target.number,
lock_reason: "spam",
});
};

17
.github/scripts/cross.sh vendored Executable file
View file

@ -0,0 +1,17 @@
#!/bin/bash
# set -x
set -e
if [ "$BUILD_CMD" != "cross" ]; then
echo "cross.sh - is a helper to assist only in cross compiling environments" >&2
echo "To use this tool set the BUILD_CMD env var to the \"cross\" value" >&2
exit 111
fi
if [ -z "$CROSS_IMAGE" ]; then
echo "The CROSS_IMAGE env var should be provided" >&2
exit 111
fi
docker run --rm -v /home/runner:/home/runner -w "$PWD" "$CROSS_IMAGE" "$@"

19
.github/scripts/make.sh vendored Executable file
View file

@ -0,0 +1,19 @@
#!/bin/bash
# set -x
set -e
if [ "$BUILD_CMD" == "cross" ]; then
if [ -z "$CC" ]; then
echo "make.sh: CC is not set" >&2
exit 111
fi
if [ -z "$AR" ]; then
echo "make.sh: AR is not set" >&2
exit 111
fi
cross.sh make CC=$CC AR=$AR "$@"
else
make "$@"
fi

View file

@ -1,16 +0,0 @@
module.exports = async ({ github, context }) => {
const requestedReviewers = await github.rest.pulls.listRequestedReviewers({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number,
});
const reviewers = requestedReviewers.data.users.map((e) => e.login);
github.rest.pulls.removeRequestedReviewers({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number,
reviewers: reviewers,
});
};

28
.github/scripts/tree-sitter.sh vendored Executable file
View file

@ -0,0 +1,28 @@
#!/bin/bash
# set -x
set -e
if [ -z "$ROOT" ]; then
echo "The ROOT env var should be set to absolute path of a repo root folder" >&2
exit 111
fi
if [ -z "$TARGET" ]; then
echo "The TARGET env var should be equal to a \`cargo build --target <TARGET>\` command value" >&2
exit 111
fi
tree_sitter="$ROOT"/target/"$TARGET"/release/tree-sitter
if [ "$BUILD_CMD" == "cross" ]; then
if [ -z "$CROSS_RUNNER" ]; then
echo "The CROSS_RUNNER env var should be set to a CARGO_TARGET_*_RUNNER env var value" >&2
echo "that is available in a docker image used by the cross tool under the hood" >&2
exit 111
fi
cross.sh $CROSS_RUNNER "$tree_sitter" "$@"
else
"$tree_sitter" "$@"
fi

View file

@ -1,25 +0,0 @@
module.exports = async ({ github, context, core }) => {
if (context.eventName !== 'pull_request') return;
const prNumber = context.payload.pull_request.number;
const owner = context.repo.owner;
const repo = context.repo.repo;
const { data: files } = await github.rest.pulls.listFiles({
owner,
repo,
pull_number: prNumber
});
const changedFiles = files.map(file => file.filename);
const wasmStdLibSrc = 'crates/language/wasm/';
const dirChanged = changedFiles.some(file => file.startsWith(wasmStdLibSrc));
if (!dirChanged) return;
const wasmStdLibHeader = 'lib/src/wasm/wasm-stdlib.h';
const requiredChanged = changedFiles.includes(wasmStdLibHeader);
if (!requiredChanged) core.setFailed(`Changes detected in ${wasmStdLibSrc} but ${wasmStdLibHeader} was not modified.`);
};

View file

@ -1,31 +0,0 @@
name: Backport Pull Request
on:
pull_request_target:
types: [closed, labeled]
permissions:
contents: write
pull-requests: write
jobs:
backport:
if: github.event.pull_request.merged
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Create app token
uses: actions/create-github-app-token@v2
id: app-token
with:
app-id: ${{ vars.BACKPORT_APP }}
private-key: ${{ secrets.BACKPORT_KEY }}
- name: Create backport PR
uses: korthout/backport-action@v4
with:
pull_title: "${pull_title}"
label_pattern: "^ci:backport ([^ ]+)$"
github_token: ${{ steps.app-token.outputs.token }}

View file

@ -1,30 +0,0 @@
name: Check Bindgen Output
on:
pull_request:
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
push:
branches: [master]
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
jobs:
check-bindgen:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up stable Rust toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
- name: Generate bindings
run: cargo xtask generate-bindings
- name: Check if the bindgen output changed
run: git diff --exit-code lib/binding_rust/bindings.rs

View file

@ -1,9 +1,14 @@
name: Build & Test name: Build & Test
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-D warnings"
CROSS_DEBUG: 1
on: on:
workflow_call: workflow_call:
inputs: inputs:
run-test: run_test:
default: true default: true
type: boolean type: boolean
@ -16,296 +21,180 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
platform: platform:
- linux-arm64 - linux-arm64 #
- linux-arm - linux-arm #
- linux-x64 - linux-x64 #
- linux-x86 - linux-x86 #
- linux-powerpc64 - linux-powerpc64 #
- windows-arm64 - windows-arm64 #
- windows-x64 - windows-x64 # <-- No C library build - requires an additional adapted Makefile for `cl.exe` compiler
- windows-x86 - windows-x86 # -- // --
- macos-arm64 - macos-arm64 #
- macos-x64 - macos-x64 #
- wasm32
include: include:
# When adding a new `target`: # When adding a new `target`:
# 1. Define a new platform alias above # 1. Define a new platform alias above
# 2. Add a new record to the matrix map in `crates/cli/npm/install.js` # 2. Add a new record to a matrix map in `cli/npm/install.js`
- { platform: linux-arm64 , target: aarch64-unknown-linux-gnu , os: ubuntu-24.04-arm } - { platform: linux-arm64 , target: aarch64-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
- { platform: linux-arm , target: armv7-unknown-linux-gnueabihf , os: ubuntu-24.04-arm } - { platform: linux-arm , target: arm-unknown-linux-gnueabi , os: ubuntu-latest , use-cross: true }
- { platform: linux-x64 , target: x86_64-unknown-linux-gnu , os: ubuntu-24.04 } - { platform: linux-x64 , target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 , enable-wasm: true } #2272
- { platform: linux-x86 , target: i686-unknown-linux-gnu , os: ubuntu-24.04 } - { platform: linux-x86 , target: i686-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
- { platform: linux-powerpc64 , target: powerpc64-unknown-linux-gnu , os: ubuntu-24.04 } - { platform: linux-powerpc64 , target: powerpc64-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
- { platform: windows-arm64 , target: aarch64-pc-windows-msvc , os: windows-11-arm } - { platform: windows-arm64 , target: aarch64-pc-windows-msvc , os: windows-latest }
- { platform: windows-x64 , target: x86_64-pc-windows-msvc , os: windows-2025 } - { platform: windows-x64 , target: x86_64-pc-windows-msvc , os: windows-latest , enable-wasm: true }
- { platform: windows-x86 , target: i686-pc-windows-msvc , os: windows-2025 } - { platform: windows-x86 , target: i686-pc-windows-msvc , os: windows-latest }
- { platform: macos-arm64 , target: aarch64-apple-darwin , os: macos-15 } - { platform: macos-arm64 , target: aarch64-apple-darwin , os: macos-14 , enable-wasm: true }
- { platform: macos-x64 , target: x86_64-apple-darwin , os: macos-15-intel } - { platform: macos-x64 , target: x86_64-apple-darwin , os: macos-latest , enable-wasm: true }
- { platform: wasm32 , target: wasm32-unknown-unknown , os: ubuntu-24.04 }
# Extra features # Cross compilers for C library
- { platform: linux-arm64 , features: wasm } - { platform: linux-arm64 , cc: aarch64-linux-gnu-gcc , ar: aarch64-linux-gnu-ar }
- { platform: linux-x64 , features: wasm } - { platform: linux-arm , cc: arm-linux-gnueabi-gcc , ar: arm-linux-gnueabi-ar }
- { platform: macos-arm64 , features: wasm } - { platform: linux-x86 , cc: i686-linux-gnu-gcc , ar: i686-linux-gnu-ar }
- { platform: macos-x64 , features: wasm } - { platform: linux-powerpc64 , cc: powerpc64-linux-gnu-gcc , ar: powerpc64-linux-gnu-ar }
# Cross-compilation # See #2041 tree-sitter issue
- { platform: linux-arm , cross: true } - { platform: windows-x64 , rust-test-threads: 1 }
- { platform: linux-x86 , cross: true } - { platform: windows-x86 , rust-test-threads: 1 }
- { platform: linux-powerpc64 , cross: true }
# Compile-only # CLI only build
- { platform: wasm32 , no-run: true } - { platform: windows-arm64 , cli-only: true }
env: env:
CARGO_TERM_COLOR: always BUILD_CMD: cargo
RUSTFLAGS: -D warnings EMSCRIPTEN_VERSION: ""
EXE: ${{ contains(matrix.target, 'windows') && '.exe' || '' }}
defaults: defaults:
run: run:
shell: bash shell: bash
steps: steps:
- name: Checkout repository - uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Set up cross-compilation - name: Read Emscripten version
if: matrix.cross
run: | run: |
for target in armv7-unknown-linux-gnueabihf i686-unknown-linux-gnu powerpc64-unknown-linux-gnu; do echo "EMSCRIPTEN_VERSION=$(cat cli/loader/emscripten-version)" >> $GITHUB_ENV
camel_target=${target//-/_}; target_cc=${target/-unknown/}
printf 'CC_%s=%s\n' "$camel_target" "${target_cc/v7/}-gcc"
printf 'AR_%s=%s\n' "$camel_target" "${target_cc/v7/}-ar"
printf 'CARGO_TARGET_%s_LINKER=%s\n' "${camel_target^^}" "${target_cc/v7/}-gcc"
done >> $GITHUB_ENV
{
printf 'CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm -L /usr/arm-linux-gnueabihf\n'
printf 'CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64 -L /usr/powerpc64-linux-gnu\n'
} >> $GITHUB_ENV
- name: Get emscripten version
if: contains(matrix.features, 'wasm')
run: printf 'EMSCRIPTEN_VERSION=%s\n' "$(<crates/loader/emscripten-version)" >> $GITHUB_ENV
- name: Install Emscripten - name: Install Emscripten
if: contains(matrix.features, 'wasm') if: ${{ !matrix.cli-only && !matrix.use-cross }}
uses: mymindstorm/setup-emsdk@v14 uses: mymindstorm/setup-emsdk@v14
with: with:
version: ${{ env.EMSCRIPTEN_VERSION }} version: ${{ env.EMSCRIPTEN_VERSION }}
- name: Set up Rust - run: rustup toolchain install stable --profile minimal
uses: actions-rust-lang/setup-rust-toolchain@v1 - run: rustup target add ${{ matrix.target }}
- uses: Swatinem/rust-cache@v2
- name: Install cross
if: ${{ matrix.use-cross }}
uses: taiki-e/install-action@v2
with: with:
target: ${{ matrix.target }} tool: cross
- name: Install cross-compilation toolchain - name: Build custom cross image
if: matrix.cross if: ${{ matrix.use-cross && matrix.os == 'ubuntu-latest' }}
run: | run: |
sudo apt-get update -qy cd ..
if [[ $PLATFORM == linux-arm ]]; then
sudo apt-get install -qy {binutils,gcc}-arm-linux-gnueabihf qemu-user target="${{ matrix.target }}"
elif [[ $PLATFORM == linux-x86 ]]; then image=ghcr.io/cross-rs/$target:custom
sudo apt-get install -qy {binutils,gcc}-i686-linux-gnu echo "CROSS_IMAGE=$image" >> $GITHUB_ENV
elif [[ $PLATFORM == linux-powerpc64 ]]; then
sudo apt-get install -qy {binutils,gcc}-powerpc64-linux-gnu qemu-user echo "[target.$target]" >> Cross.toml
echo "image = \"$image\"" >> Cross.toml
echo "CROSS_CONFIG=$PWD/Cross.toml" >> $GITHUB_ENV
echo "FROM ghcr.io/cross-rs/$target:edge" >> Dockerfile
echo "ENV DEBIAN_FRONTEND=noninteractive" >> Dockerfile
echo "RUN apt-get update && apt-get install -y nodejs" >> Dockerfile
docker build -t $image .
- name: Setup env extras
env:
RUST_TEST_THREADS: ${{ matrix.rust-test-threads || '' }}
USE_CROSS: ${{ matrix.use-cross }}
TARGET: ${{ matrix.target }}
CC: ${{ matrix.cc }}
AR: ${{ matrix.ar }}
IS_WINDOWS: ${{ contains(matrix.os, 'windows') }}
ENABLE_WASM: ${{ matrix.enable-wasm }}
run: |
PATH="$PWD/.github/scripts:$PATH"
echo "$PWD/.github/scripts" >> $GITHUB_PATH
echo "TREE_SITTER=tree-sitter.sh" >> $GITHUB_ENV
echo "TARGET=$TARGET" >> $GITHUB_ENV
echo "ROOT=$PWD" >> $GITHUB_ENV
[ -n "$RUST_TEST_THREADS" ] && \
echo "RUST_TEST_THREADS=$RUST_TEST_THREADS" >> $GITHUB_ENV
[ -n "$CC" ] && echo "CC=$CC" >> $GITHUB_ENV
[ -n "$AR" ] && echo "AR=$AR" >> $GITHUB_ENV
[ "$IS_WINDOWS" = "false" ] && echo "CFLAGS=-Werror" >> $GITHUB_ENV
if [ "$ENABLE_WASM" == "true" ]; then
echo "CLI_FEATURES=wasm" >> $GITHUB_ENV
fi fi
env:
PLATFORM: ${{ matrix.platform }}
- name: Install MinGW and Clang (Windows x64 MSYS2) if [ "$USE_CROSS" == "true" ]; then
if: matrix.platform == 'windows-x64' echo "BUILD_CMD=cross" >> $GITHUB_ENV
uses: msys2/setup-msys2@v2 runner=$(BUILD_CMD=cross cross.sh bash -c "env | sed -nr '/^CARGO_TARGET_.*_RUNNER=/s///p'")
with: [ -n "$runner" ] && echo "CROSS_RUNNER=$runner" >> $GITHUB_ENV
update: true
install: |
mingw-w64-x86_64-toolchain
mingw-w64-x86_64-clang
mingw-w64-x86_64-make
mingw-w64-x86_64-cmake
# TODO: Remove RUSTFLAGS="--cap-lints allow" once we use a wasmtime release that addresses
# the `mismatched-lifetime-syntaxes` lint
- name: Build wasmtime library (Windows x64 MSYS2)
if: contains(matrix.features, 'wasm') && matrix.platform == 'windows-x64'
run: |
mkdir -p target
WASMTIME_VERSION=$(cargo metadata --format-version=1 --locked --features wasm | \
jq -r '.packages[] | select(.name == "wasmtime-c-api-impl") | .version')
curl -LSs "$WASMTIME_REPO/archive/refs/tags/v${WASMTIME_VERSION}.tar.gz" | tar xzf - -C target
cd target/wasmtime-${WASMTIME_VERSION}
cmake -S crates/c-api -B target/c-api \
-DCMAKE_INSTALL_PREFIX="$PWD/artifacts" \
-DWASMTIME_DISABLE_ALL_FEATURES=ON \
-DWASMTIME_FEATURE_CRANELIFT=ON \
-DWASMTIME_TARGET='x86_64-pc-windows-gnu'
cmake --build target/c-api && cmake --install target/c-api
printf 'CMAKE_PREFIX_PATH=%s\n' "$PWD/artifacts" >> $GITHUB_ENV
env:
WASMTIME_REPO: https://github.com/bytecodealliance/wasmtime
RUSTFLAGS: ${{ env.RUSTFLAGS }} --cap-lints allow
- name: Build C library (Windows x64 MSYS2 CMake)
if: matrix.platform == 'windows-x64'
shell: msys2 {0}
run: |
cmake -G Ninja -S . -B build/static \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DTREE_SITTER_FEATURE_WASM=$WASM \
-DCMAKE_C_COMPILER=clang
cmake --build build/static
cmake -G Ninja -S . -B build/shared \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DTREE_SITTER_FEATURE_WASM=$WASM \
-DCMAKE_C_COMPILER=clang
cmake --build build/shared
rm -rf \
build/{static,shared} \
"${CMAKE_PREFIX_PATH}/artifacts" \
target/wasmtime-${WASMTIME_VERSION}
env:
WASM: ${{ contains(matrix.features, 'wasm') && 'ON' || 'OFF' }}
# TODO: Remove RUSTFLAGS="--cap-lints allow" once we use a wasmtime release that addresses
# the `mismatched-lifetime-syntaxes` lint
- name: Build wasmtime library
if: contains(matrix.features, 'wasm')
run: |
mkdir -p target
WASMTIME_VERSION=$(cargo metadata --format-version=1 --locked --features wasm | \
jq -r '.packages[] | select(.name == "wasmtime-c-api-impl") | .version')
curl -LSs "$WASMTIME_REPO/archive/refs/tags/v${WASMTIME_VERSION}.tar.gz" | tar xzf - -C target
cd target/wasmtime-${WASMTIME_VERSION}
cmake -S crates/c-api -B target/c-api \
-DCMAKE_INSTALL_PREFIX="$PWD/artifacts" \
-DWASMTIME_DISABLE_ALL_FEATURES=ON \
-DWASMTIME_FEATURE_CRANELIFT=ON \
-DWASMTIME_TARGET='${{ matrix.target }}'
cmake --build target/c-api && cmake --install target/c-api
printf 'CMAKE_PREFIX_PATH=%s\n' "$PWD/artifacts" >> $GITHUB_ENV
env:
WASMTIME_REPO: https://github.com/bytecodealliance/wasmtime
RUSTFLAGS: ${{ env.RUSTFLAGS }} --cap-lints allow
- name: Build C library (make)
if: runner.os != 'Windows'
run: |
if [[ $PLATFORM == linux-arm ]]; then
CC=arm-linux-gnueabihf-gcc; AR=arm-linux-gnueabihf-ar
elif [[ $PLATFORM == linux-x86 ]]; then
CC=i686-linux-gnu-gcc; AR=i686-linux-gnu-ar
elif [[ $PLATFORM == linux-powerpc64 ]]; then
CC=powerpc64-linux-gnu-gcc; AR=powerpc64-linux-gnu-ar
else
CC=gcc; AR=ar
fi fi
make -j CFLAGS="$CFLAGS" CC=$CC AR=$AR
env:
PLATFORM: ${{ matrix.platform }}
CFLAGS: -g -Werror -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types
- name: Build C library (CMake) - name: Build C library
if: "!matrix.cross" if: ${{ !contains(matrix.os, 'windows') }} # Requires an additional adapted Makefile for `cl.exe` compiler
run: | run: make.sh -j
cmake -S . -B build/static \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DTREE_SITTER_FEATURE_WASM=$WASM
cmake --build build/static --verbose
cmake -S . -B build/shared \ - name: Build wasm library
-DBUILD_SHARED_LIBS=ON \ if: ${{ !matrix.cli-only && !matrix.use-cross }} # No sense to build on the same Github runner hosts many times
-DCMAKE_BUILD_TYPE=Debug \ run: script/build-wasm
-DCMAKE_COMPILE_WARNING_AS_ERROR=ON \
-DTREE_SITTER_FEATURE_WASM=$WASM
cmake --build build/shared --verbose
env:
CC: ${{ contains(matrix.platform, 'linux') && 'clang' || '' }}
WASM: ${{ contains(matrix.features, 'wasm') && 'ON' || 'OFF' }}
- name: Build Wasm library - name: Build CLI
if: contains(matrix.features, 'wasm') run: $BUILD_CMD build --release --target=${{ matrix.target }} --features=${CLI_FEATURES}
shell: bash
run: |
cd lib/binding_web
npm ci
CJS=true npm run build
CJS=true npm run build:debug
npm run build
npm run build:debug
- name: Check no_std builds
if: inputs.run-test && !matrix.no-run
working-directory: lib
shell: bash
run: cargo check --no-default-features --target='${{ matrix.target }}'
- name: Build target
run: cargo build --release --target='${{ matrix.target }}' --features='${{ matrix.features }}' $PACKAGE
env:
PACKAGE: ${{ matrix.platform == 'wasm32' && '-p tree-sitter' || '' }}
- name: Cache fixtures
id: cache
if: inputs.run-test && !matrix.no-run
uses: ./.github/actions/cache
- name: Fetch fixtures - name: Fetch fixtures
if: inputs.run-test && !matrix.no-run if: ${{ !matrix.cli-only && inputs.run_test }} # Don't fetch fixtures for only CLI building targets
run: cargo run -p xtask --target='${{ matrix.target }}' -- fetch-fixtures run: script/fetch-fixtures
- name: Generate fixtures - name: Generate fixtures
if: inputs.run-test && !matrix.no-run && steps.cache.outputs.cache-hit != 'true' if: ${{ !matrix.cli-only && inputs.run_test }} # Can't natively run CLI on Github runner's host
run: cargo run -p xtask --target='${{ matrix.target }}' -- generate-fixtures run: script/generate-fixtures
- name: Generate Wasm fixtures - name: Generate WASM fixtures
if: inputs.run-test && !matrix.no-run && contains(matrix.features, 'wasm') && steps.cache.outputs.cache-hit != 'true' if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test }} # See comment for the "Build wasm library" step
run: cargo run -p xtask --target='${{ matrix.target }}' -- generate-fixtures --wasm run: script/generate-fixtures-wasm
- name: Run main tests - name: Run main tests
if: inputs.run-test && !matrix.no-run if: ${{ !matrix.cli-only && inputs.run_test }} # Can't natively run CLI on Github runner's host
run: cargo test --target='${{ matrix.target }}' --features='${{ matrix.features }}' run: $BUILD_CMD test --target=${{ matrix.target }} --features=${CLI_FEATURES}
- name: Run Wasm tests - name: Run wasm tests
if: inputs.run-test && !matrix.no-run && contains(matrix.features, 'wasm') if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test }} # See comment for the "Build wasm library" step
run: cargo run -p xtask --target='${{ matrix.target }}' -- test-wasm run: script/test-wasm
- name: Run benchmarks
if: ${{ !matrix.cli-only && !matrix.use-cross && inputs.run_test }} # Cross-compiled benchmarks make no sense
run: $BUILD_CMD bench benchmark -p tree-sitter-cli --target=${{ matrix.target }}
- name: Upload CLI artifact - name: Upload CLI artifact
if: "!matrix.no-run" uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v6
with: with:
name: tree-sitter.${{ matrix.platform }} name: tree-sitter.${{ matrix.platform }}
path: target/${{ matrix.target }}/release/tree-sitter${{ contains(matrix.target, 'windows') && '.exe' || '' }} path: target/${{ matrix.target }}/release/tree-sitter${{ env.EXE }}
if-no-files-found: error if-no-files-found: error
retention-days: 7 retention-days: 7
- name: Upload Wasm artifacts - name: Upload WASM artifacts
if: matrix.platform == 'linux-x64' if: ${{ matrix.platform == 'linux-x64' }}
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: tree-sitter.wasm name: tree-sitter.wasm
path: | path: |
lib/binding_web/web-tree-sitter.js lib/binding_web/tree-sitter.js
lib/binding_web/web-tree-sitter.js.map lib/binding_web/tree-sitter.wasm
lib/binding_web/web-tree-sitter.cjs
lib/binding_web/web-tree-sitter.cjs.map
lib/binding_web/web-tree-sitter.wasm
lib/binding_web/web-tree-sitter.wasm.map
lib/binding_web/debug/web-tree-sitter.cjs
lib/binding_web/debug/web-tree-sitter.cjs.map
lib/binding_web/debug/web-tree-sitter.js
lib/binding_web/debug/web-tree-sitter.js.map
lib/binding_web/debug/web-tree-sitter.wasm
lib/binding_web/debug/web-tree-sitter.wasm.map
lib/binding_web/lib/*.c
lib/binding_web/lib/*.h
lib/binding_web/lib/*.ts
lib/binding_web/src/*.ts
if-no-files-found: error if-no-files-found: error
retention-days: 7 retention-days: 7

24
.github/workflows/checks.yml vendored Normal file
View file

@ -0,0 +1,24 @@
name: Full Rust codebase checks
on:
workflow_call:
jobs:
run:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: rustup toolchain install stable --profile minimal
- uses: Swatinem/rust-cache@v2
- run: make lint
check_c_warnings:
name: Check C warnings
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Make C library to check that it's able to compile without warnings
run: make -j CFLAGS="-Werror"

View file

@ -2,48 +2,22 @@ name: CI
on: on:
pull_request: pull_request:
paths-ignore:
- docs/**
- "**/README.md"
- CONTRIBUTING.md
- LICENSE
- cli/src/templates
push: push:
branches: [master] branches:
paths-ignore: - 'master'
- docs/**
- "**/README.md"
- CONTRIBUTING.md
- LICENSE
- cli/src/templates
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref }} group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.event_name != 'push' }} cancel-in-progress: true
jobs: jobs:
checks: checks:
runs-on: ubuntu-latest uses: ./.github/workflows/checks.yml
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up stable Rust toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
components: clippy, rustfmt
- name: Lint files
run: |
make lint
make lint-web
sanitize: sanitize:
needs: checks
uses: ./.github/workflows/sanitize.yml uses: ./.github/workflows/sanitize.yml
build: build:
needs: checks
uses: ./.github/workflows/build.yml uses: ./.github/workflows/build.yml
check-wasm-stdlib:
uses: ./.github/workflows/wasm_stdlib.yml

View file

@ -1,50 +0,0 @@
name: Deploy Docs
on:
push:
branches: [master]
paths: [docs/**]
workflow_dispatch:
jobs:
deploy-docs:
runs-on: ubuntu-latest
permissions:
contents: write
pages: write
id-token: write
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up Rust
uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Install mdbook
env:
GH_TOKEN: ${{ github.token }}
run: |
jq_expr='.assets[] | select(.name | contains("x86_64-unknown-linux-gnu")) | .browser_download_url'
url=$(gh api repos/rust-lang/mdbook/releases/tags/v0.4.52 --jq "$jq_expr")
mkdir mdbook
curl -sSL "$url" | tar -xz -C mdbook
printf '%s/mdbook\n' "$PWD" >> "$GITHUB_PATH"
- name: Install mdbook-admonish
run: cargo install mdbook-admonish
- name: Build Book
run: mdbook build docs
- name: Setup Pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v4
with:
path: docs/book
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4

View file

@ -1,69 +0,0 @@
name: nvim-treesitter parser tests
on:
pull_request:
paths:
- 'crates/cli/**'
- 'crates/config/**'
- 'crates/generate/**'
- 'crates/loader/**'
- '.github/workflows/nvim_ts.yml'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check_compilation:
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
type: [generate, build]
name: ${{ matrix.os }} - ${{ matrix.type }}
runs-on: ${{ matrix.os }}
env:
NVIM: ${{ matrix.os == 'windows-latest' && 'nvim-win64\\bin\\nvim.exe' || 'nvim' }}
NVIM_TS_DIR: nvim-treesitter
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@v6
with:
repository: nvim-treesitter/nvim-treesitter
path: ${{ env.NVIM_TS_DIR }}
ref: main
- if: runner.os != 'Windows'
run: echo ${{ github.workspace }}/target/release >> $GITHUB_PATH
- if: runner.os == 'Windows'
run: echo ${{ github.workspace }}/target/release >> $env:GITHUB_PATH
- uses: actions-rust-lang/setup-rust-toolchain@v1
- run: cargo build --release
- uses: ilammy/msvc-dev-cmd@v1
- name: Install and prepare Neovim
run: bash ./scripts/ci-install.sh
working-directory: ${{ env.NVIM_TS_DIR }}
- if: matrix.type == 'generate'
name: Generate and compile parsers
run: $NVIM -l ./scripts/install-parsers.lua --generate --max-jobs=2
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash
- if: matrix.type == 'build'
name: Compile parsers
run: $NVIM -l ./scripts/install-parsers.lua --max-jobs=10
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash
- if: "!cancelled()"
name: Check query files
run: $NVIM -l ./scripts/check-queries.lua
working-directory: ${{ env.NVIM_TS_DIR }}
shell: bash

View file

@ -1,5 +1,4 @@
name: Release name: Release
on: on:
workflow_dispatch: workflow_dispatch:
push: push:
@ -10,22 +9,19 @@ jobs:
build: build:
uses: ./.github/workflows/build.yml uses: ./.github/workflows/build.yml
with: with:
run-test: false run_test: false
release: release:
name: Release on GitHub name: Release
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: build needs: build
permissions: permissions:
id-token: write
attestations: write
contents: write contents: write
steps: steps:
- name: Checkout repository - uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Download build artifacts - name: Download build artifacts
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
path: artifacts path: artifacts
@ -35,13 +31,9 @@ jobs:
- name: Prepare release artifacts - name: Prepare release artifacts
run: | run: |
mkdir -p target web mkdir -p target
mv artifacts/tree-sitter.wasm/* web/ mv artifacts/tree-sitter.wasm/* target/
tar -czf target/web-tree-sitter.tar.gz -C web .
rm -r artifacts/tree-sitter.wasm rm -r artifacts/tree-sitter.wasm
for platform in $(cd artifacts; ls | sed 's/^tree-sitter\.//'); do for platform in $(cd artifacts; ls | sed 's/^tree-sitter\.//'); do
exe=$(ls artifacts/tree-sitter.$platform/tree-sitter*) exe=$(ls artifacts/tree-sitter.$platform/tree-sitter*)
gzip --stdout --name $exe > target/tree-sitter-$platform.gz gzip --stdout --name $exe > target/tree-sitter-$platform.gz
@ -49,81 +41,56 @@ jobs:
rm -rf artifacts rm -rf artifacts
ls -l target/ ls -l target/
- name: Generate attestations
uses: actions/attest-build-provenance@v3
with:
subject-path: |
target/tree-sitter-*.gz
target/web-tree-sitter.tar.gz
- name: Create release - name: Create release
run: |- uses: softprops/action-gh-release@v1
gh release create $GITHUB_REF_NAME \ with:
target/tree-sitter-*.gz \ name: ${{ github.ref_name }}
target/web-tree-sitter.tar.gz tag_name: ${{ github.ref_name }}
env: fail_on_unmatched_files: true
GH_TOKEN: ${{ github.token }} files: |
target/tree-sitter-*.gz
target/tree-sitter.wasm
target/tree-sitter.js
crates_io: crates_io:
name: Publish packages to Crates.io name: Publish CLI to Crates.io
runs-on: ubuntu-latest runs-on: ubuntu-latest
environment: crates
permissions:
id-token: write
contents: read
needs: release needs: release
steps: steps:
- name: Checkout repository - uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Set up Rust - name: Setup Rust
uses: actions-rust-lang/setup-rust-toolchain@v1 uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Set up registry token - name: Publish CLI to Crates.io
id: auth
uses: rust-lang/crates-io-auth-action@v1
- name: Publish crates to Crates.io
uses: katyo/publish-crates@v2 uses: katyo/publish-crates@v2
with: with:
registry-token: ${{ steps.auth.outputs.token }} registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
npm: npm:
name: Publish packages to npmjs.com name: Publish lib to npmjs.com
runs-on: ubuntu-latest runs-on: ubuntu-latest
environment: npm
permissions:
id-token: write
contents: read
needs: release needs: release
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
directory: [crates/cli/npm, lib/binding_web] directory: ["cli/npm", "lib/binding_web"]
steps: steps:
- name: Checkout repository - uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Set up Node - name: Setup Node
uses: actions/setup-node@v6 uses: actions/setup-node@v4
with: with:
node-version: 24 node-version: 18
registry-url: https://registry.npmjs.org registry-url: "https://registry.npmjs.org"
- name: Set up Rust - name: Publish lib to npmjs.com
uses: actions-rust-lang/setup-rust-toolchain@v1 env:
NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
- name: Build wasm
if: matrix.directory == 'lib/binding_web'
run: | run: |
cd ${{ matrix.directory }} cd ${{ matrix.directory }}
npm ci npm publish
npm run build
npm run build:debug
CJS=true npm run build
CJS=true npm run build:debug
npm run build:dts
- name: Publish to npmjs.com
working-directory: ${{ matrix.directory }}
run: npm publish

View file

@ -1,47 +1,34 @@
name: No response name: no_response
on: on:
schedule: schedule:
- cron: "30 1 * * *" # Run every day at 01:30 - cron: '30 1 * * *' # Run every day at 01:30
workflow_dispatch: workflow_dispatch:
issue_comment: issue_comment:
permissions:
issues: write
pull-requests: write
jobs: jobs:
close: close:
name: Close issues with no response
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps: steps:
- name: Checkout script - uses: actions/checkout@v4
uses: actions/checkout@v6 - uses: actions/github-script@v7
with:
sparse-checkout: .github/scripts/close_unresponsive.js
sparse-checkout-cone-mode: false
- name: Run script
uses: actions/github-script@v8
with: with:
script: | script: |
const script = require('./.github/scripts/close_unresponsive.js') const script = require('./.github/scripts/close_unresponsive.js')
await script({github, context}) await script({github, context})
remove_label: remove_label:
name: Remove response label
if: github.event_name == 'issue_comment' if: github.event_name == 'issue_comment'
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps: steps:
- name: Checkout script - uses: actions/checkout@v4
uses: actions/checkout@v6 - uses: actions/github-script@v7
with:
sparse-checkout: .github/scripts/remove_response_label.js
sparse-checkout-cone-mode: false
- name: Run script
uses: actions/github-script@v8
with: with:
script: | script: |
const script = require('./.github/scripts/remove_response_label.js') const script = require('./.github/scripts/remove_response_label.js')

View file

@ -1,25 +0,0 @@
name: Remove Reviewers
on:
pull_request_target:
types: [converted_to_draft, closed]
permissions:
pull-requests: write
jobs:
remove-reviewers:
runs-on: ubuntu-latest
steps:
- name: Checkout script
uses: actions/checkout@v6
with:
sparse-checkout: .github/scripts/reviewers_remove.js
sparse-checkout-cone-mode: false
- name: Run script
uses: actions/github-script@v8
with:
script: |
const script = require('./.github/scripts/reviewers_remove.js')
await script({github, context})

View file

@ -2,50 +2,49 @@ name: Sanitize
env: env:
CARGO_TERM_COLOR: always CARGO_TERM_COLOR: always
RUSTFLAGS: -D warnings RUSTFLAGS: "-D warnings"
on: on:
workflow_call: workflow_call:
jobs: jobs:
check-undefined-behaviour: check_undefined_behaviour:
name: Sanitizer checks
runs-on: ubuntu-latest runs-on: ubuntu-latest
timeout-minutes: 20
env: env:
TREE_SITTER: ${{ github.workspace }}/target/release/tree-sitter TREE_SITTER: ${{ github.workspace }}/target/release/tree-sitter
steps: steps:
- name: Checkout repository - name: Checkout source code
uses: actions/checkout@v6 uses: actions/checkout@v4
- name: Install UBSAN library - name: Install UBSAN library
run: sudo apt-get update -y && sudo apt-get install -y libubsan1 run: sudo apt-get update -y && sudo apt-get install -y libubsan1
- name: Set up Rust - run: rustup toolchain install stable --profile minimal
uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: Swatinem/rust-cache@v2
- name: Build project - name: Build CLI
run: cargo build --release run: cargo build --release
- name: Cache fixtures - name: Fetch fixtures
uses: ./.github/actions/cache run: script/fetch-fixtures
id: cache
- name: Fetch fixtures - name: Generate fixtures
run: cargo xtask fetch-fixtures run: script/generate-fixtures
- name: Generate fixtures - name: Run main tests with undefined behaviour sanitizer (UBSAN)
if: ${{ steps.cache.outputs.cache-hit != 'true' }} env:
run: cargo xtask generate-fixtures UBSAN_OPTIONS: halt_on_error=1
CFLAGS: -fsanitize=undefined
RUSTFLAGS: ${{ env.RUSTFLAGS }} -lubsan
run: cargo test -- --test-threads 1
- name: Run main tests with undefined behaviour sanitizer (UBSAN) - name: Run main tests with address sanitizer (ASAN)
run: cargo test -- --test-threads 1 env:
env: ASAN_OPTIONS: halt_on_error=1
CFLAGS: -fsanitize=undefined CFLAGS: -fsanitize=address
RUSTFLAGS: ${{ env.RUSTFLAGS }} -lubsan RUSTFLAGS: ${{ env.RUSTFLAGS }} -Zsanitizer=address --cfg=sanitizing
run: |
- name: Run main tests with address sanitizer (ASAN) rustup install nightly
run: cargo test -- --test-threads 1 rustup component add rust-src --toolchain nightly-x86_64-unknown-linux-gnu
env: cargo +nightly test -Z build-std --target x86_64-unknown-linux-gnu -- --test-threads 1
ASAN_OPTIONS: verify_asan_link_order=0
CFLAGS: -fsanitize=address
RUSTFLAGS: ${{ env.RUSTFLAGS }} -lasan --cfg sanitizing

View file

@ -1,29 +0,0 @@
name: Close as spam
on:
issues:
types: [labeled]
pull_request_target:
types: [labeled]
permissions:
issues: write
pull-requests: write
jobs:
spam:
runs-on: ubuntu-latest
if: github.event.label.name == 'spam'
steps:
- name: Checkout script
uses: actions/checkout@v6
with:
sparse-checkout: .github/scripts/close_spam.js
sparse-checkout-cone-mode: false
- name: Run script
uses: actions/github-script@v8
with:
script: |
const script = require('./.github/scripts/close_spam.js')
await script({github, context})

View file

@ -1,41 +0,0 @@
name: Check Wasm Exports
on:
pull_request:
paths:
- lib/include/tree_sitter/api.h
- lib/binding_web/**
- xtask/src/**
push:
branches: [master]
paths:
- lib/include/tree_sitter/api.h
- lib/binding_rust/bindings.rs
- CMakeLists.txt
jobs:
check-wasm-exports:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Set up stable Rust toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
- name: Install wasm-objdump
run: sudo apt-get update -y && sudo apt-get install -y wabt
- name: Build C library (make)
run: make -j CFLAGS="$CFLAGS"
env:
CFLAGS: -g -Werror -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types
- name: Build Wasm Library
working-directory: lib/binding_web
run: npm ci && npm run build:debug
- name: Check Wasm exports
run: cargo xtask check-wasm-exports

View file

@ -1,19 +0,0 @@
name: Check Wasm Stdlib build
on:
workflow_call:
jobs:
check:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Check directory changes
uses: actions/github-script@v8
with:
script: |
const scriptPath = `${process.env.GITHUB_WORKSPACE}/.github/scripts/wasm_stdlib.js`;
const script = require(scriptPath);
return script({ github, context, core });

16
.gitignore vendored
View file

@ -1,40 +1,30 @@
log*.html log*.html
.direnv
.idea .idea
*.xcodeproj *.xcodeproj
.vscode .vscode
.cache .cache
.zig-cache
.direnv
profile*
fuzz-results fuzz-results
test/fuzz/out
/tree-sitter.pc
test/fixtures/grammars/* test/fixtures/grammars/*
!test/fixtures/grammars/.gitkeep !test/fixtures/grammars/.gitkeep
package-lock.json
node_modules node_modules
docs/assets/js/tree-sitter.js docs/assets/js/tree-sitter.js
/tree-sitter.pc
/target /target
*.rs.bk *.rs.bk
*.a *.a
*.dylib *.dylib
*.so *.so
*.so.[0-9]* *.so.[0-9]*
*.dll
*.o *.o
*.obj *.obj
*.exp *.exp
*.lib *.lib
*.wasm *.wasm
.swiftpm .swiftpm
.build
build
zig-* zig-*
/result

View file

@ -1,11 +0,0 @@
{
"lsp": {
"rust-analyzer": {
"initialization_options": {
"cargo": {
"features": "all"
}
}
}
}
}

View file

@ -1,95 +0,0 @@
cmake_minimum_required(VERSION 3.13)
project(tree-sitter
VERSION "0.27.0"
DESCRIPTION "An incremental parsing system for programming tools"
HOMEPAGE_URL "https://tree-sitter.github.io/tree-sitter/"
LANGUAGES C)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
option(TREE_SITTER_FEATURE_WASM "Enable the Wasm feature" OFF)
option(AMALGAMATED "Build using an amalgamated source" OFF)
if(AMALGAMATED)
set(TS_SOURCE_FILES "${PROJECT_SOURCE_DIR}/lib/src/lib.c")
else()
file(GLOB TS_SOURCE_FILES lib/src/*.c)
list(REMOVE_ITEM TS_SOURCE_FILES "${PROJECT_SOURCE_DIR}/lib/src/lib.c")
endif()
add_library(tree-sitter ${TS_SOURCE_FILES})
target_include_directories(tree-sitter PRIVATE lib/src lib/src/wasm PUBLIC lib/include)
if(MSVC)
target_compile_options(tree-sitter PRIVATE
/wd4018 # disable 'signed/unsigned mismatch'
/wd4232 # disable 'nonstandard extension used'
/wd4244 # disable 'possible loss of data'
/wd4267 # disable 'possible loss of data (size_t)'
/wd4701 # disable 'potentially uninitialized local variable'
/we4022 # treat 'incompatible types' as an error
/W4)
else()
target_compile_options(tree-sitter PRIVATE
-Wall -Wextra -Wshadow -Wpedantic
-Werror=incompatible-pointer-types)
endif()
if(TREE_SITTER_FEATURE_WASM)
if(NOT DEFINED CACHE{WASMTIME_INCLUDE_DIR})
message(CHECK_START "Looking for wasmtime headers")
find_path(WASMTIME_INCLUDE_DIR wasmtime.h
PATHS ENV DEP_WASMTIME_C_API_INCLUDE)
if(NOT WASMTIME_INCLUDE_DIR)
unset(WASMTIME_INCLUDE_DIR CACHE)
message(FATAL_ERROR "Could not find wasmtime headers.\nDid you forget to set CMAKE_INCLUDE_PATH?")
endif()
message(CHECK_PASS "found")
endif()
if(NOT DEFINED CACHE{WASMTIME_LIBRARY})
message(CHECK_START "Looking for wasmtime library")
find_library(WASMTIME_LIBRARY wasmtime)
if(NOT WASMTIME_LIBRARY)
unset(WASMTIME_LIBRARY CACHE)
message(FATAL_ERROR "Could not find wasmtime library.\nDid you forget to set CMAKE_LIBRARY_PATH?")
endif()
message(CHECK_PASS "found")
endif()
target_compile_definitions(tree-sitter PUBLIC TREE_SITTER_FEATURE_WASM)
target_include_directories(tree-sitter SYSTEM PRIVATE "${WASMTIME_INCLUDE_DIR}")
target_link_libraries(tree-sitter PUBLIC "${WASMTIME_LIBRARY}")
set_property(TARGET tree-sitter PROPERTY C_STANDARD_REQUIRED ON)
if(NOT BUILD_SHARED_LIBS)
if(WIN32)
target_compile_definitions(tree-sitter PRIVATE WASM_API_EXTERN= WASI_API_EXTERN=)
target_link_libraries(tree-sitter INTERFACE ws2_32 advapi32 userenv ntdll shell32 ole32 bcrypt)
elseif(NOT APPLE)
target_link_libraries(tree-sitter INTERFACE pthread dl m)
endif()
endif()
endif()
set_target_properties(tree-sitter
PROPERTIES
C_STANDARD 11
C_VISIBILITY_PRESET hidden
POSITION_INDEPENDENT_CODE ON
SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
DEFINE_SYMBOL "")
target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE _BSD_SOURCE _DARWIN_C_SOURCE)
include(GNUInstallDirs)
configure_file(lib/tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY)
install(FILES lib/include/tree_sitter/api.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/tree_sitter")
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
install(TARGETS tree-sitter
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")

View file

@ -1 +0,0 @@
See [docs/src/6-contributing.md](./docs/src/6-contributing.md)

1
CONTRIBUTING.md Symbolic link
View file

@ -0,0 +1 @@
docs/section-6-contributing.md

2376
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,85 +1,10 @@
[workspace] [workspace]
default-members = ["crates/cli"] default-members = ["cli"]
members = [ members = ["cli", "cli/config", "cli/loader", "lib", "tags", "highlight"]
"crates/cli",
"crates/config",
"crates/generate",
"crates/highlight",
"crates/loader",
"crates/tags",
"crates/xtask",
"crates/language",
"lib",
]
resolver = "2" resolver = "2"
[workspace.package] [workspace.package]
version = "0.27.0" rust-version = "1.70"
authors = [
"Max Brunsfeld <maxbrunsfeld@gmail.com>",
"Amaan Qureshi <amaanq12@gmail.com>",
]
edition = "2021"
rust-version = "1.85"
homepage = "https://tree-sitter.github.io/tree-sitter"
repository = "https://github.com/tree-sitter/tree-sitter"
license = "MIT"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
[workspace.lints.clippy]
dbg_macro = "deny"
todo = "deny"
pedantic = { level = "warn", priority = -1 }
nursery = { level = "warn", priority = -1 }
cargo = { level = "warn", priority = -1 }
# The lints below are a specific subset of the pedantic+nursery lints
# that we explicitly allow in the tree-sitter codebase because they either:
#
# 1. Contain false positives,
# 2. Are unnecessary, or
# 3. Worsen the code
branches_sharing_code = "allow"
cast_lossless = "allow"
cast_possible_truncation = "allow"
cast_possible_wrap = "allow"
cast_precision_loss = "allow"
cast_sign_loss = "allow"
checked_conversions = "allow"
cognitive_complexity = "allow"
collection_is_never_read = "allow"
fallible_impl_from = "allow"
fn_params_excessive_bools = "allow"
inline_always = "allow"
if_not_else = "allow"
items_after_statements = "allow"
match_wildcard_for_single_variants = "allow"
missing_errors_doc = "allow"
missing_panics_doc = "allow"
module_name_repetitions = "allow"
multiple_crate_versions = "allow"
needless_for_each = "allow"
obfuscated_if_else = "allow"
option_if_let_else = "allow"
or_fun_call = "allow"
range_plus_one = "allow"
redundant_clone = "allow"
redundant_closure_for_method_calls = "allow"
ref_option = "allow"
similar_names = "allow"
string_lit_as_bytes = "allow"
struct_excessive_bools = "allow"
struct_field_names = "allow"
transmute_undefined_repr = "allow"
too_many_lines = "allow"
unnecessary_wraps = "allow"
unused_self = "allow"
used_underscore_items = "allow"
[workspace.lints.rust]
mismatched_lifetime_syntaxes = "allow"
[profile.optimize] [profile.optimize]
inherits = "release" inherits = "release"
@ -92,72 +17,52 @@ codegen-units = 1 # Maximum size reduction optimizations.
inherits = "optimize" inherits = "optimize"
opt-level = "s" # Optimize for size. opt-level = "s" # Optimize for size.
[profile.release-dev] [profile.profile]
inherits = "release" inherits = "optimize"
lto = false strip = false
debug = true
debug-assertions = true
overflow-checks = true
incremental = true
codegen-units = 256
[workspace.dependencies] [workspace.dependencies]
ansi_colours = "1.2.3" ansi_term = "0.12.1"
anstyle = "1.0.13" anstyle = "1.0.6"
anyhow = "1.0.100" anyhow = "1.0.79"
bstr = "1.12.0" cc = "1.0.83"
cc = "1.2.53" clap = { version = "4.4.18", features = [
clap = { version = "4.5.54", features = [ "cargo",
"cargo", "derive",
"derive", "env",
"env", "help",
"help", "unstable-styles",
"string",
"unstable-styles",
] } ] }
clap_complete = "4.5.65" ctor = "0.2.6"
clap_complete_nushell = "4.5.10" ctrlc = { version = "3.4.2", features = ["termination"] }
crc32fast = "1.5.0" difference = "2.0.0"
ctor = "0.2.9" dirs = "5.0.1"
ctrlc = { version = "3.5.0", features = ["termination"] } glob = "0.3.1"
dialoguer = { version = "0.11.0", features = ["fuzzy-select"] }
etcetera = "0.11.0"
fs4 = "0.12.0"
glob = "0.3.3"
heck = "0.5.0"
html-escape = "0.2.13" html-escape = "0.2.13"
indexmap = "2.12.1" indexmap = "2.2.2"
indoc = "2.0.6" indoc = "2.0.4"
libloading = "0.9.0" lazy_static = "1.4.0"
log = { version = "0.4.28", features = ["std"] } libloading = "0.8.1"
memchr = "2.7.6" log = { version = "0.4.20", features = ["std"] }
once_cell = "1.21.3" memchr = "2.7.1"
pretty_assertions = "1.4.1" once_cell = "1.19.0"
path-slash = "0.2.1"
pretty_assertions = "1.4.0"
rand = "0.8.5" rand = "0.8.5"
regex = "1.11.3" regex = "1.10.3"
regex-syntax = "0.8.6" regex-syntax = "0.8.2"
rustc-hash = "2.1.1" rustc-hash = "1.1.0"
schemars = "1.0.5" semver = "1.0.21"
semver = { version = "1.0.27", features = ["serde"] } serde = { version = "1.0.196", features = ["derive"] }
serde = { version = "1.0.219", features = ["derive"] } serde_derive = "1.0.196"
serde_json = { version = "1.0.149", features = ["preserve_order"] } serde_json = { version = "1.0.113", features = ["preserve_order"] }
similar = "2.7.0" smallbitvec = "2.5.1"
smallbitvec = "2.6.0" tempfile = "3.10.0"
streaming-iterator = "0.1.9" thiserror = "1.0.56"
tempfile = "3.23.0"
thiserror = "2.0.17"
tiny_http = "0.12.0" tiny_http = "0.12.0"
topological-sort = "0.2.2" toml = "0.8.10"
unindent = "0.2.4" unindent = "0.2.3"
walkdir = "2.5.0" walkdir = "2.4.0"
wasmparser = "0.243.0" wasmparser = "0.121.0"
webbrowser = "1.0.5" webbrowser = "0.8.12"
which = "6.0.0"
tree-sitter = { version = "0.27.0", path = "./lib" }
tree-sitter-generate = { version = "0.27.0", path = "./crates/generate" }
tree-sitter-loader = { version = "0.27.0", path = "./crates/loader" }
tree-sitter-config = { version = "0.27.0", path = "./crates/config" }
tree-sitter-highlight = { version = "0.27.0", path = "./crates/highlight" }
tree-sitter-tags = { version = "0.27.0", path = "./crates/tags" }
tree-sitter-language = { version = "0.1", path = "./crates/language" }

View file

@ -1,10 +0,0 @@
FROM rust:1.76-buster
WORKDIR /app
RUN apt-get update
RUN apt-get install -y nodejs
COPY . .
CMD cargo test --all-features

View file

@ -1,6 +1,6 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2018 Max Brunsfeld Copyright (c) 2018-2023 Max Brunsfeld
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

121
Makefile
View file

@ -1,12 +1,9 @@
VERSION := 0.27.0 VERSION := 0.20.10
DESCRIPTION := An incremental parsing system for programming tools
HOMEPAGE_URL := https://tree-sitter.github.io/tree-sitter/
# install directory layout # install directory layout
PREFIX ?= /usr/local PREFIX ?= /usr/local
INCLUDEDIR ?= $(PREFIX)/include INCLUDEDIR ?= $(PREFIX)/include
LIBDIR ?= $(PREFIX)/lib LIBDIR ?= $(PREFIX)/lib
BINDIR ?= $(PREFIX)/bin
PCLIBDIR ?= $(LIBDIR)/pkgconfig PCLIBDIR ?= $(LIBDIR)/pkgconfig
# collect sources # collect sources
@ -21,119 +18,83 @@ endif
OBJ := $(SRC:.c=.o) OBJ := $(SRC:.c=.o)
# define default flags, and override to append mandatory flags # define default flags, and override to append mandatory flags
ARFLAGS := rcs override CFLAGS := -O3 -std=gnu11 -fPIC -fvisibility=hidden -Wall -Wextra -Wshadow -pedantic $(CFLAGS)
CFLAGS ?= -O3 -Wall -Wextra -Wshadow -Wpedantic -Werror=incompatible-pointer-types
override CFLAGS += -std=c11 -fPIC -fvisibility=hidden
override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_DARWIN_C_SOURCE
override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include
# ABI versioning # ABI versioning
SONAME_MAJOR := $(word 1,$(subst ., ,$(VERSION))) SONAME_MAJOR := 0
SONAME_MINOR := $(word 2,$(subst ., ,$(VERSION))) SONAME_MINOR := 0
# OS-specific bits # OS-specific bits
MACHINE := $(shell $(CC) -dumpmachine) ifeq ($(shell uname),Darwin)
ifneq ($(findstring darwin,$(MACHINE)),)
SOEXT = dylib SOEXT = dylib
SOEXTVER_MAJOR = $(SONAME_MAJOR).$(SOEXT) SOEXTVER_MAJOR = $(SONAME_MAJOR).dylib
SOEXTVER = $(SONAME_MAJOR).$(SONAME_MINOR).$(SOEXT) SOEXTVER = $(SONAME_MAJOR).$(SONAME_MINOR).dylib
LINKSHARED += -dynamiclib -Wl,-install_name,$(LIBDIR)/libtree-sitter.$(SOEXTVER) LINKSHARED += -dynamiclib -Wl,-install_name,$(LIBDIR)/libtree-sitter.$(SONAME_MAJOR).dylib
else ifneq ($(findstring mingw32,$(MACHINE)),)
SOEXT = dll
LINKSHARED += -s -shared -Wl,--out-implib,libtree-sitter.dll.a
else else
SOEXT = so SOEXT = so
SOEXTVER_MAJOR = $(SOEXT).$(SONAME_MAJOR) SOEXTVER_MAJOR = so.$(SONAME_MAJOR)
SOEXTVER = $(SOEXT).$(SONAME_MAJOR).$(SONAME_MINOR) SOEXTVER = so.$(SONAME_MAJOR).$(SONAME_MINOR)
LINKSHARED += -shared -Wl,-soname,libtree-sitter.$(SOEXTVER) LINKSHARED += -shared -Wl,-soname,libtree-sitter.so.$(SONAME_MAJOR)
ifneq ($(filter $(shell uname),FreeBSD NetBSD DragonFly),) endif
ifneq (,$(filter $(shell uname),FreeBSD NetBSD DragonFly))
PCLIBDIR := $(PREFIX)/libdata/pkgconfig PCLIBDIR := $(PREFIX)/libdata/pkgconfig
endif endif
endif
all: libtree-sitter.a libtree-sitter.$(SOEXT) tree-sitter.pc all: libtree-sitter.a libtree-sitter.$(SOEXTVER)
libtree-sitter.a: $(OBJ) libtree-sitter.a: $(OBJ)
$(AR) $(ARFLAGS) $@ $^ $(AR) rcs $@ $^
libtree-sitter.$(SOEXT): $(OBJ) libtree-sitter.$(SOEXTVER): $(OBJ)
$(CC) $(LDFLAGS) $(LINKSHARED) $^ $(LDLIBS) -o $@ $(CC) $(LDFLAGS) $(LINKSHARED) $^ $(LDLIBS) -o $@
ln -sf $@ libtree-sitter.$(SOEXT)
ln -sf $@ libtree-sitter.$(SOEXTVER_MAJOR)
ifneq ($(STRIP),) ifneq ($(STRIP),)
$(STRIP) $@ $(STRIP) $@
endif endif
ifneq ($(findstring mingw32,$(MACHINE)),) install: all
libtree-sitter.dll.a: libtree-sitter.$(SOEXT) sed -e 's|@LIBDIR@|$(LIBDIR)|;s|@INCLUDEDIR@|$(INCLUDEDIR)|;s|@VERSION@|$(VERSION)|' \
endif -e 's|=$(PREFIX)|=$${prefix}|' \
-e 's|@PREFIX@|$(PREFIX)|' \
tree-sitter.pc.in > tree-sitter.pc
tree-sitter.pc: lib/tree-sitter.pc.in install -d '$(DESTDIR)$(LIBDIR)'
sed -e 's|@PROJECT_VERSION@|$(VERSION)|' \ install -m644 libtree-sitter.a '$(DESTDIR)$(LIBDIR)'/
-e 's|@CMAKE_INSTALL_LIBDIR@|$(LIBDIR:$(PREFIX)/%=%)|' \ install -m755 libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/
-e 's|@CMAKE_INSTALL_INCLUDEDIR@|$(INCLUDEDIR:$(PREFIX)/%=%)|' \ ln -sf libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER_MAJOR)
-e 's|@PROJECT_DESCRIPTION@|$(DESCRIPTION)|' \ ln -sf libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT)
-e 's|@PROJECT_HOMEPAGE_URL@|$(HOMEPAGE_URL)|' \
-e 's|@CMAKE_INSTALL_PREFIX@|$(PREFIX)|' $< > $@
shared: libtree-sitter.$(SOEXT) install -d '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter
install -m644 lib/include/tree_sitter/api.h '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/
static: libtree-sitter.a install -d '$(DESTDIR)$(PCLIBDIR)'
install -m644 tree-sitter.pc '$(DESTDIR)$(PCLIBDIR)'/
clean: clean:
$(RM) $(OBJ) tree-sitter.pc libtree-sitter.a libtree-sitter.$(SOEXT) libtree-stitter.dll.a rm -f lib/src/*.o libtree-sitter.a libtree-sitter.$(SOEXT) libtree-sitter.$(SOEXTVER_MAJOR) libtree-sitter.$(SOEXTVER)
install: all .PHONY: all install clean
install -d '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter '$(DESTDIR)$(PCLIBDIR)' '$(DESTDIR)$(LIBDIR)'
install -m644 lib/include/tree_sitter/api.h '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h
install -m644 tree-sitter.pc '$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc
install -m644 libtree-sitter.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a
ifneq ($(findstring mingw32,$(MACHINE)),)
install -d '$(DESTDIR)$(BINDIR)'
install -m755 libtree-sitter.dll '$(DESTDIR)$(BINDIR)'/libtree-sitter.dll
install -m755 libtree-sitter.dll.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.dll.a
else
install -m755 libtree-sitter.$(SOEXT) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER)
cd '$(DESTDIR)$(LIBDIR)' && ln -sf libtree-sitter.$(SOEXTVER) libtree-sitter.$(SOEXTVER_MAJOR)
cd '$(DESTDIR)$(LIBDIR)' && ln -sf libtree-sitter.$(SOEXTVER_MAJOR) libtree-sitter.$(SOEXT)
endif
uninstall:
$(RM) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a \
'$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER) \
'$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER_MAJOR) \
'$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT) \
'$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/api.h \
'$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc
rmdir '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter
.PHONY: all shared static install uninstall clean
##### Dev targets ##### ##### Dev targets #####
test: test:
cargo xtask fetch-fixtures script/fetch-fixtures
cargo xtask generate-fixtures script/generate-fixtures
cargo xtask test script/test
test-wasm: test_wasm:
cargo xtask generate-fixtures --wasm script/generate-fixtures-wasm
cargo xtask test-wasm script/test-wasm
lint: lint:
cargo update --workspace --locked --quiet
cargo check --workspace --all-targets cargo check --workspace --all-targets
cargo fmt --all --check cargo fmt --all --check
cargo clippy --workspace --all-targets -- -D warnings cargo clippy --workspace --all-targets -- -D warnings
lint-web:
npm --prefix lib/binding_web ci
npm --prefix lib/binding_web run lint
format: format:
cargo fmt --all cargo fmt --all
changelog: .PHONY: test test_wasm lint format
@git-cliff --config .github/cliff.toml --prepend CHANGELOG.md --latest --github-token $(shell gh auth token)
.PHONY: test test-wasm lint format changelog

View file

@ -15,21 +15,25 @@ let package = Package(
.target(name: "TreeSitter", .target(name: "TreeSitter",
path: "lib", path: "lib",
exclude: [ exclude: [
"src/unicode/ICU_SHA", "binding_rust",
"src/unicode/README.md", "binding_web",
"src/unicode/LICENSE", "Cargo.toml",
"src/wasm/stdlib-symbols.txt", "README.md",
"src/lib.c", "src/unicode/README.md",
"src/unicode/LICENSE",
"src/unicode/ICU_SHA",
"src/get_changed_ranges.c",
"src/tree_cursor.c",
"src/stack.c",
"src/node.c",
"src/lexer.c",
"src/parser.c",
"src/language.c",
"src/alloc.c",
"src/subtree.c",
"src/tree.c",
"src/query.c"
], ],
sources: ["src"], sources: ["src/lib.c"]),
publicHeadersPath: "include", ]
cSettings: [
.headerSearchPath("src"),
.define("_POSIX_C_SOURCE", to: "200112L"),
.define("_DEFAULT_SOURCE"),
.define("_BSD_SOURCE"),
.define("_DARWIN_C_SOURCE"),
]),
],
cLanguageStandard: .c11
) )

View file

@ -1,8 +1,6 @@
# tree-sitter # tree-sitter
[![DOI](https://zenodo.org/badge/14164618.svg)](https://zenodo.org/badge/latestdoi/14164618) [![DOI](https://zenodo.org/badge/14164618.svg)](https://zenodo.org/badge/latestdoi/14164618)
[![discord][discord]](https://discord.gg/w7nTvsVJhm)
[![matrix][matrix]](https://matrix.to/#/#tree-sitter-chat:matrix.org)
Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be: Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be:
@ -12,10 +10,8 @@ Tree-sitter is a parser generator tool and an incremental parsing library. It ca
- **Dependency-free** so that the runtime library (which is written in pure C) can be embedded in any application - **Dependency-free** so that the runtime library (which is written in pure C) can be embedded in any application
## Links ## Links
- [Documentation](https://tree-sitter.github.io) - [Documentation](https://tree-sitter.github.io)
- [Rust binding](lib/binding_rust/README.md) - [Rust binding](lib/binding_rust/README.md)
- [Wasm binding](lib/binding_web/README.md) - [WASM binding](lib/binding_web/README.md)
- [Command-line interface](crates/cli/README.md) - [Command-line interface](cli/README.md)
[discord]: https://img.shields.io/discord/1063097320771698699?logo=discord&label=discord
[matrix]: https://img.shields.io/matrix/tree-sitter-chat%3Amatrix.org?logo=matrix&label=matrix

142
build.zig
View file

@ -1,142 +1,16 @@
const std = @import("std"); const std = @import("std");
pub fn build(b: *std.Build) !void { pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{}); var lib = b.addStaticLibrary(.{
const optimize = b.standardOptimizeOption(.{});
const wasm = b.option(bool, "enable-wasm", "Enable Wasm support") orelse false;
const shared = b.option(bool, "build-shared", "Build a shared library") orelse false;
const amalgamated = b.option(bool, "amalgamated", "Build using an amalgamated source") orelse false;
const lib: *std.Build.Step.Compile = b.addLibrary(.{
.name = "tree-sitter", .name = "tree-sitter",
.linkage = if (shared) .dynamic else .static, .target = b.standardTargetOptions(.{}),
.root_module = b.createModule(.{ .optimize = b.standardOptimizeOption(.{}),
.target = target,
.optimize = optimize,
.link_libc = true,
.pic = if (shared) true else null,
}),
}); });
if (amalgamated) { lib.linkLibC();
lib.addCSourceFile(.{ lib.addCSourceFile(.{ .file = .{ .path = "lib/src/lib.c" }, .flags = &.{} });
.file = b.path("lib/src/lib.c"), lib.addIncludePath(.{ .path = "lib/include" });
.flags = &.{"-std=c11"}, lib.addIncludePath(.{ .path = "lib/src" });
});
} else {
const files = try findSourceFiles(b);
defer b.allocator.free(files);
lib.addCSourceFiles(.{
.root = b.path("lib/src"),
.files = files,
.flags = &.{"-std=c11"},
});
}
lib.addIncludePath(b.path("lib/include"));
lib.addIncludePath(b.path("lib/src"));
lib.addIncludePath(b.path("lib/src/wasm"));
lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L");
lib.root_module.addCMacro("_DEFAULT_SOURCE", "");
lib.root_module.addCMacro("_BSD_SOURCE", "");
lib.root_module.addCMacro("_DARWIN_C_SOURCE", "");
if (wasm) {
if (b.lazyDependency(wasmtimeDep(target.result), .{})) |wasmtime| {
lib.root_module.addCMacro("TREE_SITTER_FEATURE_WASM", "");
lib.addSystemIncludePath(wasmtime.path("include"));
lib.addLibraryPath(wasmtime.path("lib"));
if (shared) lib.linkSystemLibrary("wasmtime");
}
}
lib.installHeadersDirectory(b.path("lib/include"), ".", .{});
b.installArtifact(lib); b.installArtifact(lib);
} }
/// Get the name of the wasmtime dependency for this target.
///
/// Maps the target's (os, arch, abi) triple to the name of the matching
/// prebuilt wasmtime C API package declared in `build.zig.zon`. Panics with
/// a readable triple in the message when no prebuilt archive exists for the
/// requested target.
pub fn wasmtimeDep(target: std.Target) []const u8 {
    const arch = target.cpu.arch;
    const os = target.os.tag;
    const abi = target.abi;
    // Coerced to an optional so that every unsupported combination can fall
    // through `null` to the single `orelse` panic below.
    return @as(?[]const u8, switch (os) {
        .linux => switch (arch) {
            .x86_64 => switch (abi) {
                .gnu => "wasmtime_c_api_x86_64_linux",
                .musl => "wasmtime_c_api_x86_64_musl",
                .android => "wasmtime_c_api_x86_64_android",
                else => null,
            },
            .aarch64 => switch (abi) {
                .gnu => "wasmtime_c_api_aarch64_linux",
                .musl => "wasmtime_c_api_aarch64_musl",
                .android => "wasmtime_c_api_aarch64_android",
                else => null,
            },
            .x86 => switch (abi) {
                .gnu => "wasmtime_c_api_i686_linux",
                else => null,
            },
            .arm => switch (abi) {
                .gnueabi => "wasmtime_c_api_armv7_linux",
                else => null,
            },
            .s390x => switch (abi) {
                .gnu => "wasmtime_c_api_s390x_linux",
                else => null,
            },
            .riscv64 => switch (abi) {
                .gnu => "wasmtime_c_api_riscv64gc_linux",
                else => null,
            },
            else => null,
        },
        .windows => switch (arch) {
            .x86_64 => switch (abi) {
                // MinGW and MSVC use different wasmtime archives.
                .gnu => "wasmtime_c_api_x86_64_mingw",
                .msvc => "wasmtime_c_api_x86_64_windows",
                else => null,
            },
            .aarch64 => switch (abi) {
                .msvc => "wasmtime_c_api_aarch64_windows",
                else => null,
            },
            .x86 => switch (abi) {
                .msvc => "wasmtime_c_api_i686_windows",
                else => null,
            },
            else => null,
        },
        .macos => switch (arch) {
            // macOS archives are not ABI-specific.
            .x86_64 => "wasmtime_c_api_x86_64_macos",
            .aarch64 => "wasmtime_c_api_aarch64_macos",
            else => null,
        },
        else => null,
    }) orelse std.debug.panic(
        "Unsupported target for wasmtime: {s}-{s}-{s}",
        .{ @tagName(arch), @tagName(os), @tagName(abi) },
    );
}
/// Collect the file names of all C sources directly inside `lib/src`,
/// excluding `lib.c` (the single-file amalgamation entry point used by the
/// `amalgamated` build option instead of the individual sources).
///
/// Returns a slice owned by the caller, allocated with the build allocator.
fn findSourceFiles(b: *std.Build) ![]const []const u8 {
    var sources: std.ArrayListUnmanaged([]const u8) = .empty;
    var dir = try b.build_root.handle.openDir("lib/src", .{ .iterate = true });
    var iter = dir.iterate();
    defer dir.close();
    while (try iter.next()) |entry| {
        // Skip sub-directories and any non-regular entries.
        if (entry.kind != .file) continue;
        const file = entry.name;
        const ext = std.fs.path.extension(file);
        if (std.mem.eql(u8, ext, ".c") and !std.mem.eql(u8, file, "lib.c")) {
            // Dupe the name: `entry.name` is only valid until the next
            // iteration of the directory iterator.
            try sources.append(b.allocator, b.dupe(file));
        }
    }
    return sources.toOwnedSlice(b.allocator);
}

View file

@ -1,96 +0,0 @@
// Zig package manifest for the tree-sitter runtime library.
.{
    .name = .tree_sitter,
    .fingerprint = 0x841224b447ac0d4f,
    .version = "0.27.0",
    .minimum_zig_version = "0.14.1",
    // Files and directories shipped when this package is fetched.
    .paths = .{
        "build.zig",
        "build.zig.zon",
        "lib/src",
        "lib/include",
        "README.md",
        "LICENSE",
    },
    // Prebuilt wasmtime C API archives, one per supported target triple.
    // Every entry is lazy: it is only fetched when `build.zig` enables Wasm
    // support and selects that triple via `wasmtimeDep`.
    .dependencies = .{
        .wasmtime_c_api_aarch64_android = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-android-c-api.tar.xz",
            .hash = "N-V-__8AAIfPIgdw2YnV3QyiFQ2NHdrxrXzzCdjYJyxJDOta",
            .lazy = true,
        },
        .wasmtime_c_api_aarch64_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-linux-c-api.tar.xz",
            .hash = "N-V-__8AAIt97QZi7Pf7nNJ2mVY6uxA80Klyuvvtop3pLMRK",
            .lazy = true,
        },
        .wasmtime_c_api_aarch64_macos = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-macos-c-api.tar.xz",
            .hash = "N-V-__8AAAO48QQf91w9RmmUDHTja8DrXZA1n6Bmc8waW3qe",
            .lazy = true,
        },
        .wasmtime_c_api_aarch64_musl = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-musl-c-api.tar.xz",
            .hash = "N-V-__8AAI196wa9pwADoA2RbCDp5F7bKQg1iOPq6gIh8-FH",
            .lazy = true,
        },
        .wasmtime_c_api_aarch64_windows = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-aarch64-windows-c-api.zip",
            .hash = "N-V-__8AAC9u4wXfqd1Q6XyQaC8_DbQZClXux60Vu5743N05",
            .lazy = true,
        },
        .wasmtime_c_api_armv7_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-armv7-linux-c-api.tar.xz",
            .hash = "N-V-__8AAHXe8gWs3s83Cc5G6SIq0_jWxj8fGTT5xG4vb6-x",
            .lazy = true,
        },
        .wasmtime_c_api_i686_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-i686-linux-c-api.tar.xz",
            .hash = "N-V-__8AAN2pzgUUfulRCYnipSfis9IIYHoTHVlieLRmKuct",
            .lazy = true,
        },
        .wasmtime_c_api_i686_windows = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-i686-windows-c-api.zip",
            .hash = "N-V-__8AAJu0YAUUTFBLxFIOi-MSQVezA6MMkpoFtuaf2Quf",
            .lazy = true,
        },
        .wasmtime_c_api_riscv64gc_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-riscv64gc-linux-c-api.tar.xz",
            .hash = "N-V-__8AAG8m-gc3E3AIImtTZ3l1c7HC6HUWazQ9OH5KACX4",
            .lazy = true,
        },
        .wasmtime_c_api_s390x_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-s390x-linux-c-api.tar.xz",
            .hash = "N-V-__8AAH314gd-gE4IBp2uvAL3gHeuW1uUZjMiLLeUdXL_",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_android = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-android-c-api.tar.xz",
            .hash = "N-V-__8AAIPNRwfNkznebrcGb0IKUe7f35bkuZEYOjcx6q3f",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_linux = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-linux-c-api.tar.xz",
            .hash = "N-V-__8AAI8EDwcyTtk_Afhk47SEaqfpoRqGkJeZpGs69ChF",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_macos = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-macos-c-api.tar.xz",
            .hash = "N-V-__8AAGtGNgVaOpHSxC22IjrampbRIy6lLwscdcAE8nG1",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_mingw = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-mingw-c-api.zip",
            .hash = "N-V-__8AAPS2PAbVix50L6lnddlgazCPTz3whLUFk1qnRtnZ",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_musl = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-musl-c-api.tar.xz",
            .hash = "N-V-__8AAF-WEQe0nzvi09PgusM5i46FIuCKJmIDWUleWgQ3",
            .lazy = true,
        },
        .wasmtime_c_api_x86_64_windows = .{
            .url = "https://github.com/bytecodealliance/wasmtime/releases/download/v33.0.2/wasmtime-v33.0.2-x86_64-windows-c-api.zip",
            .hash = "N-V-__8AAKGNXwbpJQsn0_6kwSIVDDWifSg8cBzf7T2RzsC9",
            .lazy = true,
        },
    },
}

View file

@ -1,24 +1,15 @@
[package] [package]
name = "tree-sitter-cli" name = "tree-sitter-cli"
version.workspace = true
description = "CLI tool for developing, testing, and using Tree-sitter parsers" description = "CLI tool for developing, testing, and using Tree-sitter parsers"
authors.workspace = true version = "0.20.9"
edition.workspace = true authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
rust-version.workspace = true edition = "2021"
license = "MIT"
readme = "README.md" readme = "README.md"
homepage.workspace = true keywords = ["incremental", "parsing"]
repository.workspace = true categories = ["command-line-utilities", "parsing"]
documentation = "https://docs.rs/tree-sitter-cli" repository = "https://github.com/tree-sitter/tree-sitter"
license.workspace = true rust-version.workspace = true
keywords.workspace = true
categories.workspace = true
include = ["build.rs", "README.md", "LICENSE", "benches/*", "src/**"]
[lints]
workspace = true
[lib]
path = "src/tree_sitter_cli.rs"
[[bin]] [[bin]]
name = "tree-sitter" name = "tree-sitter"
@ -30,54 +21,67 @@ name = "benchmark"
harness = false harness = false
[features] [features]
default = ["qjs-rt"]
wasm = ["tree-sitter/wasm", "tree-sitter-loader/wasm"] wasm = ["tree-sitter/wasm", "tree-sitter-loader/wasm"]
qjs-rt = ["tree-sitter-generate/qjs-rt"]
[dependencies] [dependencies]
ansi_colours.workspace = true ansi_term.workspace = true
anstyle.workspace = true anstyle.workspace = true
anyhow.workspace = true anyhow.workspace = true
bstr.workspace = true
clap.workspace = true clap.workspace = true
clap_complete.workspace = true
clap_complete_nushell.workspace = true
crc32fast.workspace = true
ctor.workspace = true
ctrlc.workspace = true ctrlc.workspace = true
dialoguer.workspace = true difference.workspace = true
dirs.workspace = true
glob.workspace = true glob.workspace = true
heck.workspace = true
html-escape.workspace = true html-escape.workspace = true
indoc.workspace = true indexmap.workspace = true
lazy_static.workspace = true
log.workspace = true log.workspace = true
memchr.workspace = true memchr.workspace = true
rand.workspace = true path-slash.workspace = true
regex.workspace = true regex.workspace = true
schemars.workspace = true regex-syntax.workspace = true
rustc-hash.workspace = true
semver.workspace = true semver.workspace = true
serde.workspace = true serde.workspace = true
serde_derive.workspace = true
serde_json.workspace = true serde_json.workspace = true
similar.workspace = true smallbitvec.workspace = true
streaming-iterator.workspace = true
thiserror.workspace = true
tiny_http.workspace = true tiny_http.workspace = true
toml.workspace = true
walkdir.workspace = true walkdir.workspace = true
wasmparser.workspace = true wasmparser.workspace = true
webbrowser.workspace = true webbrowser.workspace = true
which.workspace = true
tree-sitter.workspace = true [dependencies.tree-sitter]
tree-sitter-generate.workspace = true version = "0.20.10"
tree-sitter-config.workspace = true path = "../lib"
tree-sitter-highlight.workspace = true
tree-sitter-loader.workspace = true [dependencies.tree-sitter-config]
tree-sitter-tags.workspace = true version = "0.19.0"
path = "config"
[dependencies.tree-sitter-highlight]
version = "0.20.2"
path = "../highlight"
[dependencies.tree-sitter-loader]
version = "0.20.0"
path = "loader"
[dependencies.tree-sitter-tags]
version = "0.20.2"
path = "../tags"
[dev-dependencies] [dev-dependencies]
encoding_rs = "0.8.35"
widestring = "1.2.1"
tree_sitter_proc_macro = { path = "src/tests/proc_macro", package = "tree-sitter-tests-proc-macro" } tree_sitter_proc_macro = { path = "src/tests/proc_macro", package = "tree-sitter-tests-proc-macro" }
rand.workspace = true
tempfile.workspace = true tempfile.workspace = true
pretty_assertions.workspace = true pretty_assertions.workspace = true
ctor.workspace = true
unindent.workspace = true unindent.workspace = true
indoc.workspace = true
[build-dependencies]
toml.workspace = true

View file

@ -7,15 +7,14 @@
[npmjs.com]: https://www.npmjs.org/package/tree-sitter-cli [npmjs.com]: https://www.npmjs.org/package/tree-sitter-cli
[npmjs.com badge]: https://img.shields.io/npm/v/tree-sitter-cli.svg?color=%23BF4A4A [npmjs.com badge]: https://img.shields.io/npm/v/tree-sitter-cli.svg?color=%23BF4A4A
The Tree-sitter CLI allows you to develop, test, and use Tree-sitter grammars from the command line. It works on `MacOS`, The Tree-sitter CLI allows you to develop, test, and use Tree-sitter grammars from the command line. It works on MacOS, Linux, and Windows.
`Linux`, and `Windows`.
### Installation ### Installation
You can install the `tree-sitter-cli` with `cargo`: You can install the `tree-sitter-cli` with `cargo`:
```sh ```sh
cargo install --locked tree-sitter-cli cargo install tree-sitter-cli
``` ```
or with `npm`: or with `npm`:
@ -35,11 +34,9 @@ The `tree-sitter` binary itself has no dependencies, but specific commands have
### Commands ### Commands
* `generate` - The `tree-sitter generate` command will generate a Tree-sitter parser based on the grammar in the current * `generate` - The `tree-sitter generate` command will generate a Tree-sitter parser based on the grammar in the current working directory. See [the documentation] for more information.
working directory. See [the documentation] for more information.
* `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. * `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. See [the documentation] for more information.
See [the documentation] for more information.
* `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers. * `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers.

View file

@ -1,79 +1,68 @@
use std::{
collections::BTreeMap,
env, fs,
path::{Path, PathBuf},
str,
sync::LazyLock,
time::Instant,
};
use anyhow::Context; use anyhow::Context;
use log::info; use lazy_static::lazy_static;
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::{env, fs, str, usize};
use tree_sitter::{Language, Parser, Query}; use tree_sitter::{Language, Parser, Query};
use tree_sitter_loader::{CompileConfig, Loader}; use tree_sitter_loader::Loader;
include!("../src/tests/helpers/dirs.rs"); include!("../src/tests/helpers/dirs.rs");
static LANGUAGE_FILTER: LazyLock<Option<String>> = lazy_static! {
LazyLock::new(|| env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok()); static ref LANGUAGE_FILTER: Option<String> =
static EXAMPLE_FILTER: LazyLock<Option<String>> = env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok();
LazyLock::new(|| env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok()); static ref EXAMPLE_FILTER: Option<String> =
static REPETITION_COUNT: LazyLock<usize> = LazyLock::new(|| { env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok();
env::var("TREE_SITTER_BENCHMARK_REPETITION_COUNT") static ref REPETITION_COUNT: usize = env::var("TREE_SITTER_BENCHMARK_REPETITION_COUNT")
.map(|s| s.parse::<usize>().unwrap()) .map(|s| s.parse::<usize>().unwrap())
.unwrap_or(5) .unwrap_or(5);
}); static ref TEST_LOADER: Loader = Loader::with_parser_lib_path(SCRATCH_DIR.clone());
static TEST_LOADER: LazyLock<Loader> = static ref EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR: BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)> = {
LazyLock::new(|| Loader::with_parser_lib_path(SCRATCH_DIR.clone())); fn process_dir(result: &mut BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, dir: &Path) {
if dir.join("grammar.js").exists() {
let relative_path = dir.strip_prefix(GRAMMARS_DIR.as_path()).unwrap();
let (example_paths, query_paths) =
result.entry(relative_path.to_owned()).or_default();
#[allow(clippy::type_complexity)] if let Ok(example_files) = fs::read_dir(dir.join("examples")) {
static EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR: LazyLock< example_paths.extend(example_files.filter_map(|p| {
BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, let p = p.unwrap().path();
> = LazyLock::new(|| { if p.is_file() {
fn process_dir(result: &mut BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, dir: &Path) { Some(p)
if dir.join("grammar.js").exists() { } else {
let relative_path = dir.strip_prefix(GRAMMARS_DIR.as_path()).unwrap(); None
let (example_paths, query_paths) = result.entry(relative_path.to_owned()).or_default(); }
}));
}
if let Ok(example_files) = fs::read_dir(dir.join("examples")) { if let Ok(query_files) = fs::read_dir(dir.join("queries")) {
example_paths.extend(example_files.filter_map(|p| { query_paths.extend(query_files.filter_map(|p| {
let p = p.unwrap().path(); let p = p.unwrap().path();
if p.is_file() { if p.is_file() {
Some(p) Some(p)
} else { } else {
None None
}
}));
}
} else {
for entry in fs::read_dir(dir).unwrap() {
let entry = entry.unwrap().path();
if entry.is_dir() {
process_dir(result, &entry);
} }
}));
}
if let Ok(query_files) = fs::read_dir(dir.join("queries")) {
query_paths.extend(query_files.filter_map(|p| {
let p = p.unwrap().path();
if p.is_file() {
Some(p)
} else {
None
}
}));
}
} else {
for entry in fs::read_dir(dir).unwrap() {
let entry = entry.unwrap().path();
if entry.is_dir() {
process_dir(result, &entry);
} }
} }
} }
}
let mut result = BTreeMap::new(); let mut result = BTreeMap::new();
process_dir(&mut result, &GRAMMARS_DIR); process_dir(&mut result, &GRAMMARS_DIR);
result result
}); };
}
fn main() { fn main() {
tree_sitter_cli::logger::init();
let max_path_length = EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR let max_path_length = EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR
.values() .values()
.flat_map(|(e, q)| { .flat_map(|(e, q)| {
@ -84,7 +73,7 @@ fn main() {
.max() .max()
.unwrap_or(0); .unwrap_or(0);
info!("Benchmarking with {} repetitions", *REPETITION_COUNT); eprintln!("Benchmarking with {} repetitions", *REPETITION_COUNT);
let mut parser = Parser::new(); let mut parser = Parser::new();
let mut all_normal_speeds = Vec::new(); let mut all_normal_speeds = Vec::new();
@ -101,11 +90,11 @@ fn main() {
} }
} }
info!("\nLanguage: {language_name}"); eprintln!("\nLanguage: {language_name}");
let language = get_language(language_path); let language = get_language(language_path);
parser.set_language(&language).unwrap(); parser.set_language(&language).unwrap();
info!(" Constructing Queries"); eprintln!(" Constructing Queries");
for path in query_paths { for path in query_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() { if let Some(filter) = EXAMPLE_FILTER.as_ref() {
if !path.to_str().unwrap().contains(filter.as_str()) { if !path.to_str().unwrap().contains(filter.as_str()) {
@ -115,12 +104,12 @@ fn main() {
parse(path, max_path_length, |source| { parse(path, max_path_length, |source| {
Query::new(&language, str::from_utf8(source).unwrap()) Query::new(&language, str::from_utf8(source).unwrap())
.with_context(|| format!("Query file path: {}", path.display())) .with_context(|| format!("Query file path: {path:?}"))
.expect("Failed to parse query"); .expect("Failed to parse query");
}); });
} }
info!(" Parsing Valid Code:"); eprintln!(" Parsing Valid Code:");
let mut normal_speeds = Vec::new(); let mut normal_speeds = Vec::new();
for example_path in example_paths { for example_path in example_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() { if let Some(filter) = EXAMPLE_FILTER.as_ref() {
@ -134,7 +123,7 @@ fn main() {
})); }));
} }
info!(" Parsing Invalid Code (mismatched languages):"); eprintln!(" Parsing Invalid Code (mismatched languages):");
let mut error_speeds = Vec::new(); let mut error_speeds = Vec::new();
for (other_language_path, (example_paths, _)) in for (other_language_path, (example_paths, _)) in
EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter() EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter()
@ -155,30 +144,30 @@ fn main() {
} }
if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) { if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) {
info!(" Average Speed (normal): {average_normal} bytes/ms"); eprintln!(" Average Speed (normal): {average_normal} bytes/ms");
info!(" Worst Speed (normal): {worst_normal} bytes/ms"); eprintln!(" Worst Speed (normal): {worst_normal} bytes/ms");
} }
if let Some((average_error, worst_error)) = aggregate(&error_speeds) { if let Some((average_error, worst_error)) = aggregate(&error_speeds) {
info!(" Average Speed (errors): {average_error} bytes/ms"); eprintln!(" Average Speed (errors): {average_error} bytes/ms");
info!(" Worst Speed (errors): {worst_error} bytes/ms"); eprintln!(" Worst Speed (errors): {worst_error} bytes/ms");
} }
all_normal_speeds.extend(normal_speeds); all_normal_speeds.extend(normal_speeds);
all_error_speeds.extend(error_speeds); all_error_speeds.extend(error_speeds);
} }
info!("\n Overall"); eprintln!("\n Overall");
if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) { if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) {
info!(" Average Speed (normal): {average_normal} bytes/ms"); eprintln!(" Average Speed (normal): {average_normal} bytes/ms");
info!(" Worst Speed (normal): {worst_normal} bytes/ms"); eprintln!(" Worst Speed (normal): {worst_normal} bytes/ms");
} }
if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) { if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) {
info!(" Average Speed (errors): {average_error} bytes/ms"); eprintln!(" Average Speed (errors): {average_error} bytes/ms");
info!(" Worst Speed (errors): {worst_error} bytes/ms"); eprintln!(" Worst Speed (errors): {worst_error} bytes/ms");
} }
info!(""); eprintln!();
} }
fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> { fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> {
@ -197,8 +186,14 @@ fn aggregate(speeds: &[usize]) -> Option<(usize, usize)> {
} }
fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) -> usize { fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) -> usize {
eprint!(
" {:width$}\t",
path.file_name().unwrap().to_str().unwrap(),
width = max_path_length
);
let source_code = fs::read(path) let source_code = fs::read(path)
.with_context(|| format!("Failed to read {}", path.display())) .with_context(|| format!("Failed to read {path:?}"))
.unwrap(); .unwrap();
let time = Instant::now(); let time = Instant::now();
for _ in 0..*REPETITION_COUNT { for _ in 0..*REPETITION_COUNT {
@ -207,18 +202,17 @@ fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) ->
let duration = time.elapsed() / (*REPETITION_COUNT as u32); let duration = time.elapsed() / (*REPETITION_COUNT as u32);
let duration_ns = duration.as_nanos(); let duration_ns = duration.as_nanos();
let speed = ((source_code.len() as u128) * 1_000_000) / duration_ns; let speed = ((source_code.len() as u128) * 1_000_000) / duration_ns;
info!( eprintln!(
" {:max_path_length$}\ttime {:>7.2} ms\t\tspeed {speed:>6} bytes/ms", "time {:>7.2} ms\t\tspeed {speed:>6} bytes/ms",
path.file_name().unwrap().to_str().unwrap(),
(duration_ns as f64) / 1e6, (duration_ns as f64) / 1e6,
); );
speed as usize speed as usize
} }
fn get_language(path: &Path) -> Language { fn get_language(path: &Path) -> Language {
let src_path = GRAMMARS_DIR.join(path).join("src"); let src_dir = GRAMMARS_DIR.join(path).join("src");
TEST_LOADER TEST_LOADER
.load_language_at_path(CompileConfig::new(&src_path, None, None)) .load_language_at_path(&src_dir, &[&src_dir])
.with_context(|| format!("Failed to load language at path {}", src_path.display())) .with_context(|| format!("Failed to load language at path {src_dir:?}"))
.unwrap() .unwrap()
} }

116
cli/build.rs Normal file
View file

@ -0,0 +1,116 @@
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::{env, fs};
/// Build-script entry point: exports build metadata to the compiler.
///
/// Emits `BUILD_SHA` (when building inside a git checkout), enables the
/// embedded-playground cfg when the web assets are present, and always
/// exports `RUST_BINDING_VERSION` read from `Cargo.toml`.
fn main() {
    // A SHA is only available when building from a git checkout, so it is
    // exported conditionally.
    if let Some(sha) = read_git_sha() {
        println!("cargo:rustc-env=BUILD_SHA={sha}");
    }

    // The playground can only be embedded when every web asset exists.
    if web_playground_files_present() {
        println!("cargo:rustc-cfg=TREE_SITTER_EMBED_WASM_BINDING");
    }

    println!(
        "cargo:rustc-env=RUST_BINDING_VERSION={}",
        read_rust_binding_version(),
    );
}
/// Reports whether every asset needed to embed the web playground exists on
/// disk, relative to the `cli` crate directory.
fn web_playground_files_present() -> bool {
    // All three assets are required; a partial set is treated as absent.
    const ASSETS: [&str; 3] = [
        "../docs/assets/js/playground.js",
        "../lib/binding_web/tree-sitter.js",
        "../lib/binding_web/tree-sitter.wasm",
    ];
    ASSETS.iter().all(|asset| Path::new(asset).exists())
}
/// Attempts to discover the git commit SHA this build is produced from.
///
/// Walks up from `CARGO_MANIFEST_DIR` until a `.git` entry is found, resolves
/// worktree/submodule indirection (`.git` file containing `gitdir: <path>`),
/// then reads the SHA out of `HEAD` — following a branch ref through loose or
/// packed refs, or taking `HEAD`'s own content when detached. Returns `None`
/// whenever any step fails; a missing SHA is not a build error.
fn read_git_sha() -> Option<String> {
    // Find the nearest ancestor directory containing a `.git` entry.
    let mut repo_path = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
    let mut git_path;
    loop {
        git_path = repo_path.join(".git");
        if git_path.exists() {
            break;
        } else if !repo_path.pop() {
            return None;
        }
    }

    // `.git` is a directory in a normal checkout, but in worktrees and
    // submodules it is a file of the form "gitdir: <path>".
    let git_dir_path;
    if git_path.is_dir() {
        git_dir_path = git_path;
    } else if let Ok(git_path_content) = fs::read_to_string(&git_path) {
        // Bail out gracefully on malformed content instead of panicking:
        // a build script should never abort the build over git metadata.
        git_dir_path = match git_path_content.strip_prefix("gitdir: ") {
            Some(rest) => repo_path.join(rest.trim_end()),
            None => return None,
        };
    } else {
        return None;
    }

    let git_head_path = git_dir_path.join("HEAD");
    // Rebuild whenever HEAD changes (branch switch, new commit, ...).
    if let Some(path) = git_head_path.to_str() {
        println!("cargo:rerun-if-changed={path}");
    }

    if let Ok(mut head_content) = fs::read_to_string(&git_head_path) {
        if head_content.ends_with('\n') {
            head_content.pop();
        }

        // If we're on a branch, read the SHA from the ref file.
        if head_content.starts_with("ref: ") {
            head_content.replace_range(0.."ref: ".len(), "");
            let ref_filename = {
                // Go to the real non-worktree gitdir: refs for a worktree
                // live in the parent repository's git dir.
                let git_dir_path = git_dir_path
                    .parent()
                    .and_then(|p| {
                        p.file_name()
                            .map(|n| n == OsStr::new("worktrees"))
                            .and_then(|x| x.then(|| p.parent()))
                    })
                    .flatten()
                    .unwrap_or(&git_dir_path);

                let file = git_dir_path.join(&head_content);
                if file.is_file() {
                    file
                } else {
                    // The ref may have been packed; look it up in `packed-refs`,
                    // whose lines are "<hash> <refname>".
                    let packed_refs = git_dir_path.join("packed-refs");
                    if let Ok(packed_refs_content) = fs::read_to_string(&packed_refs) {
                        for line in packed_refs_content.lines() {
                            if let Some((hash, r#ref)) = line.split_once(' ') {
                                if r#ref == head_content {
                                    if let Some(path) = packed_refs.to_str() {
                                        println!("cargo:rerun-if-changed={path}");
                                    }
                                    return Some(hash.to_string());
                                }
                            }
                        }
                    }
                    return None;
                }
            };
            if let Some(path) = ref_filename.to_str() {
                println!("cargo:rerun-if-changed={path}");
            }
            // Trim the ref file's trailing newline: a stray newline in the
            // value would corrupt the `cargo:rustc-env=BUILD_SHA=` directive.
            return fs::read_to_string(&ref_filename)
                .ok()
                .map(|sha| sha.trim_end().to_string());
        }
        // If we're on a detached commit, then the `HEAD` file itself contains
        // the SHA: 40 hex chars in SHA-1 repos, 64 in SHA-256 repos.
        else if (head_content.len() == 40 || head_content.len() == 64)
            && head_content.bytes().all(|b| b.is_ascii_hexdigit())
        {
            return Some(head_content);
        }
    }
    None
}
/// Reads the version requirement of the `tree-sitter` dependency out of this
/// crate's `Cargo.toml`, so `main` can bake it into the binary as
/// `RUST_BINDING_VERSION`.
///
/// Panics if `Cargo.toml` is missing, unparsable, or lacks the
/// `dependencies.tree-sitter.version` key — all of which indicate a broken
/// checkout, so failing the build is appropriate here.
fn read_rust_binding_version() -> String {
    let path = "Cargo.toml";
    // Other `rerun-if-changed` directives in this script disable cargo's
    // default rerun-on-any-change behavior, so the manifest must be
    // registered explicitly for the embedded version to stay current.
    println!("cargo:rerun-if-changed={path}");
    let text = fs::read_to_string(path).unwrap();
    let cargo_toml = toml::from_str::<toml::Value>(text.as_ref()).unwrap();
    // `as_str` already yields the string content without surrounding quotes,
    // so no further trimming is needed.
    cargo_toml["dependencies"]["tree-sitter"]["version"]
        .as_str()
        .unwrap()
        .to_string()
}

18
cli/config/Cargo.toml Normal file
View file

@ -0,0 +1,18 @@
# Manifest for the `tree-sitter-config` crate: loading and saving of the
# user-level configuration shared by tree-sitter's command-line programs.
[package]
name = "tree-sitter-config"
description = "User configuration of tree-sitter's command line programs"
version = "0.19.0"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2021"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
repository = "https://github.com/tree-sitter/tree-sitter"
# Minimum supported Rust version is inherited from the workspace root.
rust-version.workspace = true

[dependencies]
anyhow.workspace = true
dirs.workspace = true
serde.workspace = true
serde_json.workspace = true
View file

@ -1,54 +1,10 @@
#![cfg_attr(not(any(test, doctest)), doc = include_str!("../README.md"))] #![doc = include_str!("../README.md")]
use std::{ use anyhow::{anyhow, Context, Result};
env, fs,
path::{Path, PathBuf},
};
use etcetera::BaseStrategy as _;
use log::warn;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use thiserror::Error; use std::path::PathBuf;
use std::{env, fs};
pub type ConfigResult<T> = Result<T, ConfigError>;
#[derive(Debug, Error)]
pub enum ConfigError {
#[error("Bad JSON config {0} -- {1}")]
ConfigRead(String, serde_json::Error),
#[error(transparent)]
HomeDir(#[from] etcetera::HomeDirError),
#[error(transparent)]
IO(IoError),
#[error(transparent)]
Serialization(#[from] serde_json::Error),
}
#[derive(Debug, Error)]
pub struct IoError {
pub error: std::io::Error,
pub path: Option<String>,
}
impl IoError {
fn new(error: std::io::Error, path: Option<&Path>) -> Self {
Self {
error,
path: path.map(|p| p.to_string_lossy().to_string()),
}
}
}
impl std::fmt::Display for IoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.error)?;
if let Some(ref path) = self.path {
write!(f, " ({path})")?;
}
Ok(())
}
}
/// Holds the contents of tree-sitter's configuration file. /// Holds the contents of tree-sitter's configuration file.
/// ///
@ -65,7 +21,7 @@ pub struct Config {
} }
impl Config { impl Config {
pub fn find_config_file() -> ConfigResult<Option<PathBuf>> { pub fn find_config_file() -> Result<Option<PathBuf>> {
if let Ok(path) = env::var("TREE_SITTER_DIR") { if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path); let mut path = PathBuf::from(path);
path.push("config.json"); path.push("config.json");
@ -82,28 +38,8 @@ impl Config {
return Ok(Some(xdg_path)); return Ok(Some(xdg_path));
} }
if cfg!(target_os = "macos") { let legacy_path = dirs::home_dir()
let legacy_apple_path = etcetera::base_strategy::Apple::new()? .ok_or_else(|| anyhow!("Cannot determine home directory"))?
.data_dir() // `$HOME/Library/Application Support/`
.join("tree-sitter")
.join("config.json");
if legacy_apple_path.is_file() {
let xdg_dir = xdg_path.parent().unwrap();
fs::create_dir_all(xdg_dir)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(xdg_dir))))?;
fs::rename(&legacy_apple_path, &xdg_path).map_err(|e| {
ConfigError::IO(IoError::new(e, Some(legacy_apple_path.as_path())))
})?;
warn!(
"Your config.json file has been automatically migrated from \"{}\" to \"{}\"",
legacy_apple_path.display(),
xdg_path.display()
);
return Ok(Some(xdg_path));
}
}
let legacy_path = etcetera::home_dir()?
.join(".tree-sitter") .join(".tree-sitter")
.join("config.json"); .join("config.json");
if legacy_path.is_file() { if legacy_path.is_file() {
@ -113,9 +49,9 @@ impl Config {
Ok(None) Ok(None)
} }
fn xdg_config_file() -> ConfigResult<PathBuf> { fn xdg_config_file() -> Result<PathBuf> {
let xdg_path = etcetera::choose_base_strategy()? let xdg_path = dirs::config_dir()
.config_dir() .ok_or_else(|| anyhow!("Cannot determine config directory"))?
.join("tree-sitter") .join("tree-sitter")
.join("config.json"); .join("config.json");
Ok(xdg_path) Ok(xdg_path)
@ -124,25 +60,19 @@ impl Config {
/// Locates and loads in the user's configuration file. We search for the configuration file /// Locates and loads in the user's configuration file. We search for the configuration file
/// in the following locations, in order: /// in the following locations, in order:
/// ///
/// - Location specified by the path parameter if provided
/// - `$TREE_SITTER_DIR/config.json`, if the `TREE_SITTER_DIR` environment variable is set /// - `$TREE_SITTER_DIR/config.json`, if the `TREE_SITTER_DIR` environment variable is set
/// - `tree-sitter/config.json` in your default user configuration directory, as determined by /// - `tree-sitter/config.json` in your default user configuration directory, as determined
/// [`etcetera::choose_base_strategy`](https://docs.rs/etcetera/*/etcetera/#basestrategy) /// by [`dirs::config_dir`](https://docs.rs/dirs/*/dirs/fn.config_dir.html)
/// - `$HOME/.tree-sitter/config.json` as a fallback from where tree-sitter _used_ to store /// - `$HOME/.tree-sitter/config.json` as a fallback from where tree-sitter _used_ to store
/// its configuration /// its configuration
pub fn load(path: Option<PathBuf>) -> ConfigResult<Self> { pub fn load() -> Result<Self> {
let location = if let Some(path) = path { let Some(location) = Self::find_config_file()? else {
path
} else if let Some(path) = Self::find_config_file()? {
path
} else {
return Self::initial(); return Self::initial();
}; };
let content = fs::read_to_string(&location) let content = fs::read_to_string(&location)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(location.as_path()))))?; .with_context(|| format!("Failed to read {}", &location.to_string_lossy()))?;
let config = serde_json::from_str(&content) let config = serde_json::from_str(&content)
.map_err(|e| ConfigError::ConfigRead(location.to_string_lossy().to_string(), e))?; .with_context(|| format!("Bad JSON config {}", &location.to_string_lossy()))?;
Ok(Self { location, config }) Ok(Self { location, config })
} }
@ -152,7 +82,7 @@ impl Config {
/// disk. /// disk.
/// ///
/// (Note that this is typically only done by the `tree-sitter init-config` command.) /// (Note that this is typically only done by the `tree-sitter init-config` command.)
pub fn initial() -> ConfigResult<Self> { pub fn initial() -> Result<Self> {
let location = if let Ok(path) = env::var("TREE_SITTER_DIR") { let location = if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path); let mut path = PathBuf::from(path);
path.push("config.json"); path.push("config.json");
@ -165,20 +95,17 @@ impl Config {
} }
/// Saves this configuration to the file that it was originally loaded from. /// Saves this configuration to the file that it was originally loaded from.
pub fn save(&self) -> ConfigResult<()> { pub fn save(&self) -> Result<()> {
let json = serde_json::to_string_pretty(&self.config)?; let json = serde_json::to_string_pretty(&self.config)?;
let config_dir = self.location.parent().unwrap(); fs::create_dir_all(self.location.parent().unwrap())?;
fs::create_dir_all(config_dir) fs::write(&self.location, json)?;
.map_err(|e| ConfigError::IO(IoError::new(e, Some(config_dir))))?;
fs::write(&self.location, json)
.map_err(|e| ConfigError::IO(IoError::new(e, Some(self.location.as_path()))))?;
Ok(()) Ok(())
} }
/// Parses a component-specific configuration from the configuration file. The type `C` must /// Parses a component-specific configuration from the configuration file. The type `C` must
/// be [deserializable](https://docs.rs/serde/*/serde/trait.Deserialize.html) from a JSON /// be [deserializable](https://docs.rs/serde/*/serde/trait.Deserialize.html) from a JSON
/// object, and must only include the fields relevant to that component. /// object, and must only include the fields relevant to that component.
pub fn get<C>(&self) -> ConfigResult<C> pub fn get<C>(&self) -> Result<C>
where where
C: for<'de> Deserialize<'de>, C: for<'de> Deserialize<'de>,
{ {
@ -189,7 +116,7 @@ impl Config {
/// Adds a component-specific configuration to the configuration file. The type `C` must be /// Adds a component-specific configuration to the configuration file. The type `C` must be
/// [serializable](https://docs.rs/serde/*/serde/trait.Serialize.html) into a JSON object, and /// [serializable](https://docs.rs/serde/*/serde/trait.Serialize.html) into a JSON object, and
/// must only include the fields relevant to that component. /// must only include the fields relevant to that component.
pub fn add<C>(&mut self, config: C) -> ConfigResult<()> pub fn add<C>(&mut self, config: C) -> Result<()>
where where
C: Serialize, C: Serialize,
{ {

38
cli/loader/Cargo.toml Normal file
View file

@ -0,0 +1,38 @@
[package]
name = "tree-sitter-loader"
description = "Locates, builds, and loads tree-sitter grammars at runtime"
version = "0.20.0"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2021"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
repository = "https://github.com/tree-sitter/tree-sitter"
rust-version.workspace = true

[features]
wasm = ["tree-sitter/wasm"]

[dependencies]
anyhow.workspace = true
cc.workspace = true
dirs.workspace = true
libloading.workspace = true
once_cell.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
which.workspace = true

# Sibling workspace crates, pinned by both version and path.
tree-sitter = { version = "0.20.10", path = "../../lib" }
tree-sitter-highlight = { version = "0.20.2", path = "../../highlight" }
tree-sitter-tags = { version = "0.20.2", path = "../../tags" }

9
cli/loader/build.rs Normal file
View file

@ -0,0 +1,9 @@
/// Build script: bakes build-time metadata into the compiled binary as
/// environment variables readable via `env!` at compile time.
fn main() {
    // Expose the target triple Cargo is compiling for.
    let target = std::env::var("TARGET").unwrap();
    println!("cargo:rustc-env=BUILD_TARGET={target}");

    // Record the pinned emscripten version, read verbatim from the
    // sibling `emscripten-version` file.
    let emscripten_version = std::fs::read_to_string("emscripten-version").unwrap();
    println!("cargo:rustc-env=EMSCRIPTEN_VERSION={emscripten_version}");
}

View file

@ -0,0 +1 @@
3.1.37

1190
cli/loader/src/lib.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -10,7 +10,6 @@ type PrecRightRule = { type: 'PREC_RIGHT'; content: Rule; value: number };
type PrecRule = { type: 'PREC'; content: Rule; value: number }; type PrecRule = { type: 'PREC'; content: Rule; value: number };
type Repeat1Rule = { type: 'REPEAT1'; content: Rule }; type Repeat1Rule = { type: 'REPEAT1'; content: Rule };
type RepeatRule = { type: 'REPEAT'; content: Rule }; type RepeatRule = { type: 'REPEAT'; content: Rule };
type ReservedRule = { type: 'RESERVED'; content: Rule; context_name: string };
type SeqRule = { type: 'SEQ'; members: Rule[] }; type SeqRule = { type: 'SEQ'; members: Rule[] };
type StringRule = { type: 'STRING'; value: string }; type StringRule = { type: 'STRING'; value: string };
type SymbolRule<Name extends string> = { type: 'SYMBOL'; name: Name }; type SymbolRule<Name extends string> = { type: 'SYMBOL'; name: Name };
@ -29,19 +28,12 @@ type Rule =
| PrecRule | PrecRule
| Repeat1Rule | Repeat1Rule
| RepeatRule | RepeatRule
| ReservedRule
| SeqRule | SeqRule
| StringRule | StringRule
| SymbolRule<string> | SymbolRule<string>
| TokenRule; | TokenRule;
declare class RustRegex { type RuleOrLiteral = Rule | RegExp | string;
value: string;
constructor(pattern: string);
}
type RuleOrLiteral = Rule | RegExp | RustRegex | string;
type GrammarSymbols<RuleName extends string> = { type GrammarSymbols<RuleName extends string> = {
[name in RuleName]: SymbolRule<name>; [name in RuleName]: SymbolRule<name>;
@ -50,7 +42,7 @@ type GrammarSymbols<RuleName extends string> = {
type RuleBuilder<RuleName extends string> = ( type RuleBuilder<RuleName extends string> = (
$: GrammarSymbols<RuleName>, $: GrammarSymbols<RuleName>,
previous?: Rule, previous: Rule,
) => RuleOrLiteral; ) => RuleOrLiteral;
type RuleBuilders< type RuleBuilders<
@ -113,7 +105,7 @@ interface Grammar<
* @param $ grammar rules * @param $ grammar rules
* @param previous array of externals from the base schema, if any * @param previous array of externals from the base schema, if any
* *
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers/4-external-scanners * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
*/ */
externals?: ( externals?: (
$: Record<string, SymbolRule<string>>, $: Record<string, SymbolRule<string>>,
@ -151,7 +143,7 @@ interface Grammar<
* *
* @param $ grammar rules * @param $ grammar rules
* *
* @see https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types * @see https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
*/ */
supertypes?: ( supertypes?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>, $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
@ -164,20 +156,9 @@ interface Grammar<
* *
* @param $ grammar rules * @param $ grammar rules
* *
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers/3-writing-the-grammar#keyword-extraction * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction
*/ */
word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral; word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral;
/**
* Mapping of names to reserved word sets. The first reserved word set is the
* global word set, meaning it applies to every rule in every parse state.
* The other word sets can be used with the `reserved` function.
*/
reserved?: Record<
string,
($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral[]
>;
} }
type GrammarSchema<RuleName extends string> = { type GrammarSchema<RuleName extends string> = {
@ -262,7 +243,7 @@ declare function optional(rule: RuleOrLiteral): ChoiceRule;
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
declare const prec: { declare const prec: {
(value: string | number, rule: RuleOrLiteral): PrecRule; (value: String | number, rule: RuleOrLiteral): PrecRule;
/** /**
* Marks the given rule as left-associative (and optionally applies a * Marks the given rule as left-associative (and optionally applies a
@ -278,7 +259,7 @@ declare const prec: {
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
left(rule: RuleOrLiteral): PrecLeftRule; left(rule: RuleOrLiteral): PrecLeftRule;
left(value: string | number, rule: RuleOrLiteral): PrecLeftRule; left(value: String | number, rule: RuleOrLiteral): PrecLeftRule;
/** /**
* Marks the given rule as right-associative (and optionally applies a * Marks the given rule as right-associative (and optionally applies a
@ -294,7 +275,7 @@ declare const prec: {
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/ */
right(rule: RuleOrLiteral): PrecRightRule; right(rule: RuleOrLiteral): PrecRightRule;
right(value: string | number, rule: RuleOrLiteral): PrecRightRule; right(value: String | number, rule: RuleOrLiteral): PrecRightRule;
/** /**
* Marks the given rule with a numerical precedence which will be used to * Marks the given rule with a numerical precedence which will be used to
@ -311,7 +292,7 @@ declare const prec: {
* *
* @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html * @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html
*/ */
dynamic(value: string | number, rule: RuleOrLiteral): PrecDynamicRule; dynamic(value: String | number, rule: RuleOrLiteral): PrecDynamicRule;
}; };
/** /**
@ -331,15 +312,6 @@ declare function repeat(rule: RuleOrLiteral): RepeatRule;
*/ */
declare function repeat1(rule: RuleOrLiteral): Repeat1Rule; declare function repeat1(rule: RuleOrLiteral): Repeat1Rule;
/**
* Overrides the global reserved word set for a given rule. The word set name
* should be defined in the `reserved` field in the grammar.
*
* @param wordset name of the reserved word set
* @param rule rule that will use the reserved word set
*/
declare function reserved(wordset: string, rule: RuleOrLiteral): ReservedRule;
/** /**
* Creates a rule that matches any number of other rules, one after another. * Creates a rule that matches any number of other rules, one after another.
* It is analogous to simply writing multiple symbols next to each other * It is analogous to simply writing multiple symbols next to each other
@ -358,7 +330,7 @@ declare function sym<Name extends string>(name: Name): SymbolRule<Name>;
/** /**
* Marks the given rule as producing only a single token. Tree-sitter's * Marks the given rule as producing only a single token. Tree-sitter's
* default is to treat each string or RegExp literal in the grammar as a * default is to treat each String or RegExp literal in the grammar as a
* separate token. Each token is matched separately by the lexer and * separate token. Each token is matched separately by the lexer and
* returned as its own leaf node in the tree. The token function allows * returned as its own leaf node in the tree. The token function allows
* you to express a complex rule using the DSL functions (rather * you to express a complex rule using the DSL functions (rather

22
crates/cli/npm/install.js → cli/npm/install.js Normal file → Executable file
View file

@ -6,8 +6,7 @@ const http = require('http');
const https = require('https'); const https = require('https');
const packageJSON = require('./package.json'); const packageJSON = require('./package.json');
https.globalAgent.keepAlive = false; // Look to a results table in https://github.com/tree-sitter/tree-sitter/issues/2196
const matrix = { const matrix = {
platform: { platform: {
'darwin': { 'darwin': {
@ -41,7 +40,7 @@ const matrix = {
// Determine the URL of the file. // Determine the URL of the file.
const platform = matrix.platform[process.platform]; const platform = matrix.platform[process.platform];
const arch = platform?.arch[process.arch]; const arch = platform && platform.arch[process.arch];
if (!platform || !platform.name || !arch || !arch.name) { if (!platform || !platform.name || !arch || !arch.name) {
console.error( console.error(
@ -92,7 +91,7 @@ function get(url, callback) {
} }
}; };
const proxyEnv = process.env.HTTPS_PROXY || process.env.https_proxy; const proxyEnv = process.env['HTTPS_PROXY'] || process.env['https_proxy'];
if (!proxyEnv) { if (!proxyEnv) {
https.get(url, processResponse); https.get(url, processResponse);
return; return;
@ -102,23 +101,12 @@ function get(url, callback) {
const requestPort = requestUrl.port || (requestUrl.protocol === 'https:' ? 443 : 80); const requestPort = requestUrl.port || (requestUrl.protocol === 'https:' ? 443 : 80);
const proxyUrl = new URL(proxyEnv); const proxyUrl = new URL(proxyEnv);
const request = proxyUrl.protocol === 'https:' ? https : http; const request = proxyUrl.protocol === 'https:' ? https : http;
const requestOption = { request.request({
host: proxyUrl.hostname, host: proxyUrl.hostname,
port: proxyUrl.port || (proxyUrl.protocol === 'https:' ? 443 : 80), port: proxyUrl.port || (proxyUrl.protocol === 'https:' ? 443 : 80),
method: 'CONNECT', method: 'CONNECT',
path: `${requestUrl.hostname}:${requestPort}`, path: `${requestUrl.hostname}:${requestPort}`,
}; }).on('connect', (response, socket, _head) => {
if (proxyUrl.username || proxyUrl.password) {
const auth = `${decodeURIComponent(
proxyUrl.username
)}:${decodeURIComponent(proxyUrl.password)}`;
requestOption.headers = {
'Proxy-Authorization': `Basic ${Buffer.from(
auth
).toString('base64')}`,
}
}
request.request(requestOption).on('connect', (response, socket, _head) => {
if (response.statusCode !== 200) { if (response.statusCode !== 200) {
// let caller handle error // let caller handle error
callback(response); callback(response);

24
cli/npm/package.json Normal file
View file

@ -0,0 +1,24 @@
{
"name": "tree-sitter-cli",
"version": "0.20.9",
"author": "Max Brunsfeld",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/tree-sitter/tree-sitter.git"
},
"description": "CLI for generating fast incremental parsers",
"keywords": [
"parser",
"lexer"
],
"main": "lib/api/index.js",
"scripts": {
"install": "node install.js",
"prepack": "cp ../../LICENSE ../README.md .",
"postpack": "rm LICENSE README.md"
},
"bin": {
"tree-sitter": "cli.js"
}
}

View file

@ -0,0 +1,153 @@
use super::write_file;
use anyhow::{Context, Result};
use std::path::{Path, PathBuf};
use std::{fs, str};
// Binding-file templates, embedded into the binary at compile time.
const BINDING_CC_TEMPLATE: &str = include_str!("./templates/binding.cc");
const BINDING_GYP_TEMPLATE: &str = include_str!("./templates/binding.gyp");
const INDEX_JS_TEMPLATE: &str = include_str!("./templates/index.js");
const LIB_RS_TEMPLATE: &str = include_str!("./templates/lib.rs");
const BUILD_RS_TEMPLATE: &str = include_str!("./templates/build.rs");
const CARGO_TOML_TEMPLATE: &str = include_str!("./templates/cargo.toml");
const PACKAGE_JSON_TEMPLATE: &str = include_str!("./templates/package.json");

// Placeholder tokens that `generate_file` substitutes inside the templates.
const PARSER_NAME_PLACEHOLDER: &str = "PARSER_NAME";
const CLI_VERSION_PLACEHOLDER: &str = "CLI_VERSION";
// Versions baked in at build time: the CLI's own crate version, and the
// Rust-binding crate version supplied by the build script's
// RUST_BINDING_VERSION env var.
const CLI_VERSION: &str = env!("CARGO_PKG_VERSION");
const RUST_BINDING_VERSION: &str = env!("RUST_BINDING_VERSION");
const RUST_BINDING_VERSION_PLACEHOLDER: &str = "RUST_BINDING_VERSION";
/// Creates (or migrates) the Rust and Node binding scaffolding for a grammar
/// repository rooted at `repo_path`.
///
/// Files that already exist are left alone, except for `binding.gyp` and
/// `package.json`, which are rewritten in place when they still point at the
/// old `src/binding.cc` / root `index.js` layout. Old-layout binding files
/// are deleted afterwards (best-effort; deletion errors are ignored).
pub fn generate_binding_files(repo_path: &Path, language_name: &str) -> Result<()> {
    let bindings_dir = repo_path.join("bindings");

    // Crate/package names use dashes where the grammar name uses underscores.
    let dashed_language_name = language_name.replace('_', "-");
    let dashed_language_name = dashed_language_name.as_str();

    // Generate rust bindings if needed.
    let rust_binding_dir = bindings_dir.join("rust");
    create_path(&rust_binding_dir, |path| create_dir(path))?;

    create_path(&rust_binding_dir.join("lib.rs"), |path| {
        generate_file(path, LIB_RS_TEMPLATE, language_name)
    })?;

    create_path(&rust_binding_dir.join("build.rs"), |path| {
        generate_file(path, BUILD_RS_TEMPLATE, language_name)
    })?;

    create_path(&repo_path.join("Cargo.toml"), |path| {
        generate_file(path, CARGO_TOML_TEMPLATE, dashed_language_name)
    })?;

    // Generate node bindings
    let node_binding_dir = bindings_dir.join("node");
    create_path(&node_binding_dir, |path| create_dir(path))?;

    create_path(&node_binding_dir.join("index.js"), |path| {
        generate_file(path, INDEX_JS_TEMPLATE, language_name)
    })?;

    create_path(&node_binding_dir.join("binding.cc"), |path| {
        generate_file(path, BINDING_CC_TEMPLATE, language_name)
    })?;

    // Create binding.gyp, or update it with new binding path.
    let binding_gyp_path = repo_path.join("binding.gyp");
    create_path_else(
        &binding_gyp_path,
        |path| generate_file(path, BINDING_GYP_TEMPLATE, language_name),
        |path| {
            // Existing file: rewrite the old source path if it is present.
            let binding_gyp =
                fs::read_to_string(path).with_context(|| "Failed to read binding.gyp")?;
            let old_path = "\"src/binding.cc\"";
            if binding_gyp.contains(old_path) {
                eprintln!("Updating binding.gyp with new binding path");
                let binding_gyp = binding_gyp.replace(old_path, "\"bindings/node/binding.cc\"");
                write_file(path, binding_gyp)?;
            }
            Ok(())
        },
    )?;

    // Create package.json, or update it with new binding path.
    let package_json_path = repo_path.join("package.json");
    create_path_else(
        &package_json_path,
        |path| generate_file(path, PACKAGE_JSON_TEMPLATE, dashed_language_name),
        |path| {
            let package_json_str =
                fs::read_to_string(path).with_context(|| "Failed to read package.json")?;
            let mut package_json =
                serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
                    &package_json_str,
                )
                .with_context(|| "Failed to parse package.json")?;
            // Update "main" only when it is missing or still points at the
            // old root index.js; a customized "main" is left untouched.
            let package_json_main = package_json.get("main");
            let package_json_needs_update = package_json_main.map_or(true, |v| {
                let main_string = v.as_str();
                main_string == Some("index.js") || main_string == Some("./index.js")
            });
            if package_json_needs_update {
                eprintln!("Updating package.json with new binding path");
                package_json.insert(
                    "main".to_string(),
                    serde_json::Value::String("bindings/node".to_string()),
                );
                let mut package_json_str = serde_json::to_string_pretty(&package_json)?;
                // to_string_pretty emits no trailing newline; add one.
                package_json_str.push('\n');
                write_file(path, package_json_str)?;
            }
            Ok(())
        },
    )?;

    // Remove files from old node binding paths.
    let old_index_js_path = repo_path.join("index.js");
    let old_binding_cc_path = repo_path.join("src").join("binding.cc");
    if old_index_js_path.exists() {
        fs::remove_file(old_index_js_path).ok();
    }
    if old_binding_cc_path.exists() {
        fs::remove_file(old_binding_cc_path).ok();
    }

    Ok(())
}
/// Renders `template` by substituting the known placeholder tokens, then
/// writes the rendered text to `path`.
fn generate_file(path: &Path, template: &str, language_name: &str) -> Result<()> {
    let rendered = template
        .replace(PARSER_NAME_PLACEHOLDER, language_name)
        .replace(CLI_VERSION_PLACEHOLDER, CLI_VERSION)
        .replace(RUST_BINDING_VERSION_PLACEHOLDER, RUST_BINDING_VERSION);
    write_file(path, rendered)
}
/// Creates `path` and any missing parent directories, attaching the path to
/// the error message on failure.
fn create_dir(path: &Path) -> Result<()> {
    // Use `{}` on the lossy string rather than `{:?}`: Debug-formatting the
    // Cow<str> would wrap the path in quotes and escape backslashes, unlike
    // every other error message in this file.
    fs::create_dir_all(path)
        .with_context(|| format!("Failed to create {}", path.to_string_lossy()))
}
/// Runs `action` on `path` only when the path does not already exist.
/// Returns `true` if `action` was run (i.e. the path was newly created).
fn create_path<F>(path: &PathBuf, action: F) -> Result<bool>
where
    F: Fn(&PathBuf) -> Result<()>,
{
    if path.exists() {
        return Ok(false);
    }
    action(path)?;
    Ok(true)
}
/// Runs `action` on `path` when it does not exist, or `else_action` when it
/// does. Returns `true` only when `action` ran (the path was newly created).
fn create_path_else<T, F>(path: &PathBuf, action: T, else_action: F) -> Result<bool>
where
    T: Fn(&PathBuf) -> Result<()>,
    F: Fn(&PathBuf) -> Result<()>,
{
    if path.exists() {
        else_action(path)?;
        Ok(false)
    } else {
        action(path)?;
        Ok(true)
    }
}

View file

@ -1,26 +1,14 @@
use std::{ use super::coincident_tokens::CoincidentTokenIndex;
collections::{hash_map::Entry, HashMap, VecDeque}, use super::token_conflicts::TokenConflictMap;
mem, use crate::generate::dedup::split_state_id_groups;
}; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::NfaCursor;
use log::debug; use crate::generate::rules::{Symbol, TokenSet};
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap}; use log::info;
use crate::{ use std::collections::hash_map::Entry;
dedup::split_state_id_groups, use std::collections::{HashMap, VecDeque};
grammars::{LexicalGrammar, SyntaxGrammar}, use std::mem;
nfa::{CharacterSet, NfaCursor},
rules::{Symbol, TokenSet},
tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
};
pub const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
pub struct LexTables {
pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}
pub fn build_lex_table( pub fn build_lex_table(
parse_table: &mut ParseTable, parse_table: &mut ParseTable,
@ -29,7 +17,7 @@ pub fn build_lex_table(
keywords: &TokenSet, keywords: &TokenSet,
coincident_token_index: &CoincidentTokenIndex, coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap, token_conflict_map: &TokenConflictMap,
) -> LexTables { ) -> (LexTable, LexTable) {
let keyword_lex_table = if syntax_grammar.word_token.is_some() { let keyword_lex_table = if syntax_grammar.word_token.is_some() {
let mut builder = LexTableBuilder::new(lexical_grammar); let mut builder = LexTableBuilder::new(lexical_grammar);
builder.add_state_for_tokens(keywords); builder.add_state_for_tokens(keywords);
@ -38,22 +26,20 @@ pub fn build_lex_table(
LexTable::default() LexTable::default()
}; };
let mut parse_state_ids_by_token_set = Vec::<(TokenSet, Vec<ParseStateId>)>::new(); let mut parse_state_ids_by_token_set: Vec<(TokenSet, Vec<ParseStateId>)> = Vec::new();
for (i, state) in parse_table.states.iter().enumerate() { for (i, state) in parse_table.states.iter().enumerate() {
let tokens = state let tokens = state
.terminal_entries .terminal_entries
.keys() .keys()
.copied()
.chain(state.reserved_words.iter())
.filter_map(|token| { .filter_map(|token| {
if token.is_terminal() { if token.is_terminal() {
if keywords.contains(&token) { if keywords.contains(token) {
syntax_grammar.word_token syntax_grammar.word_token
} else { } else {
Some(token) Some(*token)
} }
} else if token.is_eof() { } else if token.is_eof() {
Some(token) Some(*token)
} else { } else {
None None
} }
@ -88,45 +74,10 @@ pub fn build_lex_table(
} }
} }
let mut main_lex_table = mem::take(&mut builder.table); let mut table = builder.table;
minimize_lex_table(&mut main_lex_table, parse_table); minimize_lex_table(&mut table, parse_table);
sort_states(&mut main_lex_table, parse_table); sort_states(&mut table, parse_table);
(table, keyword_lex_table)
let mut large_character_sets = Vec::new();
for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() {
let symbol = Symbol::terminal(variable_ix);
builder.reset();
builder.add_state_for_tokens(&TokenSet::from_iter([symbol]));
for state in &builder.table.states {
let mut characters = CharacterSet::empty();
for (chars, action) in &state.advance_actions {
if action.in_main_token {
characters = characters.add(chars);
continue;
}
if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT
&& !large_character_sets.iter().any(|(_, set)| set == chars)
{
large_character_sets.push((None, chars.clone()));
}
}
if characters.range_count() > LARGE_CHARACTER_RANGE_COUNT
&& !large_character_sets
.iter()
.any(|(_, set)| *set == characters)
{
large_character_sets.push((Some(symbol), characters));
}
}
}
LexTables {
main_lex_table,
keyword_lex_table,
large_character_sets,
}
} }
struct QueueEntry { struct QueueEntry {
@ -154,12 +105,6 @@ impl<'a> LexTableBuilder<'a> {
} }
} }
fn reset(&mut self) {
self.table = LexTable::default();
self.state_queue.clear();
self.state_ids_by_nfa_state_set.clear();
}
fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize { fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize {
let mut eof_valid = false; let mut eof_valid = false;
let nfa_states = tokens let nfa_states = tokens
@ -176,8 +121,9 @@ impl<'a> LexTableBuilder<'a> {
let (state_id, is_new) = self.add_state(nfa_states, eof_valid); let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
if is_new { if is_new {
debug!( info!(
"entry point state: {state_id}, tokens: {:?}", "entry point state: {}, tokens: {:?}",
state_id,
tokens tokens
.iter() .iter()
.map(|t| &self.lexical_grammar.variables[t.index].name) .map(|t| &self.lexical_grammar.variables[t.index].name)
@ -358,7 +304,9 @@ fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
&mut group_ids_by_state_id, &mut group_ids_by_state_id,
1, 1,
lex_states_differ, lex_states_differ,
) {} ) {
continue;
}
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len()); let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id { for state_ids in &state_ids_by_group_id {

View file

@ -1,28 +1,24 @@
use std::{ use super::item::{ParseItem, ParseItemSet, ParseItemSetCore};
cmp::Ordering, use super::item_set_builder::ParseItemSetBuilder;
collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}, use crate::generate::grammars::PrecedenceEntry;
hash::BuildHasherDefault, use crate::generate::grammars::{
InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType,
}; };
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
ProductionInfo, ProductionInfoId,
};
use anyhow::{anyhow, Result};
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::hash::BuildHasherDefault;
use std::u32;
use indexmap::{map::Entry, IndexMap}; use indexmap::{map::Entry, IndexMap};
use log::warn;
use rustc_hash::FxHasher; use rustc_hash::FxHasher;
use serde::Serialize;
use thiserror::Error;
use super::{
item::{ParseItem, ParseItemSet, ParseItemSetCore, ParseItemSetEntry},
item_set_builder::ParseItemSetBuilder,
};
use crate::{
grammars::{LexicalGrammar, PrecedenceEntry, ReservedWordSetId, SyntaxGrammar, VariableType},
node_types::VariableInfo,
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
tables::{
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable,
ParseTableEntry, ProductionInfo, ProductionInfoId,
},
};
// For conflict reporting, each parse state is associated with an example // For conflict reporting, each parse state is associated with an example
// sequence of symbols that could lead to that parse state. // sequence of symbols that could lead to that parse state.
@ -31,7 +27,7 @@ type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>; type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>); pub type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
#[derive(Clone, PartialEq)] #[derive(Clone)]
struct AuxiliarySymbolInfo { struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol, auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>, parent_symbols: Vec<Symbol>,
@ -65,208 +61,8 @@ struct ParseTableBuilder<'a> {
parse_table: ParseTable, parse_table: ParseTable,
} }
pub type BuildTableResult<T> = Result<T, ParseTableBuilderError>;
#[derive(Debug, Error, Serialize)]
pub enum ParseTableBuilderError {
#[error("Unresolved conflict for symbol sequence:\n\n{0}")]
Conflict(#[from] ConflictError),
#[error("Extra rules must have unambiguous endings. Conflicting rules: {0}")]
AmbiguousExtra(#[from] AmbiguousExtraError),
#[error(
"The non-terminal rule `{0}` is used in a non-terminal `extra` rule, which is not allowed."
)]
ImproperNonTerminalExtra(String),
#[error("State count `{0}` exceeds the max value {max}.", max=u16::MAX)]
StateCount(usize),
}
#[derive(Default, Debug, Serialize, Error)]
pub struct ConflictError {
pub symbol_sequence: Vec<String>,
pub conflicting_lookahead: String,
pub possible_interpretations: Vec<Interpretation>,
pub possible_resolutions: Vec<Resolution>,
}
#[derive(Default, Debug, Serialize, Error)]
pub struct Interpretation {
pub preceding_symbols: Vec<String>,
pub variable_name: String,
pub production_step_symbols: Vec<String>,
pub step_index: u32,
pub done: bool,
pub conflicting_lookahead: String,
pub precedence: Option<String>,
pub associativity: Option<String>,
}
#[derive(Debug, Serialize)]
pub enum Resolution {
Precedence { symbols: Vec<String> },
Associativity { symbols: Vec<String> },
AddConflict { symbols: Vec<String> },
}
#[derive(Debug, Serialize, Error)]
pub struct AmbiguousExtraError {
pub parent_symbols: Vec<String>,
}
impl std::fmt::Display for ConflictError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for symbol in &self.symbol_sequence {
write!(f, " {symbol}")?;
}
writeln!(f, " • {} …\n", self.conflicting_lookahead)?;
writeln!(f, "Possible interpretations:\n")?;
let mut interpretations = self
.possible_interpretations
.iter()
.map(|i| {
let line = i.to_string();
let prec_line = if let (Some(precedence), Some(associativity)) =
(&i.precedence, &i.associativity)
{
Some(format!(
"(precedence: {precedence}, associativity: {associativity})",
))
} else {
i.precedence
.as_ref()
.map(|precedence| format!("(precedence: {precedence})"))
};
(line, prec_line)
})
.collect::<Vec<_>>();
let max_interpretation_length = interpretations
.iter()
.map(|i| i.0.chars().count())
.max()
.unwrap();
interpretations.sort_unstable();
for (i, (line, prec_suffix)) in interpretations.into_iter().enumerate() {
write!(f, " {}:", i + 1).unwrap();
write!(f, "{line}")?;
if let Some(prec_suffix) = prec_suffix {
write!(
f,
"{:1$}",
"",
max_interpretation_length.saturating_sub(line.chars().count()) + 2
)?;
write!(f, "{prec_suffix}")?;
}
writeln!(f)?;
}
writeln!(f, "\nPossible resolutions:\n")?;
for (i, resolution) in self.possible_resolutions.iter().enumerate() {
writeln!(f, " {}: {resolution}", i + 1)?;
}
Ok(())
}
}
impl std::fmt::Display for Interpretation {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for symbol in &self.preceding_symbols {
write!(f, " {symbol}")?;
}
write!(f, " ({}", self.variable_name)?;
for (i, symbol) in self.production_step_symbols.iter().enumerate() {
if i == self.step_index as usize {
write!(f, "")?;
}
write!(f, " {symbol}")?;
}
write!(f, ")")?;
if self.done {
write!(f, " • {} …", self.conflicting_lookahead)?;
}
Ok(())
}
}
impl std::fmt::Display for Resolution {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Precedence { symbols } => {
write!(f, "Specify a higher precedence in ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, " and ")?;
}
write!(f, "`{symbol}`")?;
}
write!(f, " than in the other rules.")?;
}
Self::Associativity { symbols } => {
write!(f, "Specify a left or right associativity in ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "`{symbol}`")?;
}
}
Self::AddConflict { symbols } => {
write!(f, "Add a conflict for these rules: ")?;
for (i, symbol) in symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "`{symbol}`")?;
}
}
}
Ok(())
}
}
impl std::fmt::Display for AmbiguousExtraError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
for (i, symbol) in self.parent_symbols.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{symbol}")?;
}
Ok(())
}
}
impl<'a> ParseTableBuilder<'a> { impl<'a> ParseTableBuilder<'a> {
fn new( fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
item_set_builder: ParseItemSetBuilder<'a>,
variable_info: &'a [VariableInfo],
) -> Self {
Self {
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
actual_conflicts: syntax_grammar.expected_conflicts.iter().cloned().collect(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
}
fn build(mut self) -> BuildTableResult<(ParseTable, Vec<ParseStateInfo<'a>>)> {
// Ensure that the empty alias sequence has index 0. // Ensure that the empty alias sequence has index 0.
self.parse_table self.parse_table
.production_infos .production_infos
@ -279,13 +75,10 @@ impl<'a> ParseTableBuilder<'a> {
self.add_parse_state( self.add_parse_state(
&Vec::new(), &Vec::new(),
&Vec::new(), &Vec::new(),
ParseItemSet { ParseItemSet::with(std::iter::once((
entries: vec![ParseItemSetEntry { ParseItem::start(),
item: ParseItem::start(), std::iter::once(&Symbol::end()).copied().collect(),
lookaheads: std::iter::once(Symbol::end()).collect(), ))),
following_reserved_word_set: ReservedWordSetId::default(),
}],
},
); );
// Compute the possible item sets for non-terminal extras. // Compute the possible item sets for non-terminal extras.
@ -301,35 +94,25 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_extra_item_sets_by_first_terminal non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap()) .entry(production.first_symbol().unwrap())
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
.insert(ParseItem { .insert(
variable_index: extra_non_terminal.index as u32, ParseItem {
production, variable_index: extra_non_terminal.index as u32,
step_index: 1, production,
has_preceding_inherited_fields: false, step_index: 1,
}) has_preceding_inherited_fields: false,
.lookaheads },
.insert(Symbol::end_of_nonterminal_extra()); &std::iter::once(&Symbol::end_of_nonterminal_extra())
.copied()
.collect(),
);
} }
} }
let non_terminal_sets_len = non_terminal_extra_item_sets_by_first_terminal.len();
self.non_terminal_extra_states
.reserve(non_terminal_sets_len);
self.parse_state_info_by_id.reserve(non_terminal_sets_len);
self.parse_table.states.reserve(non_terminal_sets_len);
self.parse_state_queue.reserve(non_terminal_sets_len);
// Add a state for each starting terminal of a non-terminal extra rule. // Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal { for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
if terminal.is_non_terminal() { self.non_terminal_extra_states
Err(ParseTableBuilderError::ImproperNonTerminalExtra( .push((terminal, self.parse_table.states.len()));
self.symbol_name(&terminal), self.add_parse_state(&Vec::new(), &Vec::new(), item_set);
))?;
}
// Add the parse state, and *then* push the terminal and the state id into the
// list of nonterminal extra states
let state_id = self.add_parse_state(&Vec::new(), &Vec::new(), item_set);
self.non_terminal_extra_states.push((terminal, state_id));
} }
while let Some(entry) = self.parse_state_queue.pop_front() { while let Some(entry) = self.parse_state_queue.pop_front() {
@ -346,21 +129,17 @@ impl<'a> ParseTableBuilder<'a> {
} }
if !self.actual_conflicts.is_empty() { if !self.actual_conflicts.is_empty() {
warn!( println!("Warning: unnecessary conflicts");
"unnecessary conflicts:\n {}", for conflict in &self.actual_conflicts {
&self println!(
.actual_conflicts " {}",
.iter() conflict
.map(|conflict| { .iter()
conflict .map(|symbol| format!("`{}`", self.symbol_name(symbol)))
.iter() .collect::<Vec<_>>()
.map(|symbol| format!("`{}`", self.symbol_name(symbol))) .join(", ")
.collect::<Vec<_>>() );
.join(", ") }
})
.collect::<Vec<_>>()
.join("\n ")
);
} }
Ok((self.parse_table, self.parse_state_info_by_id)) Ok((self.parse_table, self.parse_state_info_by_id))
@ -394,7 +173,6 @@ impl<'a> ParseTableBuilder<'a> {
external_lex_state_id: 0, external_lex_state_id: 0,
terminal_entries: IndexMap::default(), terminal_entries: IndexMap::default(),
nonterminal_entries: IndexMap::default(), nonterminal_entries: IndexMap::default(),
reserved_words: TokenSet::default(),
core_id, core_id,
}); });
self.parse_state_queue.push_back(ParseStateQueueEntry { self.parse_state_queue.push_back(ParseStateQueueEntry {
@ -410,10 +188,10 @@ impl<'a> ParseTableBuilder<'a> {
fn add_actions( fn add_actions(
&mut self, &mut self,
mut preceding_symbols: SymbolSequence, mut preceding_symbols: SymbolSequence,
mut preceding_auxiliary_symbols: AuxiliarySymbolSequence, mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
state_id: ParseStateId, state_id: ParseStateId,
item_set: &ParseItemSet<'a>, item_set: &ParseItemSet<'a>,
) -> BuildTableResult<()> { ) -> Result<()> {
let mut terminal_successors = BTreeMap::new(); let mut terminal_successors = BTreeMap::new();
let mut non_terminal_successors = BTreeMap::new(); let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new(); let mut lookaheads_with_conflicts = TokenSet::new();
@ -421,18 +199,13 @@ impl<'a> ParseTableBuilder<'a> {
// Each item in the item set contributes to either or a Shift action or a Reduce // Each item in the item set contributes to either or a Shift action or a Reduce
// action in this state. // action in this state.
for ParseItemSetEntry { for (item, lookaheads) in &item_set.entries {
item,
lookaheads,
following_reserved_word_set: reserved_lookaheads,
} in &item_set.entries
{
// If the item is unfinished, then this state has a transition for the item's // If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting // next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set. // item into the successor item set.
if let Some(next_symbol) = item.symbol() { if let Some(next_symbol) = item.symbol() {
let mut successor = item.successor(); let mut successor = item.successor();
let successor_set = if next_symbol.is_non_terminal() { if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index]; let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are // Keep track of where auxiliary non-terminals (repeat symbols) are
@ -461,16 +234,13 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_successors non_terminal_successors
.entry(next_symbol) .entry(next_symbol)
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
.insert(successor, lookaheads);
} else { } else {
terminal_successors terminal_successors
.entry(next_symbol) .entry(next_symbol)
.or_insert_with(ParseItemSet::default) .or_insert_with(ParseItemSet::default)
}; .insert(successor, lookaheads);
let successor_entry = successor_set.insert(successor); }
successor_entry.lookaheads.insert_all(lookaheads);
successor_entry.following_reserved_word_set = successor_entry
.following_reserved_word_set
.max(*reserved_lookaheads);
} }
// If the item is finished, then add a Reduce action to this state based // If the item is finished, then add a Reduce action to this state based
// on this item. // on this item.
@ -523,7 +293,7 @@ impl<'a> ParseTableBuilder<'a> {
} }
} }
reduction_info.precedence.clone_from(precedence); reduction_info.precedence = precedence.clone();
if let Err(i) = reduction_info.symbols.binary_search(&symbol) { if let Err(i) = reduction_info.symbols.binary_search(&symbol) {
reduction_info.symbols.insert(i, symbol); reduction_info.symbols.insert(i, symbol);
} }
@ -536,9 +306,7 @@ impl<'a> ParseTableBuilder<'a> {
} }
} }
preceding_auxiliary_symbols.dedup(); // Having computed the the successor item sets for each symbol, add a new
// Having computed the successor item sets for each symbol, add a new
// parse state for each of these item sets, and add a corresponding Shift // parse state for each of these item sets, and add a corresponding Shift
// action to this state. // action to this state.
for (symbol, next_item_set) in terminal_successors { for (symbol, next_item_set) in terminal_successors {
@ -597,7 +365,7 @@ impl<'a> ParseTableBuilder<'a> {
)?; )?;
} }
// Add actions for the grammar's `extra` symbols. // Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id]; let state = &mut self.parse_table.states[state_id];
let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra(); let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra();
@ -609,7 +377,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set let parent_symbols = item_set
.entries .entries
.iter() .iter()
.filter_map(|ParseItemSetEntry { item, .. }| { .filter_map(|(item, _)| {
if !item.is_augmented() && item.step_index > 0 { if !item.is_augmented() && item.step_index > 0 {
Some(item.variable_index) Some(item.variable_index)
} else { } else {
@ -617,18 +385,15 @@ impl<'a> ParseTableBuilder<'a> {
} }
}) })
.collect::<HashSet<_>>(); .collect::<HashSet<_>>();
let parent_symbol_names = parent_symbols let mut message =
.iter() "Extra rules must have unambiguous endings. Conflicting rules: ".to_string();
.map(|&variable_index| { for (i, variable_index) in parent_symbols.iter().enumerate() {
self.syntax_grammar.variables[variable_index as usize] if i > 0 {
.name message += ", ";
.clone() }
}) message += &self.syntax_grammar.variables[*variable_index as usize].name;
.collect::<Vec<_>>(); }
return Err(anyhow!(message));
Err(AmbiguousExtraError {
parent_symbols: parent_symbol_names,
})?;
} }
} }
// Add actions for the start tokens of each non-terminal extra rule. // Add actions for the start tokens of each non-terminal extra rule.
@ -666,30 +431,6 @@ impl<'a> ParseTableBuilder<'a> {
} }
} }
if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
let reserved_word_set_id = item_set
.entries
.iter()
.filter_map(|entry| {
if let Some(next_step) = entry.item.step() {
if next_step.symbol == keyword_capture_token {
Some(next_step.reserved_word_set_id)
} else {
None
}
} else if entry.lookaheads.contains(&keyword_capture_token) {
Some(entry.following_reserved_word_set)
} else {
None
}
})
.max();
if let Some(reserved_word_set_id) = reserved_word_set_id {
state.reserved_words =
self.syntax_grammar.reserved_word_sets[reserved_word_set_id.0].clone();
}
}
Ok(()) Ok(())
} }
@ -701,7 +442,7 @@ impl<'a> ParseTableBuilder<'a> {
preceding_auxiliary_symbols: &[AuxiliarySymbolInfo], preceding_auxiliary_symbols: &[AuxiliarySymbolInfo],
conflicting_lookahead: Symbol, conflicting_lookahead: Symbol,
reduction_info: &ReductionInfo, reduction_info: &ReductionInfo,
) -> BuildTableResult<()> { ) -> Result<()> {
let entry = self.parse_table.states[state_id] let entry = self.parse_table.states[state_id]
.terminal_entries .terminal_entries
.get_mut(&conflicting_lookahead) .get_mut(&conflicting_lookahead)
@ -714,12 +455,9 @@ impl<'a> ParseTableBuilder<'a> {
// REDUCE-REDUCE conflicts where all actions have the *same* // REDUCE-REDUCE conflicts where all actions have the *same*
// precedence, and there can still be SHIFT/REDUCE conflicts. // precedence, and there can still be SHIFT/REDUCE conflicts.
let mut considered_associativity = false; let mut considered_associativity = false;
let mut shift_precedence = Vec::<(&Precedence, Symbol)>::new(); let mut shift_precedence: Vec<(&Precedence, Symbol)> = Vec::new();
let mut conflicting_items = BTreeSet::new(); let mut conflicting_items = HashSet::new();
for ParseItemSetEntry { for (item, lookaheads) in &item_set.entries {
item, lookaheads, ..
} in &item_set.entries
{
if let Some(step) = item.step() { if let Some(step) = item.step() {
if item.step_index > 0 if item.step_index > 0
&& self && self
@ -856,55 +594,93 @@ impl<'a> ParseTableBuilder<'a> {
return Ok(()); return Ok(());
} }
let mut conflict_error = ConflictError::default(); let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols { for symbol in preceding_symbols {
conflict_error write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
.symbol_sequence
.push(self.symbol_name(symbol));
} }
conflict_error.conflicting_lookahead = self.symbol_name(&conflicting_lookahead);
let interpretations = conflicting_items write!(
&mut msg,
" • {} …\n\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
write!(&mut msg, "Possible interpretations:\n\n").unwrap();
let mut interpretations = conflicting_items
.iter() .iter()
.map(|item| { .map(|item| {
let preceding_symbols = preceding_symbols let mut line = String::new();
for preceding_symbol in preceding_symbols
.iter() .iter()
.take(preceding_symbols.len() - item.step_index as usize) .take(preceding_symbols.len() - item.step_index as usize)
.map(|symbol| self.symbol_name(symbol)) {
.collect::<Vec<_>>(); write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
let variable_name = self.syntax_grammar.variables[item.variable_index as usize] write!(
.name &mut line,
.clone(); " ({}",
&self.syntax_grammar.variables[item.variable_index as usize].name
)
.unwrap();
let production_step_symbols = item for (j, step) in item.production.steps.iter().enumerate() {
.production if j as u32 == item.step_index {
.steps write!(&mut line, "").unwrap();
.iter() }
.map(|step| self.symbol_name(&step.symbol)) write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap();
.collect::<Vec<_>>(); }
let precedence = match item.precedence() { write!(&mut line, ")").unwrap();
Precedence::None => None,
_ => Some(item.precedence().to_string()), if item.is_done() {
write!(
&mut line,
" • {} …",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence();
let associativity = item.associativity();
let prec_line = if let Some(associativity) = associativity {
Some(format!(
"(precedence: {precedence}, associativity: {associativity:?})",
))
} else if !precedence.is_none() {
Some(format!("(precedence: {precedence})"))
} else {
None
}; };
let associativity = item.associativity().map(|assoc| format!("{assoc:?}")); (line, prec_line)
Interpretation {
preceding_symbols,
variable_name,
production_step_symbols,
step_index: item.step_index,
done: item.is_done(),
conflicting_lookahead: self.symbol_name(&conflicting_lookahead),
precedence,
associativity,
}
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
conflict_error.possible_interpretations = interpretations;
let max_interpretation_length = interpretations
.iter()
.map(|i| i.0.chars().count())
.max()
.unwrap();
interpretations.sort_unstable();
for (i, (line, prec_suffix)) in interpretations.into_iter().enumerate() {
write!(&mut msg, " {}:", i + 1).unwrap();
msg += &line;
if let Some(prec_suffix) = prec_suffix {
for _ in line.chars().count()..max_interpretation_length {
msg.push(' ');
}
msg += " ";
msg += &prec_suffix;
}
msg.push('\n');
}
let mut resolution_count = 0;
write!(&mut msg, "\nPossible resolutions:\n\n").unwrap();
let mut shift_items = Vec::new(); let mut shift_items = Vec::new();
let mut reduce_items = Vec::new(); let mut reduce_items = Vec::new();
for item in conflicting_items { for item in conflicting_items {
@ -917,57 +693,76 @@ impl<'a> ParseTableBuilder<'a> {
shift_items.sort_unstable(); shift_items.sort_unstable();
reduce_items.sort_unstable(); reduce_items.sort_unstable();
let get_rule_names = |items: &[&ParseItem]| -> Vec<String> { let list_rule_names = |mut msg: &mut String, items: &[&ParseItem]| {
let mut last_rule_id = None; let mut last_rule_id = None;
let mut result = Vec::with_capacity(items.len());
for item in items { for item in items {
if last_rule_id == Some(item.variable_index) { if last_rule_id == Some(item.variable_index) {
continue; continue;
} }
last_rule_id = Some(item.variable_index);
result.push(self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)));
}
result if last_rule_id.is_some() {
write!(&mut msg, " and").unwrap();
}
last_rule_id = Some(item.variable_index);
write!(
msg,
" `{}`",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
}; };
if actual_conflict.len() > 1 { if actual_conflict.len() > 1 {
if !shift_items.is_empty() { if !shift_items.is_empty() {
let names = get_rule_names(&shift_items); resolution_count += 1;
conflict_error write!(
.possible_resolutions &mut msg,
.push(Resolution::Precedence { symbols: names }); " {resolution_count}: Specify a higher precedence in",
)
.unwrap();
list_rule_names(&mut msg, &shift_items);
writeln!(&mut msg, " than in the other rules.").unwrap();
} }
for item in &reduce_items { for item in &reduce_items {
let name = self.symbol_name(&Symbol::non_terminal(item.variable_index as usize)); resolution_count += 1;
conflict_error writeln!(
.possible_resolutions &mut msg,
.push(Resolution::Precedence { " {resolution_count}: Specify a higher precedence in `{}` than in the other rules.",
symbols: vec![name], self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
}); )
.unwrap();
} }
} }
if considered_associativity { if considered_associativity {
let names = get_rule_names(&reduce_items); resolution_count += 1;
conflict_error write!(
.possible_resolutions &mut msg,
.push(Resolution::Associativity { symbols: names }); " {resolution_count}: Specify a left or right associativity in",
)
.unwrap();
list_rule_names(&mut msg, &reduce_items);
writeln!(&mut msg).unwrap();
} }
conflict_error resolution_count += 1;
.possible_resolutions write!(
.push(Resolution::AddConflict { &mut msg,
symbols: actual_conflict " {resolution_count}: Add a conflict for these rules: ",
.iter() )
.map(|s| self.symbol_name(s)) .unwrap();
.collect(), for (i, symbol) in actual_conflict.iter().enumerate() {
}); if i > 0 {
write!(&mut msg, ", ").unwrap();
}
write!(&mut msg, "`{}`", self.symbol_name(symbol)).unwrap();
}
writeln!(&mut msg).unwrap();
self.actual_conflicts.insert(actual_conflict); Err(anyhow!(msg))
Err(conflict_error)?
} }
fn compare_precedence( fn compare_precedence(
@ -1036,7 +831,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set let parent_symbols = item_set
.entries .entries
.iter() .iter()
.filter_map(|ParseItemSetEntry { item, .. }| { .filter_map(|(item, _)| {
let variable_index = item.variable_index as usize; let variable_index = item.variable_index as usize;
if item.symbol() == Some(symbol) if item.symbol() == Some(symbol)
&& !self.syntax_grammar.variables[variable_index].is_auxiliary() && !self.syntax_grammar.variables[variable_index].is_auxiliary()
@ -1124,24 +919,84 @@ impl<'a> ParseTableBuilder<'a> {
if variable.kind == VariableType::Named { if variable.kind == VariableType::Named {
variable.name.clone() variable.name.clone()
} else { } else {
format!("'{}'", variable.name) format!("'{}'", &variable.name)
} }
} }
} }
} }
} }
fn populate_following_tokens(
result: &mut [TokenSet],
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
}
result[extra.index] = all_tokens.clone();
}
}
}
pub fn build_parse_table<'a>( pub fn build_parse_table<'a>(
syntax_grammar: &'a SyntaxGrammar, syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar, lexical_grammar: &'a LexicalGrammar,
item_set_builder: ParseItemSetBuilder<'a>, inlines: &'a InlinedProductionMap,
variable_info: &'a [VariableInfo], variable_info: &'a [VariableInfo],
) -> BuildTableResult<(ParseTable, Vec<ParseStateInfo<'a>>)> { ) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
ParseTableBuilder::new( let actual_conflicts = syntax_grammar.expected_conflicts.iter().cloned().collect();
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let (table, item_sets) = ParseTableBuilder {
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
item_set_builder, item_set_builder,
variable_info, variable_info,
) non_terminal_extra_states: Vec::new(),
.build() actual_conflicts,
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
.build()?;
Ok((table, following_tokens, item_sets))
} }

View file

@ -1,11 +1,8 @@
use crate::generate::grammars::LexicalGrammar;
use crate::generate::rules::Symbol;
use crate::generate::tables::{ParseStateId, ParseTable};
use std::fmt; use std::fmt;
use crate::{
grammars::LexicalGrammar,
rules::Symbol,
tables::{ParseStateId, ParseTable},
};
pub struct CoincidentTokenIndex<'a> { pub struct CoincidentTokenIndex<'a> {
entries: Vec<Vec<ParseStateId>>, entries: Vec<Vec<ParseStateId>>,
grammar: &'a LexicalGrammar, grammar: &'a LexicalGrammar,
@ -55,7 +52,7 @@ impl<'a> CoincidentTokenIndex<'a> {
} }
} }
impl fmt::Debug for CoincidentTokenIndex<'_> { impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "CoincidentTokenIndex {{")?; writeln!(f, "CoincidentTokenIndex {{")?;

View file

@ -1,32 +1,26 @@
use std::{ use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
cmp::Ordering, use crate::generate::rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet};
fmt, use lazy_static::lazy_static;
hash::{Hash, Hasher}, use std::cmp::Ordering;
sync::LazyLock, use std::fmt;
}; use std::hash::{Hash, Hasher};
use std::u32;
use crate::{ lazy_static! {
grammars::{ static ref START_PRODUCTION: Production = Production {
LexicalGrammar, Production, ProductionStep, ReservedWordSetId, SyntaxGrammar, dynamic_precedence: 0,
NO_RESERVED_WORDS, steps: vec![ProductionStep {
}, symbol: Symbol {
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet}, index: 0,
}; kind: SymbolType::NonTerminal,
},
static START_PRODUCTION: LazyLock<Production> = LazyLock::new(|| Production { precedence: Precedence::None,
dynamic_precedence: 0, associativity: None,
steps: vec![ProductionStep { alias: None,
symbol: Symbol { field_name: None,
index: 0, }],
kind: SymbolType::NonTerminal, };
}, }
precedence: Precedence::None,
associativity: None,
alias: None,
field_name: None,
reserved_word_set_id: NO_RESERVED_WORDS,
}],
});
/// A [`ParseItem`] represents an in-progress match of a single production in a grammar. /// A [`ParseItem`] represents an in-progress match of a single production in a grammar.
#[derive(Clone, Copy, Debug)] #[derive(Clone, Copy, Debug)]
@ -59,14 +53,7 @@ pub struct ParseItem<'a> {
/// to a state in the final parse table. /// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq, Default)] #[derive(Clone, Debug, PartialEq, Eq, Default)]
pub struct ParseItemSet<'a> { pub struct ParseItemSet<'a> {
pub entries: Vec<ParseItemSetEntry<'a>>, pub entries: Vec<(ParseItem<'a>, TokenSet)>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ParseItemSetEntry<'a> {
pub item: ParseItem<'a>,
pub lookaheads: TokenSet,
pub following_reserved_word_set: ReservedWordSetId,
} }
/// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead /// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead
@ -141,7 +128,7 @@ impl<'a> ParseItem<'a> {
/// Create an item like this one, but advanced by one step. /// Create an item like this one, but advanced by one step.
#[must_use] #[must_use]
pub const fn successor(&self) -> Self { pub const fn successor(&self) -> ParseItem<'a> {
ParseItem { ParseItem {
variable_index: self.variable_index, variable_index: self.variable_index,
production: self.production, production: self.production,
@ -152,7 +139,7 @@ impl<'a> ParseItem<'a> {
/// Create an item identical to this one, but with a different production. /// Create an item identical to this one, but with a different production.
/// This is used when dynamically "inlining" certain symbols in a production. /// This is used when dynamically "inlining" certain symbols in a production.
pub const fn substitute_production(&self, production: &'a Production) -> Self { pub const fn substitute_production(&self, production: &'a Production) -> ParseItem<'a> {
let mut result = *self; let mut result = *self;
result.production = production; result.production = production;
result result
@ -160,31 +147,35 @@ impl<'a> ParseItem<'a> {
} }
impl<'a> ParseItemSet<'a> { impl<'a> ParseItemSet<'a> {
pub fn insert(&mut self, item: ParseItem<'a>) -> &mut ParseItemSetEntry<'a> { pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, TokenSet)>) -> Self {
match self.entries.binary_search_by(|e| e.item.cmp(&item)) { let mut result = Self::default();
for (item, lookaheads) in elements {
result.insert(item, &lookaheads);
}
result
}
pub fn insert(&mut self, item: ParseItem<'a>, lookaheads: &TokenSet) -> &mut TokenSet {
match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) {
Err(i) => { Err(i) => {
self.entries.insert( self.entries.insert(i, (item, lookaheads.clone()));
i, &mut self.entries[i].1
ParseItemSetEntry { }
item, Ok(i) => {
lookaheads: TokenSet::new(), self.entries[i].1.insert_all(lookaheads);
following_reserved_word_set: ReservedWordSetId::default(), &mut self.entries[i].1
},
);
&mut self.entries[i]
} }
Ok(i) => &mut self.entries[i],
} }
} }
pub fn core(&self) -> ParseItemSetCore<'a> { pub fn core(&self) -> ParseItemSetCore<'a> {
ParseItemSetCore { ParseItemSetCore {
entries: self.entries.iter().map(|e| e.item).collect(), entries: self.entries.iter().map(|e| e.0).collect(),
} }
} }
} }
impl fmt::Display for ParseItemDisplay<'_> { impl<'a> fmt::Display for ParseItemDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if self.0.is_augmented() { if self.0.is_augmented() {
write!(f, "START →")?; write!(f, "START →")?;
@ -192,42 +183,35 @@ impl fmt::Display for ParseItemDisplay<'_> {
write!( write!(
f, f,
"{} →", "{} →",
self.1.variables[self.0.variable_index as usize].name &self.1.variables[self.0.variable_index as usize].name
)?; )?;
} }
for (i, step) in self.0.production.steps.iter().enumerate() { for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize { if i == self.0.step_index as usize {
write!(f, "")?; write!(f, "")?;
if !step.precedence.is_none() if let Some(associativity) = step.associativity {
|| step.associativity.is_some() if step.precedence.is_none() {
|| step.reserved_word_set_id != ReservedWordSetId::default() write!(f, " ({associativity:?})")?;
{ } else {
write!(f, " (")?; write!(f, " ({} {associativity:?})", step.precedence)?;
if !step.precedence.is_none() {
write!(f, " {}", step.precedence)?;
} }
if let Some(associativity) = step.associativity { } else if !step.precedence.is_none() {
write!(f, " {associativity:?}")?; write!(f, " ({})", step.precedence)?;
}
if step.reserved_word_set_id != ReservedWordSetId::default() {
write!(f, "reserved: {}", step.reserved_word_set_id)?;
}
write!(f, " )")?;
} }
} }
write!(f, " ")?; write!(f, " ")?;
if step.symbol.is_terminal() { if step.symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(step.symbol.index) { if let Some(variable) = self.2.variables.get(step.symbol.index) {
write!(f, "{}", variable.name)?; write!(f, "{}", &variable.name)?;
} else { } else {
write!(f, "terminal-{}", step.symbol.index)?; write!(f, "terminal-{}", step.symbol.index)?;
} }
} else if step.symbol.is_external() { } else if step.symbol.is_external() {
write!(f, "{}", self.1.external_tokens[step.symbol.index].name)?; write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
} else { } else {
write!(f, "{}", self.1.variables[step.symbol.index].name)?; write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
} }
if let Some(alias) = &step.alias { if let Some(alias) = &step.alias {
@ -254,33 +238,7 @@ impl fmt::Display for ParseItemDisplay<'_> {
} }
} }
const fn escape_invisible(c: char) -> Option<&'static str> { impl<'a> fmt::Display for TokenSetDisplay<'a> {
Some(match c {
'\n' => "\\n",
'\r' => "\\r",
'\t' => "\\t",
'\0' => "\\0",
'\\' => "\\\\",
'\x0b' => "\\v",
'\x0c' => "\\f",
_ => return None,
})
}
fn display_variable_name(source: &str) -> String {
source
.chars()
.fold(String::with_capacity(source.len()), |mut acc, c| {
if let Some(esc) = escape_invisible(c) {
acc.push_str(esc);
} else {
acc.push(c);
}
acc
})
}
impl fmt::Display for TokenSetDisplay<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "[")?; write!(f, "[")?;
for (i, symbol) in self.0.iter().enumerate() { for (i, symbol) in self.0.iter().enumerate() {
@ -290,14 +248,14 @@ impl fmt::Display for TokenSetDisplay<'_> {
if symbol.is_terminal() { if symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(symbol.index) { if let Some(variable) = self.2.variables.get(symbol.index) {
write!(f, "{}", display_variable_name(&variable.name))?; write!(f, "{}", &variable.name)?;
} else { } else {
write!(f, "terminal-{}", symbol.index)?; write!(f, "terminal-{}", symbol.index)?;
} }
} else if symbol.is_external() { } else if symbol.is_external() {
write!(f, "{}", self.1.external_tokens[symbol.index].name)?; write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
} else { } else {
write!(f, "{}", self.1.variables[symbol.index].name)?; write!(f, "{}", &self.1.variables[symbol.index].name)?;
} }
} }
write!(f, "]")?; write!(f, "]")?;
@ -305,29 +263,21 @@ impl fmt::Display for TokenSetDisplay<'_> {
} }
} }
impl fmt::Display for ParseItemSetDisplay<'_> { impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for entry in &self.0.entries { for (item, lookaheads) in &self.0.entries {
write!( writeln!(
f, f,
"{}\t{}", "{}\t{}",
ParseItemDisplay(&entry.item, self.1, self.2), ParseItemDisplay(item, self.1, self.2),
TokenSetDisplay(&entry.lookaheads, self.1, self.2), TokenSetDisplay(lookaheads, self.1, self.2)
)?; )?;
if entry.following_reserved_word_set != ReservedWordSetId::default() {
write!(
f,
"\treserved word set: {}",
entry.following_reserved_word_set
)?;
}
writeln!(f)?;
} }
Ok(()) Ok(())
} }
} }
impl Hash for ParseItem<'_> { impl<'a> Hash for ParseItem<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_u32(self.variable_index); hasher.write_u32(self.variable_index);
hasher.write_u32(self.step_index); hasher.write_u32(self.step_index);
@ -341,7 +291,7 @@ impl Hash for ParseItem<'_> {
// this item, unless any of the following are true: // this item, unless any of the following are true:
// * the children have fields // * the children have fields
// * the children have aliases // * the children have aliases
// * the children are hidden and represent rules that have fields. // * the children are hidden and
// See the docs for `has_preceding_inherited_fields`. // See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] { for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher); step.alias.hash(hasher);
@ -356,7 +306,7 @@ impl Hash for ParseItem<'_> {
} }
} }
impl PartialEq for ParseItem<'_> { impl<'a> PartialEq for ParseItem<'a> {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
if self.variable_index != other.variable_index if self.variable_index != other.variable_index
|| self.step_index != other.step_index || self.step_index != other.step_index
@ -393,7 +343,7 @@ impl PartialEq for ParseItem<'_> {
} }
} }
impl Ord for ParseItem<'_> { impl<'a> Ord for ParseItem<'a> {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
self.step_index self.step_index
.cmp(&other.step_index) .cmp(&other.step_index)
@ -433,26 +383,25 @@ impl Ord for ParseItem<'_> {
} }
} }
impl PartialOrd for ParseItem<'_> { impl<'a> PartialOrd for ParseItem<'a> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl Eq for ParseItem<'_> {} impl<'a> Eq for ParseItem<'a> {}
impl Hash for ParseItemSet<'_> { impl<'a> Hash for ParseItemSet<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len()); hasher.write_usize(self.entries.len());
for entry in &self.entries { for (item, lookaheads) in &self.entries {
entry.item.hash(hasher); item.hash(hasher);
entry.lookaheads.hash(hasher); lookaheads.hash(hasher);
entry.following_reserved_word_set.hash(hasher);
} }
} }
} }
impl Hash for ParseItemSetCore<'_> { impl<'a> Hash for ParseItemSetCore<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) { fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len()); hasher.write_usize(self.entries.len());
for item in &self.entries { for item in &self.entries {

View file

@ -1,13 +1,8 @@
use std::{ use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay};
collections::{HashMap, HashSet}, use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
fmt, use crate::generate::rules::{Symbol, SymbolType, TokenSet};
}; use std::collections::{HashMap, HashSet};
use std::fmt;
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, ParseItemSetEntry, TokenSetDisplay};
use crate::{
grammars::{InlinedProductionMap, LexicalGrammar, ReservedWordSetId, SyntaxGrammar},
rules::{Symbol, SymbolType, TokenSet},
};
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
struct TransitiveClosureAddition<'a> { struct TransitiveClosureAddition<'a> {
@ -15,10 +10,9 @@ struct TransitiveClosureAddition<'a> {
info: FollowSetInfo, info: FollowSetInfo,
} }
#[derive(Clone, Debug, Default, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
struct FollowSetInfo { struct FollowSetInfo {
lookaheads: TokenSet, lookaheads: TokenSet,
reserved_lookaheads: ReservedWordSetId,
propagates_lookaheads: bool, propagates_lookaheads: bool,
} }
@ -26,7 +20,6 @@ pub struct ParseItemSetBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar, syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar, lexical_grammar: &'a LexicalGrammar,
first_sets: HashMap<Symbol, TokenSet>, first_sets: HashMap<Symbol, TokenSet>,
reserved_first_sets: HashMap<Symbol, ReservedWordSetId>,
last_sets: HashMap<Symbol, TokenSet>, last_sets: HashMap<Symbol, TokenSet>,
inlines: &'a InlinedProductionMap, inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>, transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
@ -48,7 +41,6 @@ impl<'a> ParseItemSetBuilder<'a> {
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
first_sets: HashMap::new(), first_sets: HashMap::new(),
reserved_first_sets: HashMap::new(),
last_sets: HashMap::new(), last_sets: HashMap::new(),
inlines, inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()], transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
@ -57,7 +49,8 @@ impl<'a> ParseItemSetBuilder<'a> {
// For each grammar symbol, populate the FIRST and LAST sets: the set of // For each grammar symbol, populate the FIRST and LAST sets: the set of
// terminals that appear at the beginning and end that symbol's productions, // terminals that appear at the beginning and end that symbol's productions,
// respectively. // respectively.
// For a terminal symbol, the FIRST and LAST sets just consist of the //
// For a terminal symbol, the FIRST and LAST set just consists of the
// terminal itself. // terminal itself.
for i in 0..lexical_grammar.variables.len() { for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i); let symbol = Symbol::terminal(i);
@ -65,9 +58,6 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol); set.insert(symbol);
result.first_sets.insert(symbol, set.clone()); result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set); result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
} }
for i in 0..syntax_grammar.external_tokens.len() { for i in 0..syntax_grammar.external_tokens.len() {
@ -76,15 +66,12 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol); set.insert(symbol);
result.first_sets.insert(symbol, set.clone()); result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set); result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
} }
// The FIRST set of a non-terminal `i` is the union of the FIRST sets // The FIRST set of a non-terminal `i` is the union of the following sets:
// of all the symbols that appear at the beginnings of i's productions. Some // * the set of all terminals that appear at the beginnings of i's productions
// of these symbols may themselves be non-terminals, so this is a recursive // * the FIRST sets of all the non-terminals that appear at the beginnings
// definition. // of i's productions
// //
// Rather than computing these sets using recursion, we use an explicit stack // Rather than computing these sets using recursion, we use an explicit stack
// called `symbols_to_process`. // called `symbols_to_process`.
@ -92,36 +79,37 @@ impl<'a> ParseItemSetBuilder<'a> {
let mut processed_non_terminals = HashSet::new(); let mut processed_non_terminals = HashSet::new();
for i in 0..syntax_grammar.variables.len() { for i in 0..syntax_grammar.variables.len() {
let symbol = Symbol::non_terminal(i); let symbol = Symbol::non_terminal(i);
let first_set = result.first_sets.entry(symbol).or_default();
let reserved_first_set = result.reserved_first_sets.entry(symbol).or_default();
let first_set = result
.first_sets
.entry(symbol)
.or_insert_with(TokenSet::new);
processed_non_terminals.clear(); processed_non_terminals.clear();
symbols_to_process.clear(); symbols_to_process.clear();
symbols_to_process.push(symbol); symbols_to_process.push(symbol);
while let Some(sym) = symbols_to_process.pop() { while let Some(current_symbol) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions { if current_symbol.is_terminal() || current_symbol.is_external() {
if let Some(step) = production.steps.first() { first_set.insert(current_symbol);
if step.symbol.is_terminal() || step.symbol.is_external() { } else if processed_non_terminals.insert(current_symbol) {
first_set.insert(step.symbol); for production in &syntax_grammar.variables[current_symbol.index].productions {
} else if processed_non_terminals.insert(step.symbol) { if let Some(step) = production.steps.first() {
symbols_to_process.push(step.symbol); symbols_to_process.push(step.symbol);
} }
*reserved_first_set = (*reserved_first_set).max(step.reserved_word_set_id);
} }
} }
} }
// The LAST set is defined in a similar way to the FIRST set. // The LAST set is defined in a similar way to the FIRST set.
let last_set = result.last_sets.entry(symbol).or_default(); let last_set = result.last_sets.entry(symbol).or_insert_with(TokenSet::new);
processed_non_terminals.clear(); processed_non_terminals.clear();
symbols_to_process.clear(); symbols_to_process.clear();
symbols_to_process.push(symbol); symbols_to_process.push(symbol);
while let Some(sym) = symbols_to_process.pop() { while let Some(current_symbol) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions { if current_symbol.is_terminal() || current_symbol.is_external() {
if let Some(step) = production.steps.last() { last_set.insert(current_symbol);
if step.symbol.is_terminal() || step.symbol.is_external() { } else if processed_non_terminals.insert(current_symbol) {
last_set.insert(step.symbol); for production in &syntax_grammar.variables[current_symbol.index].productions {
} else if processed_non_terminals.insert(step.symbol) { if let Some(step) = production.steps.last() {
symbols_to_process.push(step.symbol); symbols_to_process.push(step.symbol);
} }
} }
@ -131,75 +119,67 @@ impl<'a> ParseItemSetBuilder<'a> {
// To compute an item set's transitive closure, we find each item in the set // To compute an item set's transitive closure, we find each item in the set
// whose next symbol is a non-terminal, and we add new items to the set for // whose next symbol is a non-terminal, and we add new items to the set for
// each of that symbol's productions. These productions might themselves begin // each of that symbols' productions. These productions might themselves begin
// with non-terminals, so the process continues recursively. In this process, // with non-terminals, so the process continues recursively. In this process,
// the total set of entries that get added depends only on two things: // the total set of entries that get added depends only on two things:
// // * the set of non-terminal symbols that occur at each item's current position
// * the non-terminal symbol that occurs next in each item // * the set of terminals that occurs after each of these non-terminal symbols
//
// * the set of terminals that can follow that non-terminal symbol in the item
// //
// So we can avoid a lot of duplicated recursive work by precomputing, for each // So we can avoid a lot of duplicated recursive work by precomputing, for each
// non-terminal symbol `i`, a final list of *additions* that must be made to an // non-terminal symbol `i`, a final list of *additions* that must be made to an
// item set when symbol `i` occurs as the next symbol in one if its core items. // item set when `i` occurs as the next symbol in one if its core items. The
// The structure of a precomputed *addition* is as follows: // structure of an *addition* is as follows:
// * `item` - the new item that must be added as part of the expansion of `i`
// * `lookaheads` - lookahead tokens that can always come after that item in
// the expansion of `i`
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can
// occur at the *end* of the expansion of `i`, so that i's own current
// lookahead tokens can occur after `item`.
// //
// * `item` - the new item that must be added as part of the expansion of the symbol `i`. // Again, rather than computing these additions recursively, we use an explicit
// // stack called `entries_to_process`.
// * `lookaheads` - the set of possible lookahead tokens that can always come after `item`
// in an expansion of symbol `i`.
//
// * `reserved_lookaheads` - the set of reserved lookahead lookahead tokens that can
// always come after `item` in the expansion of symbol `i`.
//
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
// *end* of the expansion of symbol `i`, so that i's own current lookahead tokens can
// occur after `item`.
//
// Rather than computing these additions recursively, we use an explicit stack.
let empty_lookaheads = TokenSet::new();
let mut stack = Vec::new();
let mut follow_set_info_by_non_terminal = HashMap::<usize, FollowSetInfo>::new();
for i in 0..syntax_grammar.variables.len() { for i in 0..syntax_grammar.variables.len() {
let empty_lookaheads = TokenSet::new();
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
// First, build up a map whose keys are all of the non-terminals that can // First, build up a map whose keys are all of the non-terminals that can
// appear at the beginning of non-terminal `i`, and whose values store // appear at the beginning of non-terminal `i`, and whose values store
// information about the tokens that can follow those non-terminals. // information about the tokens that can follow each non-terminal.
stack.clear(); let mut follow_set_info_by_non_terminal = HashMap::new();
stack.push((i, &empty_lookaheads, ReservedWordSetId::default(), true)); while let Some(entry) = entries_to_process.pop() {
follow_set_info_by_non_terminal.clear(); let (variable_index, lookaheads, propagates_lookaheads) = entry;
while let Some((sym_ix, lookaheads, reserved_word_set_id, propagates_lookaheads)) = let existing_info = follow_set_info_by_non_terminal
stack.pop() .entry(variable_index)
{ .or_insert_with(|| FollowSetInfo {
let mut did_add = false; lookaheads: TokenSet::new(),
let info = follow_set_info_by_non_terminal.entry(sym_ix).or_default(); propagates_lookaheads: false,
did_add |= info.lookaheads.insert_all(lookaheads); });
if reserved_word_set_id > info.reserved_lookaheads {
info.reserved_lookaheads = reserved_word_set_id; let did_add_follow_set_info;
did_add = true; if propagates_lookaheads {
} did_add_follow_set_info = !existing_info.propagates_lookaheads;
did_add |= propagates_lookaheads && !info.propagates_lookaheads; existing_info.propagates_lookaheads = true;
info.propagates_lookaheads |= propagates_lookaheads; } else {
if !did_add { did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
continue;
} }
for production in &syntax_grammar.variables[sym_ix].productions { if did_add_follow_set_info {
if let Some(symbol) = production.first_symbol() { for production in &syntax_grammar.variables[variable_index].productions {
if symbol.is_non_terminal() { if let Some(symbol) = production.first_symbol() {
if let Some(next_step) = production.steps.get(1) { if symbol.is_non_terminal() {
stack.push(( if production.steps.len() == 1 {
symbol.index, entries_to_process.push((
&result.first_sets[&next_step.symbol], symbol.index,
result.reserved_first_sets[&next_step.symbol], lookaheads,
false, propagates_lookaheads,
)); ));
} else { } else {
stack.push(( entries_to_process.push((
symbol.index, symbol.index,
lookaheads, &result.first_sets[&production.steps[1].symbol],
reserved_word_set_id, false,
propagates_lookaheads, ));
)); }
} }
} }
} }
@ -209,7 +189,7 @@ impl<'a> ParseItemSetBuilder<'a> {
// Store all of those non-terminals' productions, along with their associated // Store all of those non-terminals' productions, along with their associated
// lookahead info, as *additions* associated with non-terminal `i`. // lookahead info, as *additions* associated with non-terminal `i`.
let additions_for_non_terminal = &mut result.transitive_closure_additions[i]; let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (&variable_index, follow_set_info) in &follow_set_info_by_non_terminal { for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index]; let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index); let non_terminal = Symbol::non_terminal(variable_index);
let variable_index = variable_index as u32; let variable_index = variable_index as u32;
@ -252,25 +232,22 @@ impl<'a> ParseItemSetBuilder<'a> {
result result
} }
pub fn transitive_closure(&self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> { pub fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default(); let mut result = ParseItemSet::default();
for entry in &item_set.entries { for (item, lookaheads) in &item_set.entries {
if let Some(productions) = self if let Some(productions) = self
.inlines .inlines
.inlined_productions(entry.item.production, entry.item.step_index) .inlined_productions(item.production, item.step_index)
{ {
for production in productions { for production in productions {
self.add_item( self.add_item(
&mut result, &mut result,
&ParseItemSetEntry { item.substitute_production(production),
item: entry.item.substitute_production(production), lookaheads,
lookaheads: entry.lookaheads.clone(),
following_reserved_word_set: entry.following_reserved_word_set,
},
); );
} }
} else { } else {
self.add_item(&mut result, entry); self.add_item(&mut result, *item, lookaheads);
} }
} }
result result
@ -280,68 +257,34 @@ impl<'a> ParseItemSetBuilder<'a> {
&self.first_sets[symbol] &self.first_sets[symbol]
} }
pub fn reserved_first_set(&self, symbol: &Symbol) -> Option<&TokenSet> {
let id = *self.reserved_first_sets.get(symbol)?;
Some(&self.syntax_grammar.reserved_word_sets[id.0])
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet { pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.last_sets[symbol] &self.last_sets[symbol]
} }
fn add_item(&self, set: &mut ParseItemSet<'a>, entry: &ParseItemSetEntry<'a>) { fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
if let Some(step) = entry.item.step() { if let Some(step) = item.step() {
if step.symbol.is_non_terminal() { if step.symbol.is_non_terminal() {
let next_step = entry.item.successor().step(); let next_step = item.successor().step();
// Determine which tokens can follow this non-terminal. // Determine which tokens can follow this non-terminal.
let (following_tokens, following_reserved_tokens) = let following_tokens = next_step.map_or(lookaheads, |next_step| {
if let Some(next_step) = next_step { self.first_sets.get(&next_step.symbol).unwrap()
( });
self.first_sets.get(&next_step.symbol).unwrap(),
*self.reserved_first_sets.get(&next_step.symbol).unwrap(),
)
} else {
(&entry.lookaheads, entry.following_reserved_word_set)
};
// Use the pre-computed *additions* to expand the non-terminal. // Use the pre-computed *additions* to expand the non-terminal.
for addition in &self.transitive_closure_additions[step.symbol.index] { for addition in &self.transitive_closure_additions[step.symbol.index] {
let entry = set.insert(addition.item); let lookaheads = set.insert(addition.item, &addition.info.lookaheads);
entry.lookaheads.insert_all(&addition.info.lookaheads);
if let Some(word_token) = self.syntax_grammar.word_token {
if addition.info.lookaheads.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(addition.info.reserved_lookaheads);
}
}
if addition.info.propagates_lookaheads { if addition.info.propagates_lookaheads {
entry.lookaheads.insert_all(following_tokens); lookaheads.insert_all(following_tokens);
if let Some(word_token) = self.syntax_grammar.word_token {
if following_tokens.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(following_reserved_tokens);
}
}
} }
} }
} }
} }
set.insert(item, lookaheads);
let e = set.insert(entry.item);
e.lookaheads.insert_all(&entry.lookaheads);
e.following_reserved_word_set = e
.following_reserved_word_set
.max(entry.following_reserved_word_set);
} }
} }
impl fmt::Debug for ParseItemSetBuilder<'_> { impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "ParseItemSetBuilder {{")?; writeln!(f, "ParseItemSetBuilder {{")?;

View file

@ -1,18 +1,13 @@
use std::{
collections::{HashMap, HashSet},
mem,
};
use log::debug;
use super::token_conflicts::TokenConflictMap; use super::token_conflicts::TokenConflictMap;
use crate::{ use crate::generate::dedup::split_state_id_groups;
dedup::split_state_id_groups, use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
grammars::{LexicalGrammar, SyntaxGrammar, VariableType}, use crate::generate::rules::{AliasMap, Symbol, TokenSet};
rules::{AliasMap, Symbol, TokenSet}, use crate::generate::tables::{
tables::{GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry}, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
OptLevel,
}; };
use log::info;
use std::collections::{HashMap, HashSet};
use std::mem;
pub fn minimize_parse_table( pub fn minimize_parse_table(
parse_table: &mut ParseTable, parse_table: &mut ParseTable,
@ -21,7 +16,6 @@ pub fn minimize_parse_table(
simple_aliases: &AliasMap, simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap, token_conflict_map: &TokenConflictMap,
keywords: &TokenSet, keywords: &TokenSet,
optimizations: OptLevel,
) { ) {
let mut minimizer = Minimizer { let mut minimizer = Minimizer {
parse_table, parse_table,
@ -31,9 +25,7 @@ pub fn minimize_parse_table(
keywords, keywords,
simple_aliases, simple_aliases,
}; };
if optimizations.contains(OptLevel::MergeStates) { minimizer.merge_compatible_states();
minimizer.merge_compatible_states();
}
minimizer.remove_unit_reductions(); minimizer.remove_unit_reductions();
minimizer.remove_unused_states(); minimizer.remove_unused_states();
minimizer.reorder_states_by_descending_size(); minimizer.reorder_states_by_descending_size();
@ -48,7 +40,7 @@ struct Minimizer<'a> {
simple_aliases: &'a AliasMap, simple_aliases: &'a AliasMap,
} }
impl Minimizer<'_> { impl<'a> Minimizer<'a> {
fn remove_unit_reductions(&mut self) { fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new(); let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables { for variable in &self.syntax_grammar.variables {
@ -74,17 +66,18 @@ impl Minimizer<'_> {
production_id: 0, production_id: 0,
symbol, symbol,
.. ..
} if !self.simple_aliases.contains_key(symbol) } => {
&& !self.syntax_grammar.supertype_symbols.contains(symbol) if !self.simple_aliases.contains_key(symbol)
&& !self.syntax_grammar.extra_symbols.contains(symbol) && !self.syntax_grammar.supertype_symbols.contains(symbol)
&& !aliased_symbols.contains(symbol) && !aliased_symbols.contains(symbol)
&& self.syntax_grammar.variables[symbol.index].kind && self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named != VariableType::Named
&& (unit_reduction_symbol.is_none() && (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol)) => || unit_reduction_symbol == Some(symbol))
{ {
unit_reduction_symbol = Some(symbol); unit_reduction_symbol = Some(symbol);
continue; continue;
}
} }
_ => {} _ => {}
} }
@ -155,7 +148,9 @@ impl Minimizer<'_> {
&mut group_ids_by_state_id, &mut group_ids_by_state_id,
0, 0,
|left, right, groups| self.state_successors_differ(left, right, groups), |left, right, groups| self.state_successors_differ(left, right, groups),
) {} ) {
continue;
}
let error_group_index = state_ids_by_group_id let error_group_index = state_ids_by_group_id
.iter() .iter()
@ -172,12 +167,17 @@ impl Minimizer<'_> {
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len()); let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id { for state_ids in &state_ids_by_group_id {
// Initialize the new state based on the first old state in the group. // Initialize the new state based on the first old state in the group.
let mut parse_state = mem::take(&mut self.parse_table.states[state_ids[0]]); let mut parse_state = ParseState::default();
mem::swap(&mut parse_state, &mut self.parse_table.states[state_ids[0]]);
// Extend the new state with all of the actions from the other old states // Extend the new state with all of the actions from the other old states
// in the group. // in the group.
for state_id in &state_ids[1..] { for state_id in &state_ids[1..] {
let other_parse_state = mem::take(&mut self.parse_table.states[*state_id]); let mut other_parse_state = ParseState::default();
mem::swap(
&mut other_parse_state,
&mut self.parse_table.states[*state_id],
);
parse_state parse_state
.terminal_entries .terminal_entries
@ -185,12 +185,6 @@ impl Minimizer<'_> {
parse_state parse_state
.nonterminal_entries .nonterminal_entries
.extend(other_parse_state.nonterminal_entries); .extend(other_parse_state.nonterminal_entries);
parse_state
.reserved_words
.insert_all(&other_parse_state.reserved_words);
for symbol in parse_state.terminal_entries.keys() {
parse_state.reserved_words.remove(symbol);
}
} }
// Update the new state's outgoing references using the new grouping. // Update the new state's outgoing references using the new grouping.
@ -219,14 +213,24 @@ impl Minimizer<'_> {
) { ) {
return true; return true;
} }
} else if self.token_conflicts(left_state.id, right_state.id, right_state, *token) { } else if self.token_conflicts(
left_state.id,
right_state.id,
right_state.terminal_entries.keys(),
*token,
) {
return true; return true;
} }
} }
for token in right_state.terminal_entries.keys() { for token in right_state.terminal_entries.keys() {
if !left_state.terminal_entries.contains_key(token) if !left_state.terminal_entries.contains_key(token)
&& self.token_conflicts(left_state.id, right_state.id, left_state, *token) && self.token_conflicts(
left_state.id,
right_state.id,
left_state.terminal_entries.keys(),
*token,
)
{ {
return true; return true;
} }
@ -248,7 +252,7 @@ impl Minimizer<'_> {
let group1 = group_ids_by_state_id[*s1]; let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2]; let group2 = group_ids_by_state_id[*s2];
if group1 != group2 { if group1 != group2 {
debug!( info!(
"split states {} {} - successors for {} are split: {s1} {s2}", "split states {} {} - successors for {} are split: {s1} {s2}",
state1.id, state1.id,
state2.id, state2.id,
@ -264,12 +268,12 @@ impl Minimizer<'_> {
for (symbol, s1) in &state1.nonterminal_entries { for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) { if let Some(s2) = state2.nonterminal_entries.get(symbol) {
match (s1, s2) { match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => {} (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => { (GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1]; let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2]; let group2 = group_ids_by_state_id[*s2];
if group1 != group2 { if group1 != group2 {
debug!( info!(
"split states {} {} - successors for {} are split: {s1} {s2}", "split states {} {} - successors for {} are split: {s1} {s2}",
state1.id, state1.id,
state2.id, state2.id,
@ -299,14 +303,16 @@ impl Minimizer<'_> {
let actions1 = &entry1.actions; let actions1 = &entry1.actions;
let actions2 = &entry2.actions; let actions2 = &entry2.actions;
if actions1.len() != actions2.len() { if actions1.len() != actions2.len() {
debug!( info!(
"split states {state_id1} {state_id2} - differing action counts for token {}", "split states {state_id1} {state_id2} - differing action counts for token {}",
self.symbol_name(token) self.symbol_name(token)
); );
return true; return true;
} }
for (action1, action2) in actions1.iter().zip(actions2.iter()) { for (i, action1) in actions1.iter().enumerate() {
let action2 = &actions2[i];
// Two shift actions are equivalent if their destinations are in the same group. // Two shift actions are equivalent if their destinations are in the same group.
if let ( if let (
ParseAction::Shift { ParseAction::Shift {
@ -324,13 +330,13 @@ impl Minimizer<'_> {
if group1 == group2 && is_repetition1 == is_repetition2 { if group1 == group2 && is_repetition1 == is_repetition2 {
continue; continue;
} }
debug!( info!(
"split states {state_id1} {state_id2} - successors for {} are split: {s1} {s2}", "split states {state_id1} {state_id2} - successors for {} are split: {s1} {s2}",
self.symbol_name(token), self.symbol_name(token),
); );
return true; return true;
} else if action1 != action2 { } else if action1 != action2 {
debug!( info!(
"split states {state_id1} {state_id2} - unequal actions for {}", "split states {state_id1} {state_id2} - unequal actions for {}",
self.symbol_name(token), self.symbol_name(token),
); );
@ -341,32 +347,28 @@ impl Minimizer<'_> {
false false
} }
fn token_conflicts( fn token_conflicts<'b>(
&self, &self,
left_id: ParseStateId, left_id: ParseStateId,
right_id: ParseStateId, right_id: ParseStateId,
right_state: &ParseState, existing_tokens: impl Iterator<Item = &'b Symbol>,
new_token: Symbol, new_token: Symbol,
) -> bool { ) -> bool {
if new_token == Symbol::end_of_nonterminal_extra() { if new_token == Symbol::end_of_nonterminal_extra() {
debug!("split states {left_id} {right_id} - end of non-terminal extra",); info!("split states {left_id} {right_id} - end of non-terminal extra",);
return true; return true;
} }
// Do not add external tokens; they could conflict lexically with any of the state's // Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens. // existing lookahead tokens.
if new_token.is_external() { if new_token.is_external() {
debug!( info!(
"split states {left_id} {right_id} - external token {}", "split states {left_id} {right_id} - external token {}",
self.symbol_name(&new_token), self.symbol_name(&new_token),
); );
return true; return true;
} }
if right_state.reserved_words.contains(&new_token) {
return false;
}
// Do not add tokens which are both internal and external. Their validity could // Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner. // influence the behavior of the external scanner.
if self if self
@ -375,7 +377,7 @@ impl Minimizer<'_> {
.iter() .iter()
.any(|external| external.corresponding_internal_token == Some(new_token)) .any(|external| external.corresponding_internal_token == Some(new_token))
{ {
debug!( info!(
"split states {left_id} {right_id} - internal/external token {}", "split states {left_id} {right_id} - internal/external token {}",
self.symbol_name(&new_token), self.symbol_name(&new_token),
); );
@ -383,30 +385,23 @@ impl Minimizer<'_> {
} }
// Do not add a token if it conflicts with an existing token. // Do not add a token if it conflicts with an existing token.
for token in right_state.terminal_entries.keys().copied() { for token in existing_tokens {
if !token.is_terminal() { if token.is_terminal()
continue; && !(self.syntax_grammar.word_token == Some(*token)
} && self.keywords.contains(&new_token))
if self.syntax_grammar.word_token == Some(token) && self.keywords.contains(&new_token) { && !(self.syntax_grammar.word_token == Some(new_token)
continue; && self.keywords.contains(token))
} && (self
if self.syntax_grammar.word_token == Some(new_token) && self.keywords.contains(&token) {
continue;
}
if self
.token_conflict_map
.does_conflict(new_token.index, token.index)
|| self
.token_conflict_map .token_conflict_map
.does_match_same_string(new_token.index, token.index) .does_conflict(new_token.index, token.index)
|| self
.token_conflict_map
.does_match_same_string(new_token.index, token.index))
{ {
debug!( info!(
"split states {} {} - token {} conflicts with {}", "split states {left_id} {right_id} - token {} conflicts with {}",
left_id,
right_id,
self.symbol_name(&new_token), self.symbol_name(&new_token),
self.symbol_name(&token), self.symbol_name(token),
); );
return true; return true;
} }

View file

@ -1,42 +1,25 @@
mod build_lex_table; pub mod build_lex_table;
mod build_parse_table; pub mod build_parse_table;
mod coincident_tokens; mod coincident_tokens;
mod item; mod item;
mod item_set_builder; mod item_set_builder;
mod minimize_parse_table; mod minimize_parse_table;
mod token_conflicts; mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::{build_parse_table, ParseStateInfo};
use self::coincident_tokens::CoincidentTokenIndex;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::NfaCursor;
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
use anyhow::Result;
use log::info;
use std::collections::{BTreeSet, HashMap}; use std::collections::{BTreeSet, HashMap};
pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT;
use build_parse_table::BuildTableResult;
pub use build_parse_table::ParseTableBuilderError;
use log::{debug, info};
use self::{
build_lex_table::build_lex_table,
build_parse_table::{build_parse_table, ParseStateInfo},
coincident_tokens::CoincidentTokenIndex,
item_set_builder::ParseItemSetBuilder,
minimize_parse_table::minimize_parse_table,
token_conflicts::TokenConflictMap,
};
use crate::{
grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
nfa::{CharacterSet, NfaCursor},
node_types::VariableInfo,
rules::{AliasMap, Symbol, SymbolType, TokenSet},
tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
OptLevel,
};
pub struct Tables {
pub parse_table: ParseTable,
pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}
pub fn build_tables( pub fn build_tables(
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
@ -44,17 +27,9 @@ pub fn build_tables(
variable_info: &[VariableInfo], variable_info: &[VariableInfo],
inlines: &InlinedProductionMap, inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>, report_symbol_name: Option<&str>,
optimizations: OptLevel, ) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
) -> BuildTableResult<Tables> { let (mut parse_table, following_tokens, parse_state_info) =
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines); build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let following_tokens =
get_following_tokens(syntax_grammar, lexical_grammar, inlines, &item_set_builder);
let (mut parse_table, parse_state_info) = build_parse_table(
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
let keywords = identify_keywords( let keywords = identify_keywords(
@ -80,9 +55,8 @@ pub fn build_tables(
simple_aliases, simple_aliases,
&token_conflict_map, &token_conflict_map,
&keywords, &keywords,
optimizations,
); );
let lex_tables = build_lex_table( let (main_lex_table, keyword_lex_table) = build_lex_table(
&mut parse_table, &mut parse_table,
syntax_grammar, syntax_grammar,
lexical_grammar, lexical_grammar,
@ -102,58 +76,12 @@ pub fn build_tables(
report_symbol_name, report_symbol_name,
); );
} }
Ok((
if parse_table.states.len() > u16::MAX as usize {
Err(ParseTableBuilderError::StateCount(parse_table.states.len()))?;
}
Ok(Tables {
parse_table, parse_table,
main_lex_table: lex_tables.main_lex_table, main_lex_table,
keyword_lex_table: lex_tables.keyword_lex_table, keyword_lex_table,
large_character_sets: lex_tables.large_character_sets, syntax_grammar.word_token,
}) ))
}
fn get_following_tokens(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) -> Vec<TokenSet> {
let mut result = vec![TokenSet::new(); lexical_grammar.variables.len()];
let productions = syntax_grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
let right_reserved_tokens = builder.reserved_first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
if let Some(reserved_tokens) = right_reserved_tokens {
result[left_token.index].insert_all_terminals(reserved_tokens);
}
}
}
}
}
for extra in &syntax_grammar.extra_symbols {
if extra.is_terminal() {
for entry in &mut result {
entry.insert(*extra);
}
result[extra.index] = all_tokens.clone();
}
}
result
} }
fn populate_error_state( fn populate_error_state(
@ -169,7 +97,7 @@ fn populate_error_state(
// First identify the *conflict-free tokens*: tokens that do not overlap with // First identify the *conflict-free tokens*: tokens that do not overlap with
// any other token in any way, besides matching exactly the same string. // any other token in any way, besides matching exactly the same string.
let conflict_free_tokens = (0..n) let conflict_free_tokens: TokenSet = (0..n)
.filter_map(|i| { .filter_map(|i| {
let conflicts_with_other_tokens = (0..n).any(|j| { let conflicts_with_other_tokens = (0..n).any(|j| {
j != i j != i
@ -179,14 +107,14 @@ fn populate_error_state(
if conflicts_with_other_tokens { if conflicts_with_other_tokens {
None None
} else { } else {
debug!( info!(
"error recovery - token {} has no conflicts", "error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
Some(Symbol::terminal(i)) Some(Symbol::terminal(i))
} }
}) })
.collect::<TokenSet>(); .collect();
let recover_entry = ParseTableEntry { let recover_entry = ParseTableEntry {
reusable: false, reusable: false,
@ -205,14 +133,14 @@ fn populate_error_state(
!coincident_token_index.contains(symbol, *t) !coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index) && token_conflict_map.does_conflict(symbol.index, t.index)
}) { }) {
debug!( info!(
"error recovery - exclude token {} because of conflict with {}", "error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
); );
continue; continue;
} }
} }
debug!( info!(
"error recovery - include token {}", "error recovery - include token {}",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
@ -263,7 +191,7 @@ fn populate_used_symbols(
// ensure that a subtree's symbol can be successfully reassigned to the word token // ensure that a subtree's symbol can be successfully reassigned to the word token
// without having to move the subtree to the heap. // without having to move the subtree to the heap.
// See https://github.com/tree-sitter/tree-sitter/issues/258 // See https://github.com/tree-sitter/tree-sitter/issues/258
if syntax_grammar.word_token.is_some_and(|t| t.index == i) { if syntax_grammar.word_token.map_or(false, |t| t.index == i) {
parse_table.symbols.insert(1, Symbol::terminal(i)); parse_table.symbols.insert(1, Symbol::terminal(i));
} else { } else {
parse_table.symbols.push(Symbol::terminal(i)); parse_table.symbols.push(Symbol::terminal(i));
@ -335,7 +263,7 @@ fn identify_keywords(
// First find all of the candidate keyword tokens: tokens that start with // First find all of the candidate keyword tokens: tokens that start with
// letters or underscore and can match the same string as a word token. // letters or underscore and can match the same string as a word token.
let keyword_candidates = lexical_grammar let keyword_candidates: TokenSet = lexical_grammar
.variables .variables
.iter() .iter()
.enumerate() .enumerate()
@ -345,7 +273,7 @@ fn identify_keywords(
&& token_conflict_map.does_match_same_string(i, word_token.index) && token_conflict_map.does_match_same_string(i, word_token.index)
&& !token_conflict_map.does_match_different_string(i, word_token.index) && !token_conflict_map.does_match_different_string(i, word_token.index)
{ {
debug!( info!(
"Keywords - add candidate {}", "Keywords - add candidate {}",
lexical_grammar.variables[i].name lexical_grammar.variables[i].name
); );
@ -354,17 +282,17 @@ fn identify_keywords(
None None
} }
}) })
.collect::<TokenSet>(); .collect();
// Exclude keyword candidates that shadow another keyword candidate. // Exclude keyword candidates that shadow another keyword candidate.
let keywords = keyword_candidates let keywords: TokenSet = keyword_candidates
.iter() .iter()
.filter(|token| { .filter(|token| {
for other_token in keyword_candidates.iter() { for other_token in keyword_candidates.iter() {
if other_token != *token if other_token != *token
&& token_conflict_map.does_match_same_string(other_token.index, token.index) && token_conflict_map.does_match_same_string(other_token.index, token.index)
{ {
debug!( info!(
"Keywords - exclude {} because it matches the same string as {}", "Keywords - exclude {} because it matches the same string as {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_token.index].name lexical_grammar.variables[other_token.index].name
@ -374,7 +302,7 @@ fn identify_keywords(
} }
true true
}) })
.collect::<TokenSet>(); .collect();
// Exclude keyword candidates for which substituting the keyword capture // Exclude keyword candidates for which substituting the keyword capture
// token would introduce new lexical conflicts with other tokens. // token would introduce new lexical conflicts with other tokens.
@ -406,7 +334,7 @@ fn identify_keywords(
word_token.index, word_token.index,
other_index, other_index,
) { ) {
debug!( info!(
"Keywords - exclude {} because of conflict with {}", "Keywords - exclude {} because of conflict with {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_index].name lexical_grammar.variables[other_index].name
@ -415,7 +343,7 @@ fn identify_keywords(
} }
} }
debug!( info!(
"Keywords - include {}", "Keywords - include {}",
lexical_grammar.variables[token.index].name, lexical_grammar.variables[token.index].name,
); );
@ -469,9 +397,9 @@ fn report_state_info<'a>(
for (i, state) in parse_table.states.iter().enumerate() { for (i, state) in parse_table.states.iter().enumerate() {
all_state_indices.insert(i); all_state_indices.insert(i);
let item_set = &parse_state_info[state.id]; let item_set = &parse_state_info[state.id];
for entry in &item_set.1.entries { for (item, _) in &item_set.1.entries {
if !entry.item.is_augmented() { if !item.is_augmented() {
symbols_with_state_indices[entry.item.variable_index as usize] symbols_with_state_indices[item.variable_index as usize]
.1 .1
.insert(i); .insert(i);
} }
@ -487,14 +415,14 @@ fn report_state_info<'a>(
.max() .max()
.unwrap(); .unwrap();
for (symbol, states) in &symbols_with_state_indices { for (symbol, states) in &symbols_with_state_indices {
info!( eprintln!(
"{:width$}\t{}", "{:width$}\t{}",
syntax_grammar.variables[symbol.index].name, syntax_grammar.variables[symbol.index].name,
states.len(), states.len(),
width = max_symbol_name_length width = max_symbol_name_length
); );
} }
info!(""); eprintln!();
let state_indices = if report_symbol_name == "*" { let state_indices = if report_symbol_name == "*" {
Some(&all_state_indices) Some(&all_state_indices)
@ -517,27 +445,22 @@ fn report_state_info<'a>(
for state_index in state_indices { for state_index in state_indices {
let id = parse_table.states[state_index].id; let id = parse_table.states[state_index].id;
let (preceding_symbols, item_set) = &parse_state_info[id]; let (preceding_symbols, item_set) = &parse_state_info[id];
info!("state index: {state_index}"); eprintln!("state index: {state_index}");
info!("state id: {id}"); eprintln!("state id: {id}");
info!( eprint!("symbol sequence:");
"symbol sequence: {}", for symbol in preceding_symbols {
preceding_symbols let name = if symbol.is_terminal() {
.iter() &lexical_grammar.variables[symbol.index].name
.map(|symbol| { } else if symbol.is_external() {
if symbol.is_terminal() { &syntax_grammar.external_tokens[symbol.index].name
lexical_grammar.variables[symbol.index].name.clone() } else {
} else if symbol.is_external() { &syntax_grammar.variables[symbol.index].name
syntax_grammar.external_tokens[symbol.index].name.clone() };
} else { eprint!(" {name}");
syntax_grammar.variables[symbol.index].name.clone() }
} eprintln!(
})
.collect::<Vec<_>>()
.join(" ")
);
info!(
"\nitems:\n{}", "\nitems:\n{}",
item::ParseItemSetDisplay(item_set, syntax_grammar, lexical_grammar), self::item::ParseItemSetDisplay(item_set, syntax_grammar, lexical_grammar,),
); );
} }
} }

View file

@ -1,11 +1,10 @@
use std::{cmp::Ordering, collections::HashSet, fmt}; use crate::generate::build_tables::item::TokenSetDisplay;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::{ use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
build_tables::item::TokenSetDisplay, use crate::generate::rules::TokenSet;
grammars::{LexicalGrammar, SyntaxGrammar}, use std::cmp::Ordering;
nfa::{CharacterSet, NfaCursor, NfaTransition}, use std::collections::HashSet;
rules::TokenSet, use std::fmt;
};
#[derive(Clone, Debug, Default, PartialEq, Eq)] #[derive(Clone, Debug, Default, PartialEq, Eq)]
struct TokenConflictStatus { struct TokenConflictStatus {
@ -28,7 +27,7 @@ pub struct TokenConflictMap<'a> {
impl<'a> TokenConflictMap<'a> { impl<'a> TokenConflictMap<'a> {
/// Create a token conflict map based on a lexical grammar, which describes the structure /// Create a token conflict map based on a lexical grammar, which describes the structure
/// of each token, and a `following_token` map, which indicates which tokens may be appear /// each token, and a `following_token` map, which indicates which tokens may be appear
/// immediately after each other token. /// immediately after each other token.
/// ///
/// This analyzes the possible kinds of overlap between each pair of tokens and stores /// This analyzes the possible kinds of overlap between each pair of tokens and stores
@ -145,7 +144,7 @@ impl<'a> TokenConflictMap<'a> {
} }
} }
impl fmt::Debug for TokenConflictMap<'_> { impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "TokenConflictMap {{")?; writeln!(f, "TokenConflictMap {{")?;
@ -373,11 +372,9 @@ fn compute_conflict_status(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::grammars::{Variable, VariableType};
grammars::{Variable, VariableType}, use crate::generate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
prepare_grammar::{expand_tokens, ExtractedLexicalGrammar}, use crate::generate::rules::{Precedence, Rule, Symbol};
rules::{Precedence, Rule, Symbol},
};
#[test] #[test]
fn test_starting_characters() { fn test_starting_characters() {

View file

@ -0,0 +1,133 @@
use std::ops::Range;
/// A set of characters represented as a balanced binary tree of comparisons.
/// This is used as an intermediate step in generating efficient code for
/// matching a given character set.
#[derive(PartialEq, Eq)]
pub enum CharacterTree {
Yes,
Compare {
value: char,
operator: Comparator,
consequence: Option<Box<CharacterTree>>,
alternative: Option<Box<CharacterTree>>,
},
}
#[derive(PartialEq, Eq)]
pub enum Comparator {
Less,
LessOrEqual,
Equal,
GreaterOrEqual,
}
impl CharacterTree {
pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
match ranges.len() {
0 => None,
1 => {
let range = &ranges[0];
if range.start == range.end {
Some(Self::Compare {
operator: Comparator::Equal,
value: range.start,
consequence: Some(Box::new(Self::Yes)),
alternative: None,
})
} else {
Some(Self::Compare {
operator: Comparator::GreaterOrEqual,
value: range.start,
consequence: Some(Box::new(Self::Compare {
operator: Comparator::LessOrEqual,
value: range.end,
consequence: Some(Box::new(Self::Yes)),
alternative: None,
})),
alternative: None,
})
}
}
len => {
let mid = len / 2;
let mid_range = &ranges[mid];
Some(Self::Compare {
operator: Comparator::Less,
value: mid_range.start,
consequence: Self::from_ranges(&ranges[0..mid]).map(Box::new),
alternative: Some(Box::new(Self::Compare {
operator: Comparator::LessOrEqual,
value: mid_range.end,
consequence: Some(Box::new(Self::Yes)),
alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new),
})),
})
}
}
}
#[cfg(test)]
fn contains(&self, c: char) -> bool {
match self {
Self::Yes => true,
Self::Compare {
value,
operator,
alternative,
consequence,
} => {
let condition = match operator {
Comparator::Less => c < *value,
Comparator::LessOrEqual => c <= *value,
Comparator::Equal => c == *value,
Comparator::GreaterOrEqual => c >= *value,
};
if condition { consequence } else { alternative }
.as_ref()
.map_or(false, |a| a.contains(c))
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_character_tree_simple() {
let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
.unwrap();
assert!(tree.contains('a'));
assert!(tree.contains('b'));
assert!(tree.contains('c'));
assert!(tree.contains('d'));
assert!(!tree.contains('e'));
assert!(!tree.contains('f'));
assert!(!tree.contains('g'));
assert!(tree.contains('h'));
assert!(tree.contains('i'));
assert!(tree.contains('j'));
assert!(tree.contains('k'));
assert!(tree.contains('l'));
assert!(!tree.contains('m'));
assert!(!tree.contains('n'));
assert!(!tree.contains('o'));
assert!(tree.contains('p'));
assert!(tree.contains('q'));
assert!(tree.contains('r'));
assert!(!tree.contains('s'));
assert!(!tree.contains('s'));
assert!(tree.contains('u'));
assert!(!tree.contains('v'));
}
}

View file

@ -3,7 +3,7 @@ pub fn split_state_id_groups<S>(
state_ids_by_group_id: &mut Vec<Vec<usize>>, state_ids_by_group_id: &mut Vec<Vec<usize>>,
group_ids_by_state_id: &mut [usize], group_ids_by_state_id: &mut [usize],
start_group_id: usize, start_group_id: usize,
mut should_split: impl FnMut(&S, &S, &[usize]) -> bool, mut f: impl FnMut(&S, &S, &[usize]) -> bool,
) -> bool { ) -> bool {
let mut result = false; let mut result = false;
@ -33,7 +33,7 @@ pub fn split_state_id_groups<S>(
} }
let right_state = &states[right_state_id]; let right_state = &states[right_state_id];
if should_split(left_state, right_state, group_ids_by_state_id) { if f(left_state, right_state, group_ids_by_state_id) {
split_state_ids.push(right_state_id); split_state_ids.push(right_state_id);
} }

View file

@ -16,7 +16,6 @@ function alias(rule, value) {
result.value = value.symbol.name; result.value = value.symbol.name;
return result; return result;
case Object: case Object:
case GrammarSymbol:
if (typeof value.type === 'string' && value.type === 'SYMBOL') { if (typeof value.type === 'string' && value.type === 'SYMBOL') {
result.named = true; result.named = true;
result.value = value.name; result.value = value.name;
@ -24,7 +23,7 @@ function alias(rule, value) {
} }
} }
throw new Error(`Invalid alias value ${value}`); throw new Error('Invalid alias value ' + value);
} }
function blank() { function blank() {
@ -36,7 +35,7 @@ function blank() {
function field(name, rule) { function field(name, rule) {
return { return {
type: "FIELD", type: "FIELD",
name, name: name,
content: normalize(rule) content: normalize(rule)
} }
} }
@ -49,14 +48,13 @@ function choice(...elements) {
} }
function optional(value) { function optional(value) {
checkArguments(arguments, arguments.length, optional, 'optional'); checkArguments(arguments.length, optional, 'optional');
return choice(value, blank()); return choice(value, blank());
} }
function prec(number, rule) { function prec(number, rule) {
checkPrecedence(number); checkPrecedence(number);
checkArguments( checkArguments(
arguments,
arguments.length - 1, arguments.length - 1,
prec, prec,
'prec', 'prec',
@ -70,7 +68,7 @@ function prec(number, rule) {
}; };
} }
prec.left = function (number, rule) { prec.left = function(number, rule) {
if (rule == null) { if (rule == null) {
rule = number; rule = number;
number = 0; number = 0;
@ -78,7 +76,6 @@ prec.left = function (number, rule) {
checkPrecedence(number); checkPrecedence(number);
checkArguments( checkArguments(
arguments,
arguments.length - 1, arguments.length - 1,
prec.left, prec.left,
'prec.left', 'prec.left',
@ -92,7 +89,7 @@ prec.left = function (number, rule) {
}; };
} }
prec.right = function (number, rule) { prec.right = function(number, rule) {
if (rule == null) { if (rule == null) {
rule = number; rule = number;
number = 0; number = 0;
@ -100,7 +97,6 @@ prec.right = function (number, rule) {
checkPrecedence(number); checkPrecedence(number);
checkArguments( checkArguments(
arguments,
arguments.length - 1, arguments.length - 1,
prec.right, prec.right,
'prec.right', 'prec.right',
@ -114,10 +110,9 @@ prec.right = function (number, rule) {
}; };
} }
prec.dynamic = function (number, rule) { prec.dynamic = function(number, rule) {
checkPrecedence(number); checkPrecedence(number);
checkArguments( checkArguments(
arguments,
arguments.length - 1, arguments.length - 1,
prec.dynamic, prec.dynamic,
'prec.dynamic', 'prec.dynamic',
@ -132,7 +127,7 @@ prec.dynamic = function (number, rule) {
} }
function repeat(rule) { function repeat(rule) {
checkArguments(arguments, arguments.length, repeat, 'repeat'); checkArguments(arguments.length, repeat, 'repeat');
return { return {
type: "REPEAT", type: "REPEAT",
content: normalize(rule) content: normalize(rule)
@ -140,7 +135,7 @@ function repeat(rule) {
} }
function repeat1(rule) { function repeat1(rule) {
checkArguments(arguments, arguments.length, repeat1, 'repeat1'); checkArguments(arguments.length, repeat1, 'repeat1');
return { return {
type: "REPEAT1", type: "REPEAT1",
content: normalize(rule) content: normalize(rule)
@ -154,38 +149,21 @@ function seq(...elements) {
}; };
} }
class GrammarSymbol {
constructor(name) {
this.type = "SYMBOL";
this.name = name;
}
}
function reserved(wordset, rule) {
if (typeof wordset !== 'string') {
throw new Error('Invalid reserved word set name: ' + wordset)
}
return {
type: "RESERVED",
content: normalize(rule),
context_name: wordset,
}
}
function sym(name) { function sym(name) {
return new GrammarSymbol(name); return {
type: "SYMBOL",
name: name
};
} }
function token(value) { function token(value) {
checkArguments(arguments, arguments.length, token, 'token', '', 'literal');
return { return {
type: "TOKEN", type: "TOKEN",
content: normalize(value) content: normalize(value)
}; };
} }
token.immediate = function (value) { token.immediate = function(value) {
checkArguments(arguments, arguments.length, token.immediate, 'token.immediate', '', 'literal');
return { return {
type: "IMMEDIATE_TOKEN", type: "IMMEDIATE_TOKEN",
content: normalize(value) content: normalize(value)
@ -211,28 +189,23 @@ function normalize(value) {
type: 'PATTERN', type: 'PATTERN',
value: value.source value: value.source
}; };
case RustRegex:
return {
type: 'PATTERN',
value: value.value
};
case ReferenceError: case ReferenceError:
throw value throw value
default: default:
if (typeof value.type === 'string') { if (typeof value.type === 'string') {
return value; return value;
} else { } else {
throw new TypeError(`Invalid rule: ${value}`); throw new TypeError("Invalid rule: " + value.toString());
} }
} }
} }
function RuleBuilder(ruleMap) { function RuleBuilder(ruleMap) {
return new Proxy({}, { return new Proxy({}, {
get(_, propertyName) { get(target, propertyName) {
const symbol = sym(propertyName); const symbol = sym(propertyName);
if (!ruleMap || Object.prototype.hasOwnProperty.call(ruleMap, propertyName)) { if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
return symbol; return symbol;
} else { } else {
const error = new ReferenceError(`Undefined symbol '${propertyName}'`); const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
@ -244,8 +217,6 @@ function RuleBuilder(ruleMap) {
} }
function grammar(baseGrammar, options) { function grammar(baseGrammar, options) {
let inherits = undefined;
if (!options) { if (!options) {
options = baseGrammar; options = baseGrammar;
baseGrammar = { baseGrammar = {
@ -257,11 +228,9 @@ function grammar(baseGrammar, options) {
inline: [], inline: [],
supertypes: [], supertypes: [],
precedences: [], precedences: [],
reserved: {},
}; };
} else { } else {
baseGrammar = baseGrammar.grammar; baseGrammar = baseGrammar.grammar;
inherits = baseGrammar.name;
} }
let externals = baseGrammar.externals; let externals = baseGrammar.externals;
@ -281,10 +250,10 @@ function grammar(baseGrammar, options) {
} }
const ruleMap = {}; const ruleMap = {};
for (const key of Object.keys(options.rules)) { for (const key in options.rules) {
ruleMap[key] = true; ruleMap[key] = true;
} }
for (const key of Object.keys(baseGrammar.rules)) { for (const key in baseGrammar.rules) {
ruleMap[key] = true; ruleMap[key] = true;
} }
for (const external of externals) { for (const external of externals) {
@ -304,52 +273,18 @@ function grammar(baseGrammar, options) {
throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters."); throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
} }
if (inherits && typeof inherits !== "string") { let rules = Object.assign({}, baseGrammar.rules);
throw new Error("Base grammar's 'name' property must be a string.");
}
if (inherits && !/^[a-zA-Z_]\w*$/.test(name)) {
throw new Error("Base grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
}
const rules = Object.assign({}, baseGrammar.rules);
if (options.rules) { if (options.rules) {
if (typeof options.rules !== "object") { if (typeof options.rules !== "object") {
throw new Error("Grammar's 'rules' property must be an object."); throw new Error("Grammar's 'rules' property must be an object.");
} }
for (const ruleName of Object.keys(options.rules)) { for (const ruleName in options.rules) {
const ruleFn = options.rules[ruleName]; const ruleFn = options.rules[ruleName];
if (typeof ruleFn !== "function") { if (typeof ruleFn !== "function") {
throw new Error(`Grammar rules must all be functions. '${ruleName}' rule is not.`); throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
} }
const rule = ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]); rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
if (rule === undefined) {
throw new Error(`Rule '${ruleName}' returned undefined.`);
}
rules[ruleName] = normalize(rule);
}
}
let reserved = baseGrammar.reserved;
if (options.reserved) {
if (typeof options.reserved !== "object") {
throw new Error("Grammar's 'reserved' property must be an object.");
}
for (const reservedWordSetName of Object.keys(options.reserved)) {
const reservedWordSetFn = options.reserved[reservedWordSetName]
if (typeof reservedWordSetFn !== "function") {
throw new Error(`Grammar reserved word sets must all be functions. '${reservedWordSetName}' is not.`);
}
const reservedTokens = reservedWordSetFn.call(ruleBuilder, ruleBuilder, baseGrammar.reserved[reservedWordSetName]);
if (!Array.isArray(reservedTokens)) {
throw new Error(`Grammar's reserved word set functions must all return arrays of rules. '${reservedWordSetName}' does not.`);
}
reserved[reservedWordSetName] = reservedTokens.map(normalize);
} }
} }
@ -442,12 +377,7 @@ function grammar(baseGrammar, options) {
throw new Error("Grammar's supertypes must be an array of rules."); throw new Error("Grammar's supertypes must be an array of rules.");
} }
supertypes = supertypeRules.map(symbol => { supertypes = supertypeRules.map(symbol => symbol.name);
if (symbol.name === 'ReferenceError') {
throw new Error(`Supertype rule \`${symbol.symbol.name}\` is not defined.`);
}
return symbol.name;
});
} }
let precedences = baseGrammar.precedences; let precedences = baseGrammar.precedences;
@ -467,43 +397,18 @@ function grammar(baseGrammar, options) {
}); });
} }
if (Object.keys(rules).length === 0) { if (Object.keys(rules).length == 0) {
throw new Error("Grammar must have at least one rule."); throw new Error("Grammar must have at least one rule.");
} }
return { return { grammar: { name, word, rules, extras, conflicts, precedences, externals, inline, supertypes } };
grammar: {
name,
inherits,
word,
rules,
extras,
conflicts,
precedences,
externals,
inline,
supertypes,
reserved,
},
};
} }
class RustRegex { function checkArguments(ruleCount, caller, callerName, suffix = '') {
constructor(value) { if (ruleCount > 1) {
this.value = value;
}
}
function checkArguments(args, ruleCount, caller, callerName, suffix = '', argType = 'rule') {
// Allow for .map() usage where additional arguments are index and the entire array.
const isMapCall = ruleCount === 3 && typeof args[1] === 'number' && Array.isArray(args[2]);
if (isMapCall) {
ruleCount = typeof args[2] === 'number' ? 1 : args[2].length;
}
if (ruleCount > 1 && !isMapCall) {
const error = new Error([ const error = new Error([
`The \`${callerName}\` function only takes one ${argType} argument${suffix}.`, `The \`${callerName}\` function only takes one rule argument${suffix}.`,
`You passed in multiple ${argType}s. Did you mean to call \`seq\`?\n` 'You passed multiple rules. Did you mean to call `seq`?\n'
].join('\n')); ].join('\n'));
Error.captureStackTrace(error, caller); Error.captureStackTrace(error, caller);
throw error throw error
@ -516,48 +421,18 @@ function checkPrecedence(value) {
} }
} }
function getEnv(name) { global.alias = alias;
if (globalThis.native) return globalThis.__ts_grammar_path; global.blank = blank;
if (globalThis.process) return process.env[name]; // Node/Bun global.choice = choice;
if (globalThis.Deno) return Deno.env.get(name); // Deno global.optional = optional;
throw Error("Unsupported JS runtime"); global.prec = prec;
} global.repeat = repeat;
global.repeat1 = repeat1;
global.seq = seq;
global.sym = sym;
global.token = token;
global.grammar = grammar;
global.field = field;
globalThis.alias = alias; const result = require(process.env.TREE_SITTER_GRAMMAR_PATH);
globalThis.blank = blank; process.stdout.write(JSON.stringify(result.grammar, null, null));
globalThis.choice = choice;
globalThis.optional = optional;
globalThis.prec = prec;
globalThis.repeat = repeat;
globalThis.repeat1 = repeat1;
globalThis.reserved = reserved;
globalThis.seq = seq;
globalThis.sym = sym;
globalThis.token = token;
globalThis.grammar = grammar;
globalThis.field = field;
globalThis.RustRegex = RustRegex;
const grammarPath = getEnv("TREE_SITTER_GRAMMAR_PATH");
let result = await import(grammarPath);
let grammarObj = result.default?.grammar ?? result.grammar;
if (globalThis.native && !grammarObj) {
grammarObj = module.exports.grammar;
}
const object = {
"$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
...grammarObj,
};
const output = JSON.stringify(object);
if (globalThis.native) {
globalThis.output = output;
} else if (globalThis.process) { // Node/Bun
process.stdout.write(output);
} else if (globalThis.Deno) { // Deno
Deno.stdout.writeSync(new TextEncoder().encode(output));
} else {
throw Error("Unsupported JS runtime");
}

View file

@ -1,6 +1,6 @@
{ {
"$schema": "http://json-schema.org/draft-07/schema#", "$schema": "http://json-schema.org/draft-07/schema#",
"title": "Tree-sitter grammar specification", "title": "tree-sitter grammar specification",
"type": "object", "type": "object",
"required": ["name", "rules"], "required": ["name", "rules"],
@ -8,18 +8,8 @@
"additionalProperties": false, "additionalProperties": false,
"properties": { "properties": {
"$schema": {
"type": "string"
},
"name": { "name": {
"description": "The name of the grammar", "description": "the name of the grammar",
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
"inherits": {
"description": "The name of the parent grammar",
"type": "string", "type": "string",
"pattern": "^[a-zA-Z_]\\w*" "pattern": "^[a-zA-Z_]\\w*"
}, },
@ -36,7 +26,6 @@
"extras": { "extras": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
} }
@ -44,36 +33,16 @@
"precedences": { "precedences": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"oneOf": [ "$ref": "#/definitions/rule"
{ "type": "string" },
{ "$ref": "#/definitions/symbol-rule" }
]
} }
} }
}, },
"reserved": {
"type": "object",
"patternProperties": {
"^[a-zA-Z_]\\w*$": {
"type": "array",
"uniqueItems": true,
"items": {
"$ref": "#/definitions/rule"
}
}
},
"additionalProperties": false
},
"externals": { "externals": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
} }
@ -81,7 +50,6 @@
"inline": { "inline": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"type": "string", "type": "string",
"pattern": "^[a-zA-Z_]\\w*$" "pattern": "^[a-zA-Z_]\\w*$"
@ -90,10 +58,8 @@
"conflicts": { "conflicts": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"type": "string", "type": "string",
"pattern": "^[a-zA-Z_]\\w*$" "pattern": "^[a-zA-Z_]\\w*$"
@ -107,11 +73,10 @@
}, },
"supertypes": { "supertypes": {
"description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.", "description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types.",
"type": "array", "type": "array",
"uniqueItems": true,
"items": { "items": {
"description": "The name of a rule in `rules` or `extras`", "description": "the name of a rule in `rules` or `extras`",
"type": "string" "type": "string"
} }
} }
@ -123,7 +88,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "BLANK" "pattern": "^BLANK$"
} }
}, },
"required": ["type"] "required": ["type"]
@ -134,7 +99,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "STRING" "pattern": "^STRING$"
}, },
"value": { "value": {
"type": "string" "type": "string"
@ -148,10 +113,9 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "PATTERN" "pattern": "^PATTERN$"
}, },
"value": { "type": "string" }, "value": { "type": "string" }
"flags": { "type": "string" }
}, },
"required": ["type", "value"] "required": ["type", "value"]
}, },
@ -161,7 +125,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "SYMBOL" "pattern": "^SYMBOL$"
}, },
"name": { "type": "string" } "name": { "type": "string" }
}, },
@ -173,7 +137,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "SEQ" "pattern": "^SEQ$"
}, },
"members": { "members": {
"type": "array", "type": "array",
@ -190,7 +154,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "CHOICE" "pattern": "^CHOICE$"
}, },
"members": { "members": {
"type": "array", "type": "array",
@ -207,10 +171,14 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "ALIAS" "pattern": "^ALIAS$"
},
"value": {
"type": "string"
},
"named": {
"type": "boolean"
}, },
"value": { "type": "string" },
"named": { "type": "boolean" },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
} }
@ -223,7 +191,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "REPEAT" "pattern": "^REPEAT$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -237,7 +205,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"const": "REPEAT1" "pattern": "^REPEAT1$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -246,30 +214,12 @@
"required": ["type", "content"] "required": ["type", "content"]
}, },
"reserved-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "RESERVED"
},
"context_name": { "type": "string" },
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "context_name", "content"]
},
"token-rule": { "token-rule": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"enum": [ "pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
"TOKEN",
"IMMEDIATE_TOKEN"
]
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -283,7 +233,7 @@
"name": { "type": "string" }, "name": { "type": "string" },
"type": { "type": {
"type": "string", "type": "string",
"const": "FIELD" "pattern": "^FIELD$"
}, },
"content": { "content": {
"$ref": "#/definitions/rule" "$ref": "#/definitions/rule"
@ -297,12 +247,7 @@
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"enum": [ "pattern": "^(PREC|PREC_LEFT|PREC_RIGHT|PREC_DYNAMIC)$"
"PREC",
"PREC_LEFT",
"PREC_RIGHT",
"PREC_DYNAMIC"
]
}, },
"value": { "value": {
"oneof": [ "oneof": [
@ -328,7 +273,6 @@
{ "$ref": "#/definitions/choice-rule" }, { "$ref": "#/definitions/choice-rule" },
{ "$ref": "#/definitions/repeat1-rule" }, { "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" }, { "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/reserved-rule" },
{ "$ref": "#/definitions/token-rule" }, { "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/field-rule" }, { "$ref": "#/definitions/field-rule" },
{ "$ref": "#/definitions/prec-rule" } { "$ref": "#/definitions/prec-rule" }

View file

@ -1,9 +1,7 @@
use std::{collections::HashMap, fmt}; use super::nfa::Nfa;
use super::rules::{Alias, Associativity, Precedence, Rule, Symbol};
use super::{ use std::collections::HashMap;
nfa::Nfa, use std::fmt;
rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet},
};
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum VariableType { pub enum VariableType {
@ -39,13 +37,6 @@ pub struct InputGrammar {
pub variables_to_inline: Vec<String>, pub variables_to_inline: Vec<String>,
pub supertype_symbols: Vec<String>, pub supertype_symbols: Vec<String>,
pub word_token: Option<String>, pub word_token: Option<String>,
pub reserved_words: Vec<ReservedWordContext<Rule>>,
}
#[derive(Debug, Default, PartialEq, Eq)]
pub struct ReservedWordContext<T> {
pub name: String,
pub reserved_words: Vec<T>,
} }
// Extracted lexical grammar // Extracted lexical grammar
@ -73,20 +64,8 @@ pub struct ProductionStep {
pub associativity: Option<Associativity>, pub associativity: Option<Associativity>,
pub alias: Option<Alias>, pub alias: Option<Alias>,
pub field_name: Option<String>, pub field_name: Option<String>,
pub reserved_word_set_id: ReservedWordSetId,
} }
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReservedWordSetId(pub usize);
impl fmt::Display for ReservedWordSetId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
pub const NO_RESERVED_WORDS: ReservedWordSetId = ReservedWordSetId(usize::MAX);
#[derive(Clone, Debug, Default, PartialEq, Eq)] #[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Production { pub struct Production {
pub steps: Vec<ProductionStep>, pub steps: Vec<ProductionStep>,
@ -123,44 +102,50 @@ pub struct SyntaxGrammar {
pub variables_to_inline: Vec<Symbol>, pub variables_to_inline: Vec<Symbol>,
pub word_token: Option<Symbol>, pub word_token: Option<Symbol>,
pub precedence_orderings: Vec<Vec<PrecedenceEntry>>, pub precedence_orderings: Vec<Vec<PrecedenceEntry>>,
pub reserved_word_sets: Vec<TokenSet>,
} }
#[cfg(test)] #[cfg(test)]
impl ProductionStep { impl ProductionStep {
#[must_use] pub const fn new(symbol: Symbol) -> Self {
pub fn new(symbol: Symbol) -> Self {
Self { Self {
symbol, symbol,
precedence: Precedence::None, precedence: Precedence::None,
associativity: None, associativity: None,
alias: None, alias: None,
field_name: None, field_name: None,
reserved_word_set_id: ReservedWordSetId::default(),
} }
} }
pub fn with_prec( pub fn with_prec(self, precedence: Precedence, associativity: Option<Associativity>) -> Self {
mut self, Self {
precedence: Precedence, symbol: self.symbol,
associativity: Option<Associativity>, precedence,
) -> Self { associativity,
self.precedence = precedence; alias: self.alias,
self.associativity = associativity; field_name: self.field_name,
self }
} }
pub fn with_alias(mut self, value: &str, is_named: bool) -> Self { pub fn with_alias(self, value: &str, is_named: bool) -> Self {
self.alias = Some(Alias { Self {
value: value.to_string(), symbol: self.symbol,
is_named, precedence: self.precedence,
}); associativity: self.associativity,
self alias: Some(Alias {
value: value.to_string(),
is_named,
}),
field_name: self.field_name,
}
} }
pub fn with_field_name(self, name: &str) -> Self {
pub fn with_field_name(mut self, name: &str) -> Self { Self {
self.field_name = Some(name.to_string()); symbol: self.symbol,
self precedence: self.precedence,
associativity: self.associativity,
alias: self.alias,
field_name: Some(name.to_string()),
}
} }
} }
@ -253,7 +238,7 @@ impl InlinedProductionMap {
step_index: u32, step_index: u32,
) -> Option<impl Iterator<Item = &'a Production> + 'a> { ) -> Option<impl Iterator<Item = &'a Production> + 'a> {
self.production_map self.production_map
.get(&(std::ptr::from_ref::<Production>(production), step_index)) .get(&(production as *const Production, step_index))
.map(|production_indices| { .map(|production_indices| {
production_indices production_indices
.iter() .iter()

291
cli/src/generate/mod.rs Normal file
View file

@ -0,0 +1,291 @@
mod binding_files;
mod build_tables;
mod char_tree;
mod dedup;
mod grammars;
mod nfa;
mod node_types;
pub mod parse_grammar;
mod prepare_grammar;
mod render;
mod rules;
mod tables;
use std::io::Write;
use std::path::Path;
use std::process::{Command, Stdio};
use std::{env, fs};
use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use semver::Version;
use serde::Deserialize;
use self::build_tables::build_tables;
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use self::parse_grammar::parse_grammar;
use self::prepare_grammar::prepare_grammar;
use self::render::render_c_code;
use self::rules::AliasMap;
lazy_static! {
static ref JSON_COMMENT_REGEX: Regex = RegexBuilder::new("^\\s*//.*")
.multi_line(true)
.build()
.unwrap();
}
struct GeneratedParser {
c_code: String,
node_types_json: String,
}
pub fn generate_parser_in_directory(
repo_path: &Path,
grammar_path: Option<&str>,
abi_version: usize,
generate_bindings: bool,
report_symbol_name: Option<&str>,
js_runtime: Option<&str>,
) -> Result<()> {
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
// Read the grammar.json.
let grammar_json = if let Some(path) = grammar_path {
load_grammar_file(path.as_ref(), js_runtime)?
} else {
let grammar_js_path =
grammar_path.map_or(repo_path.join("grammar.js"), std::convert::Into::into);
load_grammar_file(&grammar_js_path, js_runtime)?
};
// Ensure that the output directories exist.
fs::create_dir_all(&src_path)?;
fs::create_dir_all(&header_path)?;
if grammar_path.is_none() {
fs::write(src_path.join("grammar.json"), &grammar_json)
.with_context(|| format!("Failed to write grammar.json to {src_path:?}"))?;
}
// Parse and preprocess the grammar.
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let language_name = input_grammar.name;
let language_semver = read_package_json_version()?;
let rust_binding_version = read_rust_binding_version()?;
if language_semver != rust_binding_version {
anyhow::bail!(
"Error:
The version of your language grammar in `package.json` is `{language_semver}`, but the version of your language grammar in `Cargo.toml` is `{rust_binding_version}`.
These versions must match. Please adjust one of these files to match the other, and then try running `tree-sitter generate` again.
Consider delegating this process to the `release` subcommand, which will handle git tags, GitHub releases, and publishing to crates.io, npmjs, and PyPI for you.
Read more here: https://tree-sitter.github.io/tree-sitter/creating-parsers#releasing-a-new-grammar-version",
);
}
// Generate the parser and related files.
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
&inlines,
simple_aliases,
abi_version,
report_symbol_name,
(
language_semver.major as u8,
language_semver.minor as u8,
language_semver.patch as u8,
),
)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
if generate_bindings {
binding_files::generate_binding_files(repo_path, &language_name)?;
}
Ok(())
}
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let parser = generate_parser_for_grammar_with_opts(
&input_grammar.name,
syntax_grammar,
lexical_grammar,
&inlines,
simple_aliases,
tree_sitter::LANGUAGE_VERSION,
None,
(0, 0, 0),
)?;
Ok((input_grammar.name, parser.c_code))
}
fn generate_parser_for_grammar_with_opts(
name: &str,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
inlines: &InlinedProductionMap,
simple_aliases: AliasMap,
abi_version: usize,
report_symbol_name: Option<&str>,
semantic_version: (u8, u8, u8),
) -> Result<GeneratedParser> {
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
);
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
inlines,
report_symbol_name,
)?;
let c_code = render_c_code(
name,
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
syntax_grammar,
lexical_grammar,
simple_aliases,
abi_version,
semantic_version,
);
Ok(GeneratedParser {
c_code,
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
})
}
fn read_package_json_version() -> Result<Version> {
#[derive(Deserialize)]
struct PackageJSON {
version: String,
}
let path = "package.json";
let text = fs::read_to_string(path).with_context(|| format!("Failed to read {path:?}"))?;
let package_json: PackageJSON =
serde_json::from_str(&text).with_context(|| format!("Failed to parse {path:?} as JSON"))?;
Ok(Version::parse(&package_json.version)?)
}
fn read_rust_binding_version() -> Result<Version> {
let path = "Cargo.toml";
let text = fs::read_to_string(path)?;
let cargo_toml = toml::from_str::<toml::Value>(text.as_ref())?;
Ok(Version::parse(
cargo_toml["package"]["version"].as_str().unwrap(),
)?)
}
pub fn load_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
if grammar_path.is_dir() {
return Err(anyhow!(
"Path to a grammar file with `.js` or `.json` extension is required"
));
}
match grammar_path.extension().and_then(|e| e.to_str()) {
Some("js") => Ok(load_js_grammar_file(grammar_path, js_runtime)
.with_context(|| "Failed to load grammar.js")?),
Some("json") => {
Ok(fs::read_to_string(grammar_path).with_context(|| "Failed to load grammar.json")?)
}
_ => Err(anyhow!("Unknown grammar file extension: {grammar_path:?}",)),
}
}
fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
let grammar_path = fs::canonicalize(grammar_path)?;
let js_runtime = js_runtime.unwrap_or("node");
let mut node_process = Command::new(js_runtime)
.env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.with_context(|| format!("Failed to run `{js_runtime}`"))?;
let mut node_stdin = node_process
.stdin
.take()
.with_context(|| "Failed to open stdin for node")?;
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
.with_context(|| "Could not parse this package's version as semver.")?;
write!(
node_stdin,
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
global.TREE_SITTER_CLI_VERSION_MINOR = {};
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
cli_version.major, cli_version.minor, cli_version.patch,
)
.with_context(|| "Failed to write tree-sitter version to node's stdin")?;
let javascript_code = include_bytes!("./dsl.js");
node_stdin
.write(javascript_code)
.with_context(|| "Failed to write grammar dsl to node's stdin")?;
drop(node_stdin);
let output = node_process
.wait_with_output()
.with_context(|| "Failed to read output from node")?;
match output.status.code() {
None => panic!("Node process was killed"),
Some(0) => {
let stdout =
String::from_utf8(output.stdout).with_context(|| "Got invalid UTF8 from node")?;
let mut grammar_json = &stdout[..];
if let Some(pos) = stdout.rfind('\n') {
// If there's a newline, split the last line from the rest of the output
let node_output = &stdout[..pos];
grammar_json = &stdout[pos + 1..];
let mut stdout = std::io::stdout().lock();
stdout.write_all(node_output.as_bytes())?;
stdout.write_all(b"\n")?;
stdout.flush()?;
}
Ok(serde_json::to_string_pretty(
&serde_json::from_str::<serde_json::Value>(grammar_json)
.with_context(|| "Failed to parse grammar JSON")?,
)
.with_context(|| "Failed to serialize grammar JSON")?
+ "\n")
}
Some(code) => Err(anyhow!("Node process exited with status {code}")),
}
}
fn write_file(path: &Path, body: impl AsRef<[u8]>) -> Result<()> {
fs::write(path, body)
.with_context(|| format!("Failed to write {:?}", path.file_name().unwrap()))
}

View file

@ -1,14 +1,13 @@
use std::{ use std::char;
char, use std::cmp::max;
cmp::{max, Ordering}, use std::cmp::Ordering;
fmt, use std::collections::HashSet;
iter::ExactSizeIterator, use std::fmt;
mem::{self, swap}, use std::mem::swap;
ops::{Range, RangeInclusive}, use std::ops::Range;
};
/// A set of characters represented as a vector of ranges. /// A set of characters represented as a vector of ranges.
#[derive(Clone, Default, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub struct CharacterSet { pub struct CharacterSet {
ranges: Vec<Range<u32>>, ranges: Vec<Range<u32>>,
} }
@ -58,8 +57,7 @@ impl CharacterSet {
/// Create a character set with a given *inclusive* range of characters. /// Create a character set with a given *inclusive* range of characters.
#[allow(clippy::single_range_in_vec_init)] #[allow(clippy::single_range_in_vec_init)]
#[cfg(test)] pub fn from_range(mut first: char, mut last: char) -> Self {
fn from_range(mut first: char, mut last: char) -> Self {
if first > last { if first > last {
swap(&mut first, &mut last); swap(&mut first, &mut last);
} }
@ -116,11 +114,6 @@ impl CharacterSet {
self self
} }
pub fn assign(&mut self, other: &Self) {
self.ranges.clear();
self.ranges.extend_from_slice(&other.ranges);
}
fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize { fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
while i < self.ranges.len() { while i < self.ranges.len() {
let range = &mut self.ranges[i]; let range = &mut self.ranges[i];
@ -287,30 +280,17 @@ impl CharacterSet {
/// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or /// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
/// `other`, but is not present in both sets. /// `other`, but is not present in both sets.
#[cfg(test)] pub fn symmetric_difference(mut self, mut other: Self) -> Self {
fn symmetric_difference(mut self, mut other: Self) -> Self {
self.remove_intersection(&mut other); self.remove_intersection(&mut other);
self.add(&other) self.add(&other)
} }
pub fn char_codes(&self) -> impl Iterator<Item = u32> + '_ { pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
self.ranges.iter().flat_map(Clone::clone) self.ranges.iter().flat_map(std::clone::Clone::clone)
} }
pub fn chars(&self) -> impl Iterator<Item = char> + '_ { pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
self.char_codes().filter_map(char::from_u32) self.iter().filter_map(char::from_u32)
}
pub fn range_count(&self) -> usize {
self.ranges.len()
}
pub fn ranges(&self) -> impl Iterator<Item = RangeInclusive<char>> + '_ {
self.ranges.iter().filter_map(|range| {
let start = range.clone().find_map(char::from_u32)?;
let end = (range.start..range.end).rev().find_map(char::from_u32)?;
Some(start..=end)
})
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
@ -319,57 +299,41 @@ impl CharacterSet {
/// Get a reduced list of character ranges, assuming that a given /// Get a reduced list of character ranges, assuming that a given
/// set of characters can be safely ignored. /// set of characters can be safely ignored.
pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self { pub fn simplify_ignoring<'a>(
let mut prev_range: Option<Range<u32>> = None; &'a self,
Self { ruled_out_characters: &'a HashSet<u32>,
ranges: self ) -> Vec<Range<char>> {
.ranges let mut prev_range: Option<Range<char>> = None;
.iter() self.chars()
.map(|range| Some(range.clone())) .map(|c| (c, false))
.chain([None]) .chain(Some(('\0', true)))
.filter_map(move |range| { .filter_map(move |(c, done)| {
if let Some(range) = &range { if done {
if ruled_out_characters.contains_codepoint_range(range.clone()) { return prev_range.clone();
return None; }
} if ruled_out_characters.contains(&(c as u32)) {
return None;
if let Some(prev_range) = &mut prev_range { }
if ruled_out_characters if let Some(range) = prev_range.clone() {
.contains_codepoint_range(prev_range.end..range.start) let mut prev_range_successor = range.end as u32 + 1;
{ while prev_range_successor < c as u32 {
prev_range.end = range.end; if !ruled_out_characters.contains(&prev_range_successor) {
return None; prev_range = Some(c..c);
} return Some(range);
} }
prev_range_successor += 1;
} }
prev_range = Some(range.start..c);
let result = prev_range.clone(); } else {
prev_range = range; prev_range = Some(c..c);
result }
}) None
.collect(), })
} .collect()
}
pub fn contains_codepoint_range(&self, seek_range: Range<u32>) -> bool {
let ix = match self.ranges.binary_search_by(|probe| {
if probe.end <= seek_range.start {
Ordering::Less
} else if probe.start > seek_range.start {
Ordering::Greater
} else {
Ordering::Equal
}
}) {
Ok(ix) | Err(ix) => ix,
};
self.ranges
.get(ix)
.is_some_and(|range| range.start <= seek_range.start && range.end >= seek_range.end)
} }
pub fn contains(&self, c: char) -> bool { pub fn contains(&self, c: char) -> bool {
self.contains_codepoint_range(c as u32..c as u32 + 1) self.ranges.iter().any(|r| r.contains(&(c as u32)))
} }
} }
@ -378,9 +342,15 @@ impl Ord for CharacterSet {
let count_cmp = self let count_cmp = self
.ranges .ranges
.iter() .iter()
.map(ExactSizeIterator::len) .map(std::iter::ExactSizeIterator::len)
.sum::<usize>() .sum::<usize>()
.cmp(&other.ranges.iter().map(ExactSizeIterator::len).sum()); .cmp(
&other
.ranges
.iter()
.map(std::iter::ExactSizeIterator::len)
.sum(),
);
if count_cmp != Ordering::Equal { if count_cmp != Ordering::Equal {
return count_cmp; return count_cmp;
} }
@ -416,11 +386,11 @@ impl fmt::Debug for CharacterSet {
write!(f, "^ ")?; write!(f, "^ ")?;
set = set.negate(); set = set.negate();
} }
for (i, range) in set.ranges().enumerate() { for (i, c) in set.chars().enumerate() {
if i > 0 { if i > 0 {
write!(f, ", ")?; write!(f, ", ")?;
} }
write!(f, "{range:?}")?; write!(f, "{c:?}")?;
} }
write!(f, "]")?; write!(f, "]")?;
Ok(()) Ok(())
@ -428,13 +398,11 @@ impl fmt::Debug for CharacterSet {
} }
impl Nfa { impl Nfa {
#[must_use]
pub const fn new() -> Self { pub const fn new() -> Self {
Self { states: Vec::new() } Self { states: Vec::new() }
} }
pub fn last_state_id(&self) -> u32 { pub fn last_state_id(&self) -> u32 {
assert!(!self.states.is_empty());
self.states.len() as u32 - 1 self.states.len() as u32 - 1
} }
} }
@ -496,7 +464,7 @@ impl<'a> NfaCursor<'a> {
fn group_transitions<'b>( fn group_transitions<'b>(
iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>, iter: impl Iterator<Item = (&'b CharacterSet, bool, i32, u32)>,
) -> Vec<NfaTransition> { ) -> Vec<NfaTransition> {
let mut result = Vec::<NfaTransition>::new(); let mut result: Vec<NfaTransition> = Vec::new();
for (chars, is_sep, prec, state) in iter { for (chars, is_sep, prec, state) in iter {
let mut chars = chars.clone(); let mut chars = chars.clone();
let mut i = 0; let mut i = 0;
@ -531,25 +499,25 @@ impl<'a> NfaCursor<'a> {
}); });
} }
} }
result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
let mut i = 0; let mut i = 0;
while i < result.len() { 'i_loop: while i < result.len() {
for j in 0..i { for j in 0..i {
if result[j].states == result[i].states if result[j].states == result[i].states
&& result[j].is_separator == result[i].is_separator && result[j].is_separator == result[i].is_separator
&& result[j].precedence == result[i].precedence && result[j].precedence == result[i].precedence
{ {
let characters = mem::take(&mut result[j].characters); let mut characters = CharacterSet::empty();
swap(&mut characters, &mut result[j].characters);
result[j].characters = characters.add(&result[i].characters); result[j].characters = characters.add(&result[i].characters);
result.remove(i); result.remove(i);
i -= 1; continue 'i_loop;
break;
} }
} }
i += 1; i += 1;
} }
result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
result result
} }
@ -831,18 +799,18 @@ mod tests {
(CharacterSet::from_char('e'), false, 0, 2), (CharacterSet::from_char('e'), false, 0, 2),
], ],
vec![ vec![
NfaTransition {
characters: CharacterSet::empty().add_char('b').add_char('e'),
precedence: 0,
states: vec![2],
is_separator: false,
},
NfaTransition { NfaTransition {
characters: CharacterSet::empty().add_char('a').add_range('c', 'd'), characters: CharacterSet::empty().add_char('a').add_range('c', 'd'),
precedence: 0, precedence: 0,
states: vec![1], states: vec![1],
is_separator: false, is_separator: false,
}, },
NfaTransition {
characters: CharacterSet::empty().add_char('b').add_char('e'),
precedence: 0,
states: vec![2],
is_separator: false,
},
], ],
), ),
]; ];
@ -950,19 +918,20 @@ mod tests {
assert_eq!( assert_eq!(
left.remove_intersection(&mut right), left.remove_intersection(&mut right),
row.intersection, row.intersection,
"row {i}a: {:?} && {:?}", "row {}a: {:?} && {:?}",
i,
row.left, row.left,
row.right row.right
); );
assert_eq!( assert_eq!(
left, row.left_only, left, row.left_only,
"row {i}a: {:?} - {:?}", "row {}a: {:?} - {:?}",
row.left, row.right i, row.left, row.right
); );
assert_eq!( assert_eq!(
right, row.right_only, right, row.right_only,
"row {i}a: {:?} - {:?}", "row {}a: {:?} - {:?}",
row.right, row.left i, row.right, row.left
); );
let mut left = row.left.clone(); let mut left = row.left.clone();
@ -970,25 +939,27 @@ mod tests {
assert_eq!( assert_eq!(
right.remove_intersection(&mut left), right.remove_intersection(&mut left),
row.intersection, row.intersection,
"row {i}b: {:?} && {:?}", "row {}b: {:?} && {:?}",
i,
row.left, row.left,
row.right row.right
); );
assert_eq!( assert_eq!(
left, row.left_only, left, row.left_only,
"row {i}b: {:?} - {:?}", "row {}b: {:?} - {:?}",
row.left, row.right i, row.left, row.right
); );
assert_eq!( assert_eq!(
right, row.right_only, right, row.right_only,
"row {i}b: {:?} - {:?}", "row {}b: {:?} - {:?}",
row.right, row.left i, row.right, row.left
); );
assert_eq!( assert_eq!(
row.left.clone().difference(row.right.clone()), row.left.clone().difference(row.right.clone()),
row.left_only, row.left_only,
"row {i}b: {:?} -- {:?}", "row {}b: {:?} -- {:?}",
i,
row.left, row.left,
row.right row.right
); );
@ -1062,7 +1033,7 @@ mod tests {
#[test] #[test]
#[allow(clippy::single_range_in_vec_init)] #[allow(clippy::single_range_in_vec_init)]
fn test_character_set_simplify_ignoring() { fn test_character_set_get_ranges() {
struct Row { struct Row {
chars: Vec<char>, chars: Vec<char>,
ruled_out_chars: Vec<char>, ruled_out_chars: Vec<char>,
@ -1085,21 +1056,6 @@ mod tests {
ruled_out_chars: vec!['d', 'f', 'g'], ruled_out_chars: vec!['d', 'f', 'g'],
expected_ranges: vec!['a'..'h', 'z'..'z'], expected_ranges: vec!['a'..'h', 'z'..'z'],
}, },
Row {
chars: vec!['a', 'b', 'c', 'g', 'h', 'i'],
ruled_out_chars: vec!['d', 'j'],
expected_ranges: vec!['a'..'c', 'g'..'i'],
},
Row {
chars: vec!['c', 'd', 'e', 'g', 'h'],
ruled_out_chars: vec!['a', 'b', 'c', 'd', 'e', 'f'],
expected_ranges: vec!['g'..'h'],
},
Row {
chars: vec!['I', 'N'],
ruled_out_chars: vec!['A', 'I', 'N', 'Z'],
expected_ranges: vec![],
},
]; ];
for Row { for Row {
@ -1108,23 +1064,13 @@ mod tests {
expected_ranges, expected_ranges,
} in &table } in &table
{ {
let ruled_out_chars = ruled_out_chars let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect();
.iter()
.fold(CharacterSet::empty(), |set, c| set.add_char(*c));
let mut set = CharacterSet::empty(); let mut set = CharacterSet::empty();
for c in chars { for c in chars {
set = set.add_char(*c); set = set.add_char(*c);
} }
let actual = set.simplify_ignoring(&ruled_out_chars); let ranges = set.simplify_ignoring(&ruled_out_chars);
let expected = expected_ranges assert_eq!(ranges, *expected_ranges);
.iter()
.fold(CharacterSet::empty(), |set, range| {
set.add_range(range.start, range.end)
});
assert_eq!(
actual, expected,
"chars: {chars:?}, ruled out chars: {ruled_out_chars:?}"
);
} }
} }
} }

View file

@ -1,12 +1,9 @@
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use super::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use super::rules::{Alias, AliasMap, Symbol, SymbolType};
use anyhow::{anyhow, Result};
use serde::Serialize; use serde::Serialize;
use thiserror::Error; use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
use super::{
grammars::{LexicalGrammar, SyntaxGrammar, VariableType},
rules::{Alias, AliasMap, Symbol, SymbolType},
};
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ChildType { pub enum ChildType {
@ -29,15 +26,10 @@ pub struct VariableInfo {
} }
#[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)] #[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct NodeInfoJSON { pub struct NodeInfoJSON {
#[serde(rename = "type")] #[serde(rename = "type")]
kind: String, kind: String,
named: bool, named: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
root: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
extra: bool,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
fields: Option<BTreeMap<String, FieldInfoJSON>>, fields: Option<BTreeMap<String, FieldInfoJSON>>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
@ -47,7 +39,6 @@ pub struct NodeInfoJSON {
} }
#[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg(feature = "load")]
pub struct NodeTypeJSON { pub struct NodeTypeJSON {
#[serde(rename = "type")] #[serde(rename = "type")]
kind: String, kind: String,
@ -55,7 +46,6 @@ pub struct NodeTypeJSON {
} }
#[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct FieldInfoJSON { pub struct FieldInfoJSON {
multiple: bool, multiple: bool,
required: bool, required: bool,
@ -69,7 +59,6 @@ pub struct ChildQuantity {
multiple: bool, multiple: bool,
} }
#[cfg(feature = "load")]
impl Default for FieldInfoJSON { impl Default for FieldInfoJSON {
fn default() -> Self { fn default() -> Self {
Self { Self {
@ -105,7 +94,7 @@ impl ChildQuantity {
} }
} }
const fn append(&mut self, other: Self) { fn append(&mut self, other: Self) {
if other.exists { if other.exists {
if self.exists || other.multiple { if self.exists || other.multiple {
self.multiple = true; self.multiple = true;
@ -117,7 +106,7 @@ impl ChildQuantity {
} }
} }
const fn union(&mut self, other: Self) -> bool { fn union(&mut self, other: Self) -> bool {
let mut result = false; let mut result = false;
if !self.exists && other.exists { if !self.exists && other.exists {
result = true; result = true;
@ -135,14 +124,6 @@ impl ChildQuantity {
} }
} }
pub type VariableInfoResult<T> = Result<T, VariableInfoError>;
#[derive(Debug, Error, Serialize)]
pub enum VariableInfoError {
#[error("Grammar error: Supertype symbols must always have a single visible child, but `{0}` can have multiple")]
InvalidSupertype(String),
}
/// Compute a summary of the public-facing structure of each variable in the /// Compute a summary of the public-facing structure of each variable in the
/// grammar. Each variable in the grammar corresponds to a distinct public-facing /// grammar. Each variable in the grammar corresponds to a distinct public-facing
/// node type. /// node type.
@ -153,22 +134,23 @@ pub enum VariableInfoError {
/// * `types` - The types of visible children the field can contain. /// * `types` - The types of visible children the field can contain.
/// * `optional` - Do `N` nodes always have this field? /// * `optional` - Do `N` nodes always have this field?
/// * `multiple` - Can `N` nodes have multiple children for this field? /// * `multiple` - Can `N` nodes have multiple children for this field?
/// 3. `children_without_fields` - The *other* named children of `N` that are not associated with /// 3. `children_without_fields` - The *other* named children of `N` that are
/// fields. Data regarding these children: /// not associated with fields. Data regarding these children:
/// * `types` - The types of named children with no field. /// * `types` - The types of named children with no field.
/// * `optional` - Do `N` nodes always have at least one named child with no field? /// * `optional` - Do `N` nodes always have at least one named child with no field?
/// * `multiple` - Can `N` nodes have multiple named children with no field? /// * `multiple` - Can `N` nodes have multiple named children with no field?
/// ///
/// Each summary must account for some indirect factors: /// Each summary must account for some indirect factors:
/// 1. hidden nodes. When a parent node `N` has a hidden child `C`, the visible children of `C` /// 1. hidden nodes. When a parent node `N` has a hidden child `C`, the visible
/// *appear* to be direct children of `N`. /// children of `C` *appear* to be direct children of `N`.
/// 2. aliases. If a parent node type `M` is aliased as some other type `N`, then nodes which /// 2. aliases. If a parent node type `M` is aliased as some other type `N`,
/// *appear* to have type `N` may have internal structure based on `M`. /// then nodes which *appear* to have type `N` may have internal structure based
/// on `M`.
pub fn get_variable_info( pub fn get_variable_info(
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
default_aliases: &AliasMap, default_aliases: &AliasMap,
) -> VariableInfoResult<Vec<VariableInfo>> { ) -> Result<Vec<VariableInfo>> {
let child_type_is_visible = |t: &ChildType| { let child_type_is_visible = |t: &ChildType| {
variable_type_for_child_type(t, syntax_grammar, lexical_grammar) >= VariableType::Anonymous variable_type_for_child_type(t, syntax_grammar, lexical_grammar) >= VariableType::Anonymous
}; };
@ -236,8 +218,7 @@ pub fn get_variable_info(
.entry(field_name) .entry(field_name)
.or_insert_with(ChildQuantity::zero); .or_insert_with(ChildQuantity::zero);
// Inherit the types and quantities of hidden children associated with // Inherit the types and quantities of hidden children associated with fields.
// fields.
if child_is_hidden && child_symbol.is_non_terminal() { if child_is_hidden && child_symbol.is_non_terminal() {
let child_variable_info = &result[child_symbol.index]; let child_variable_info = &result[child_symbol.index];
did_change |= extend_sorted( did_change |= extend_sorted(
@ -349,7 +330,13 @@ pub fn get_variable_info(
for supertype_symbol in &syntax_grammar.supertype_symbols { for supertype_symbol in &syntax_grammar.supertype_symbols {
if result[supertype_symbol.index].has_multi_step_production { if result[supertype_symbol.index].has_multi_step_production {
let variable = &syntax_grammar.variables[supertype_symbol.index]; let variable = &syntax_grammar.variables[supertype_symbol.index];
Err(VariableInfoError::InvalidSupertype(variable.name.clone()))?; return Err(anyhow!(
concat!(
"Grammar error: Supertype symbols must always ",
"have a single visible child, but `{}` can have multiple"
),
variable.name
));
} }
} }
@ -374,105 +361,12 @@ pub fn get_variable_info(
Ok(result) Ok(result)
} }
/// For every symbol in the grammar, collect the set of aliases under which it
/// can appear in a syntax tree. A `None` entry means the symbol can also
/// appear un-aliased.
fn get_aliases_by_symbol(
    syntax_grammar: &SyntaxGrammar,
    default_aliases: &AliasMap,
) -> HashMap<Symbol, BTreeSet<Option<Alias>>> {
    let mut result: HashMap<Symbol, BTreeSet<Option<Alias>>> = HashMap::new();

    // Start with the default alias of every symbol that has one.
    for (symbol, alias) in default_aliases {
        result
            .entry(*symbol)
            .or_insert_with(BTreeSet::new)
            .insert(Some(alias.clone()));
    }

    // Extra symbols without a default alias can appear un-aliased.
    for extra_symbol in &syntax_grammar.extra_symbols {
        if !default_aliases.contains_key(extra_symbol) {
            result
                .entry(*extra_symbol)
                .or_insert_with(BTreeSet::new)
                .insert(None);
        }
    }

    // Record the effective alias (explicit on the step, else the default,
    // else none) of every production step in the grammar.
    let all_steps = syntax_grammar
        .variables
        .iter()
        .flat_map(|variable| variable.productions.iter())
        .flat_map(|production| production.steps.iter());
    for step in all_steps {
        let effective_alias = step
            .alias
            .as_ref()
            .or_else(|| default_aliases.get(&step.symbol))
            .cloned();
        result
            .entry(step.symbol)
            .or_insert_with(BTreeSet::new)
            .insert(effective_alias);
    }

    // The root symbol is always exposed under its own name.
    let mut root_aliases = BTreeSet::new();
    root_aliases.insert(None);
    result.insert(Symbol::non_terminal(0), root_aliases);

    result
}
/// Build a map from each supertype symbol to the list of child types that can
/// appear as its visible child.
///
/// `variable_info` is indexed by non-terminal symbol index, so entry `i`
/// describes `Symbol::non_terminal(i)`; only symbols listed in
/// `syntax_grammar.supertype_symbols` are included in the result.
pub fn get_supertype_symbol_map(
    syntax_grammar: &SyntaxGrammar,
    _default_aliases: &AliasMap,
    variable_info: &[VariableInfo],
) -> BTreeMap<Symbol, Vec<ChildType>> {
    // NOTE(review): the previous implementation also built an alias lookup via
    // `get_aliases_by_symbol` and an inverse `symbols_by_alias` map, but never
    // read either of them. That dead computation has been removed; the
    // `_default_aliases` parameter is kept so the public signature is
    // unchanged for callers.
    let mut supertype_symbol_map = BTreeMap::new();
    for (i, info) in variable_info.iter().enumerate() {
        let symbol = Symbol::non_terminal(i);
        if syntax_grammar.supertype_symbols.contains(&symbol) {
            // The subtypes of a supertype are exactly the visible child types
            // recorded for it during variable-info computation.
            supertype_symbol_map.insert(symbol, info.children.types.clone());
        }
    }
    supertype_symbol_map
}
/// Convenience alias for results whose error is a supertype dependency cycle.
#[cfg(feature = "load")]
pub type SuperTypeCycleResult<T> = Result<T, SuperTypeCycleError>;

/// Error produced when the supertype/subtype relation contains a cycle and
/// therefore cannot be ordered so that subtypes precede their supertypes.
#[derive(Debug, Error, Serialize)]
pub struct SuperTypeCycleError {
    // Names of the node kinds involved in the unresolved cycle.
    items: Vec<String>,
}
impl std::fmt::Display for SuperTypeCycleError {
    /// Renders as `Dependency cycle detected in node types: a, b, c`
    /// (no trailing comma; nothing after the colon when `items` is empty).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Dependency cycle detected in node types:")?;
        let mut remaining = self.items.iter().peekable();
        while let Some(item) = remaining.next() {
            write!(f, " {item}")?;
            // Separate entries with commas, but never emit a trailing one.
            if remaining.peek().is_some() {
                write!(f, ",")?;
            }
        }
        Ok(())
    }
}
#[cfg(feature = "load")]
pub fn generate_node_types_json( pub fn generate_node_types_json(
syntax_grammar: &SyntaxGrammar, syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
default_aliases: &AliasMap, default_aliases: &AliasMap,
variable_info: &[VariableInfo], variable_info: &[VariableInfo],
) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> { ) -> Vec<NodeInfoJSON> {
let mut node_types_json = BTreeMap::new(); let mut node_types_json = BTreeMap::new();
let child_type_to_node_type = |child_type: &ChildType| match child_type { let child_type_to_node_type = |child_type: &ChildType| match child_type {
@ -528,32 +422,41 @@ pub fn generate_node_types_json(
} }
}; };
let aliases_by_symbol = get_aliases_by_symbol(syntax_grammar, default_aliases); let mut aliases_by_symbol = HashMap::new();
for (symbol, alias) in default_aliases {
let empty = BTreeSet::new(); aliases_by_symbol.insert(*symbol, {
let extra_names = syntax_grammar let mut aliases = HashSet::new();
.extra_symbols aliases.insert(Some(alias.clone()));
.iter() aliases
.flat_map(|symbol| { });
}
for extra_symbol in &syntax_grammar.extra_symbols {
if !default_aliases.contains_key(extra_symbol) {
aliases_by_symbol aliases_by_symbol
.get(symbol) .entry(*extra_symbol)
.unwrap_or(&empty) .or_insert_with(HashSet::new)
.iter() .insert(None);
.map(|alias| { }
alias.as_ref().map_or( }
match symbol.kind { for variable in &syntax_grammar.variables {
SymbolType::NonTerminal => &syntax_grammar.variables[symbol.index].name, for production in &variable.productions {
SymbolType::Terminal => &lexical_grammar.variables[symbol.index].name, for step in &production.steps {
SymbolType::External => { aliases_by_symbol
&syntax_grammar.external_tokens[symbol.index].name .entry(step.symbol)
} .or_insert_with(HashSet::new)
_ => unreachable!(), .insert(
}, step.alias
|alias| &alias.value, .as_ref()
) .or_else(|| default_aliases.get(&step.symbol))
}) .cloned(),
}) );
.collect::<HashSet<_>>(); }
}
}
aliases_by_symbol.insert(
Symbol::non_terminal(0),
std::iter::once(&None).cloned().collect(),
);
let mut subtype_map = Vec::new(); let mut subtype_map = Vec::new();
for (i, info) in variable_info.iter().enumerate() { for (i, info) in variable_info.iter().enumerate() {
@ -566,8 +469,6 @@ pub fn generate_node_types_json(
.or_insert_with(|| NodeInfoJSON { .or_insert_with(|| NodeInfoJSON {
kind: variable.name.clone(), kind: variable.name.clone(),
named: true, named: true,
root: false,
extra: extra_names.contains(&variable.name),
fields: None, fields: None,
children: None, children: None,
subtypes: None, subtypes: None,
@ -589,7 +490,10 @@ pub fn generate_node_types_json(
} else if !syntax_grammar.variables_to_inline.contains(&symbol) { } else if !syntax_grammar.variables_to_inline.contains(&symbol) {
// If a rule is aliased under multiple names, then its information // If a rule is aliased under multiple names, then its information
// contributes to multiple entries in the final JSON. // contributes to multiple entries in the final JSON.
for alias in aliases_by_symbol.get(&symbol).unwrap_or(&BTreeSet::new()) { for alias in aliases_by_symbol
.get(&Symbol::non_terminal(i))
.unwrap_or(&HashSet::new())
{
let kind; let kind;
let is_named; let is_named;
if let Some(alias) = alias { if let Some(alias) = alias {
@ -610,8 +514,6 @@ pub fn generate_node_types_json(
NodeInfoJSON { NodeInfoJSON {
kind: kind.clone(), kind: kind.clone(),
named: is_named, named: is_named,
root: i == 0,
extra: extra_names.contains(&kind),
fields: Some(BTreeMap::new()), fields: Some(BTreeMap::new()),
children: None, children: None,
subtypes: None, subtypes: None,
@ -621,8 +523,8 @@ pub fn generate_node_types_json(
let fields_json = node_type_json.fields.as_mut().unwrap(); let fields_json = node_type_json.fields.as_mut().unwrap();
for (new_field, field_info) in &info.fields { for (new_field, field_info) in &info.fields {
let field_json = fields_json.entry(new_field.clone()).or_insert_with(|| { let field_json = fields_json.entry(new_field.clone()).or_insert_with(|| {
// If another rule is aliased with the same name, and does *not* have this // If another rule is aliased with the same name, and does *not* have this field,
// field, then this field cannot be required. // then this field cannot be required.
let mut field_json = FieldInfoJSON::default(); let mut field_json = FieldInfoJSON::default();
if node_type_existed { if node_type_existed {
field_json.required = false; field_json.required = false;
@ -632,8 +534,8 @@ pub fn generate_node_types_json(
populate_field_info_json(field_json, field_info); populate_field_info_json(field_json, field_info);
} }
// If another rule is aliased with the same name, any fields that aren't present in // If another rule is aliased with the same name, any fields that aren't present in this
// this cannot be required. // cannot be required.
for (existing_field, field_json) in fields_json.iter_mut() { for (existing_field, field_json) in fields_json.iter_mut() {
if !info.fields.contains_key(existing_field) { if !info.fields.contains_key(existing_field) {
field_json.required = false; field_json.required = false;
@ -650,40 +552,22 @@ pub fn generate_node_types_json(
} }
} }
// Sort the subtype map topologically so that subtypes are listed before their supertypes. // Sort the subtype map so that subtypes are listed before their supertypes.
let mut sorted_kinds = Vec::with_capacity(subtype_map.len());
let mut top_sort = topological_sort::TopologicalSort::<String>::new();
for (supertype, subtypes) in &subtype_map {
for subtype in subtypes {
top_sort.add_dependency(subtype.kind.clone(), supertype.kind.clone());
}
}
loop {
let mut next_kinds = top_sort.pop_all();
match (next_kinds.is_empty(), top_sort.is_empty()) {
(true, true) => break,
(true, false) => {
let mut items = top_sort.collect::<Vec<String>>();
items.sort();
return Err(SuperTypeCycleError { items });
}
(false, _) => {
next_kinds.sort();
sorted_kinds.extend(next_kinds);
}
}
}
subtype_map.sort_by(|a, b| { subtype_map.sort_by(|a, b| {
let a_idx = sorted_kinds.iter().position(|n| n.eq(&a.0.kind)).unwrap(); if b.1.contains(&a.0) {
let b_idx = sorted_kinds.iter().position(|n| n.eq(&b.0.kind)).unwrap(); Ordering::Less
a_idx.cmp(&b_idx) } else if a.1.contains(&b.0) {
Ordering::Greater
} else {
Ordering::Equal
}
}); });
for node_type_json in node_types_json.values_mut() { for node_type_json in node_types_json.values_mut() {
if node_type_json if node_type_json
.children .children
.as_ref() .as_ref()
.is_some_and(|c| c.types.is_empty()) .map_or(false, |c| c.types.is_empty())
{ {
node_type_json.children = None; node_type_json.children = None;
} }
@ -700,6 +584,7 @@ pub fn generate_node_types_json(
let mut anonymous_node_types = Vec::new(); let mut anonymous_node_types = Vec::new();
let empty = HashSet::new();
let regular_tokens = lexical_grammar let regular_tokens = lexical_grammar
.variables .variables
.iter() .iter()
@ -737,18 +622,13 @@ pub fn generate_node_types_json(
for (name, kind) in regular_tokens.chain(external_tokens) { for (name, kind) in regular_tokens.chain(external_tokens) {
match kind { match kind {
VariableType::Named => { VariableType::Named => {
let node_type_json = let node_type_json = node_types_json.entry(name.clone()).or_insert(NodeInfoJSON {
node_types_json kind: name.clone(),
.entry(name.clone()) named: true,
.or_insert_with(|| NodeInfoJSON { fields: None,
kind: name.clone(), children: None,
named: true, subtypes: None,
root: false, });
extra: extra_names.contains(&name),
fields: None,
children: None,
subtypes: None,
});
if let Some(children) = &mut node_type_json.children { if let Some(children) = &mut node_type_json.children {
children.required = false; children.required = false;
} }
@ -761,8 +641,6 @@ pub fn generate_node_types_json(
VariableType::Anonymous => anonymous_node_types.push(NodeInfoJSON { VariableType::Anonymous => anonymous_node_types.push(NodeInfoJSON {
kind: name.clone(), kind: name.clone(),
named: false, named: false,
root: false,
extra: extra_names.contains(&name),
fields: None, fields: None,
children: None, children: None,
subtypes: None, subtypes: None,
@ -783,15 +661,11 @@ pub fn generate_node_types_json(
a_is_leaf.cmp(&b_is_leaf) a_is_leaf.cmp(&b_is_leaf)
}) })
.then_with(|| a.kind.cmp(&b.kind)) .then_with(|| a.kind.cmp(&b.kind))
.then_with(|| a.named.cmp(&b.named))
.then_with(|| a.root.cmp(&b.root))
.then_with(|| a.extra.cmp(&b.extra))
}); });
result.dedup(); result.dedup();
Ok(result) result
} }
#[cfg(feature = "load")]
fn process_supertypes(info: &mut FieldInfoJSON, subtype_map: &[(NodeTypeJSON, Vec<NodeTypeJSON>)]) { fn process_supertypes(info: &mut FieldInfoJSON, subtype_map: &[(NodeTypeJSON, Vec<NodeTypeJSON>)]) {
for (supertype, subtypes) in subtype_map { for (supertype, subtypes) in subtype_map {
if info.types.contains(supertype) { if info.types.contains(supertype) {
@ -828,26 +702,24 @@ fn extend_sorted<'a, T>(vec: &mut Vec<T>, values: impl IntoIterator<Item = &'a T
where where
T: 'a + Clone + Eq + Ord, T: 'a + Clone + Eq + Ord,
{ {
values.into_iter().fold(false, |acc, value| { values.into_iter().any(|value| {
if let Err(i) = vec.binary_search(value) { if let Err(i) = vec.binary_search(value) {
vec.insert(i, value.clone()); vec.insert(i, value.clone());
true true
} else { } else {
acc false
} }
}) })
} }
#[cfg(all(test, feature = "load"))] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::grammars::{
grammars::{ InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable,
InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable,
},
prepare_grammar::prepare_grammar,
rules::Rule,
}; };
use crate::generate::prepare_grammar::prepare_grammar;
use crate::generate::rules::Rule;
#[test] #[test]
fn test_node_types_simple() { fn test_node_types_simple() {
@ -875,8 +747,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.len(), 3); assert_eq!(node_types.len(), 3);
@ -885,8 +756,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -924,8 +793,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: ";".to_string(), kind: ";".to_string(),
named: false, named: false,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -936,8 +803,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -965,9 +830,7 @@ mod tests {
}, },
// This rule is not reachable from the start symbol, but // This rule is not reachable from the start symbol, but
// it is reachable from the 'extra_symbols' so it // it is reachable from the 'extra_symbols' so it
// should be present in the node_types. // should be present in the node_types
// But because it's only a literal, it will get replaced by
// a lexical variable.
Variable { Variable {
name: "v3".to_string(), name: "v3".to_string(),
kind: VariableType::Named, kind: VariableType::Named,
@ -975,8 +838,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.len(), 4); assert_eq!(node_types.len(), 4);
@ -985,8 +847,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1024,8 +884,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: ";".to_string(), kind: ";".to_string(),
named: false, named: false,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1036,8 +894,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1048,120 +904,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v3".to_string(), kind: "v3".to_string(),
named: true, named: true,
root: false,
extra: true,
subtypes: None,
children: None,
fields: None
}
);
}
#[test]
fn test_node_types_deeper_extras() {
let node_types = get_node_types(&InputGrammar {
extra_symbols: vec![Rule::named("v3")],
variables: vec![
Variable {
name: "v1".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::field("f1".to_string(), Rule::named("v2")),
Rule::field("f2".to_string(), Rule::string(";")),
]),
},
Variable {
name: "v2".to_string(),
kind: VariableType::Named,
rule: Rule::string("x"),
},
// This rule is not reachable from the start symbol, but
// it is reachable from the 'extra_symbols' so it
// should be present in the node_types.
// Because it is not just a literal, it won't get replaced
// by a lexical variable.
Variable {
name: "v3".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![Rule::string("y"), Rule::repeat(Rule::string("z"))]),
},
],
..Default::default()
})
.unwrap();
assert_eq!(node_types.len(), 6);
assert_eq!(
node_types[0],
NodeInfoJSON {
kind: "v1".to_string(),
named: true,
root: true,
extra: false,
subtypes: None,
children: None,
fields: Some(
vec![
(
"f1".to_string(),
FieldInfoJSON {
multiple: false,
required: true,
types: vec![NodeTypeJSON {
kind: "v2".to_string(),
named: true,
}]
}
),
(
"f2".to_string(),
FieldInfoJSON {
multiple: false,
required: true,
types: vec![NodeTypeJSON {
kind: ";".to_string(),
named: false,
}]
}
),
]
.into_iter()
.collect()
)
}
);
assert_eq!(
node_types[1],
NodeInfoJSON {
kind: "v3".to_string(),
named: true,
root: false,
extra: true,
subtypes: None,
children: None,
fields: Some(BTreeMap::default())
}
);
assert_eq!(
node_types[2],
NodeInfoJSON {
kind: ";".to_string(),
named: false,
root: false,
extra: false,
subtypes: None,
children: None,
fields: None
}
);
assert_eq!(
node_types[3],
NodeInfoJSON {
kind: "v2".to_string(),
named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None fields: None
@ -1200,16 +942,13 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
NodeInfoJSON { NodeInfoJSON {
kind: "_v2".to_string(), kind: "_v2".to_string(),
named: true, named: true,
root: false,
extra: false,
fields: None, fields: None,
children: None, children: None,
subtypes: Some(vec![ subtypes: Some(vec![
@ -1233,8 +972,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1290,16 +1027,13 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
NodeInfoJSON { NodeInfoJSON {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1337,8 +1071,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "v2".to_string(), kind: "v2".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: false, multiple: false,
@ -1376,16 +1108,13 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
NodeInfoJSON { NodeInfoJSON {
kind: "v1".to_string(), kind: "v1".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1451,8 +1180,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!(node_types.iter().find(|t| t.kind == "foo_identifier"), None); assert_eq!(node_types.iter().find(|t| t.kind == "foo_identifier"), None);
assert_eq!( assert_eq!(
@ -1460,8 +1188,6 @@ mod tests {
Some(&NodeInfoJSON { Some(&NodeInfoJSON {
kind: "identifier".to_string(), kind: "identifier".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None, fields: None,
@ -1472,8 +1198,6 @@ mod tests {
Some(&NodeInfoJSON { Some(&NodeInfoJSON {
kind: "type_identifier".to_string(), kind: "type_identifier".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: None, fields: None,
@ -1508,16 +1232,13 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types[0], node_types[0],
NodeInfoJSON { NodeInfoJSON {
kind: "a".to_string(), kind: "a".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -1558,16 +1279,13 @@ mod tests {
]), ]),
}], }],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types, node_types,
[NodeInfoJSON { [NodeInfoJSON {
kind: "script".to_string(), kind: "script".to_string(),
named: true, named: true,
root: true,
extra: false,
fields: Some(BTreeMap::new()), fields: Some(BTreeMap::new()),
children: None, children: None,
subtypes: None subtypes: None
@ -1607,8 +1325,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
&node_types &node_types
@ -1625,8 +1342,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "a".to_string(), kind: "a".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: None, children: None,
fields: Some( fields: Some(
@ -1682,8 +1397,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "script".to_string(), kind: "script".to_string(),
named: true, named: true,
root: true,
extra: false,
subtypes: None, subtypes: None,
// Only one node // Only one node
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
@ -1727,8 +1440,7 @@ mod tests {
}, },
], ],
..Default::default() ..Default::default()
}) });
.unwrap();
assert_eq!( assert_eq!(
node_types.iter().map(|n| &n.kind).collect::<Vec<_>>(), node_types.iter().map(|n| &n.kind).collect::<Vec<_>>(),
@ -1739,8 +1451,6 @@ mod tests {
NodeInfoJSON { NodeInfoJSON {
kind: "b".to_string(), kind: "b".to_string(),
named: true, named: true,
root: false,
extra: false,
subtypes: None, subtypes: None,
children: Some(FieldInfoJSON { children: Some(FieldInfoJSON {
multiple: true, multiple: true,
@ -2055,7 +1765,7 @@ mod tests {
); );
} }
fn get_node_types(grammar: &InputGrammar) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> { fn get_node_types(grammar: &InputGrammar) -> Vec<NodeInfoJSON> {
let (syntax_grammar, lexical_grammar, _, default_aliases) = let (syntax_grammar, lexical_grammar, _, default_aliases) =
prepare_grammar(grammar).unwrap(); prepare_grammar(grammar).unwrap();
let variable_info = let variable_info =

View file

@ -0,0 +1,238 @@
use super::grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType};
use super::rules::{Precedence, Rule};
use anyhow::{anyhow, Result};
use serde::Deserialize;
use serde_json::{Map, Value};
/// JSON representation of a single rule in a `grammar.json` file, tagged by
/// its `type` field. Variant names are SCREAMING_CASE to match the JSON
/// emitted by the JavaScript grammar DSL.
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum RuleJSON {
    /// Expose `content` in the tree under a different name (`value`).
    ALIAS {
        content: Box<RuleJSON>,
        named: bool,
        value: String,
    },
    /// Matches the empty string.
    BLANK,
    /// A literal string token.
    STRING {
        value: String,
    },
    /// A regex token; `flags` is the optional JS-style flag string.
    PATTERN {
        value: String,
        flags: Option<String>,
    },
    /// A reference to another named rule.
    SYMBOL {
        name: String,
    },
    /// Ordered choice between alternatives.
    CHOICE {
        members: Vec<RuleJSON>,
    },
    /// Assign a field name to the content's node.
    FIELD {
        name: String,
        content: Box<RuleJSON>,
    },
    /// Match each member in sequence.
    SEQ {
        members: Vec<RuleJSON>,
    },
    /// Zero or more repetitions.
    REPEAT {
        content: Box<RuleJSON>,
    },
    /// One or more repetitions.
    REPEAT1 {
        content: Box<RuleJSON>,
    },
    /// Dynamic (conflict-resolution) precedence; always an integer.
    PREC_DYNAMIC {
        value: i32,
        content: Box<RuleJSON>,
    },
    /// Left-associative precedence.
    PREC_LEFT {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    /// Right-associative precedence.
    PREC_RIGHT {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    /// Non-associative precedence.
    PREC {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    /// Treat the content as a single token.
    TOKEN {
        content: Box<RuleJSON>,
    },
    /// Like TOKEN, but no preceding whitespace/extras are allowed.
    IMMEDIATE_TOKEN {
        content: Box<RuleJSON>,
    },
}
/// A precedence value in `grammar.json`: either a numeric level or the name
/// of a precedence declared in the grammar's `precedences` array.
#[derive(Deserialize)]
#[serde(untagged)]
enum PrecedenceValueJSON {
    Integer(i32),
    Name(String),
}
/// Top-level shape of a `grammar.json` file.
#[derive(Deserialize)]
pub(crate) struct GrammarJSON {
    /// The language name.
    pub(crate) name: String,
    // Rule bodies are kept as raw JSON values here and deserialized one at a
    // time in `parse_grammar`.
    rules: Map<String, Value>,
    // All of the following sections are optional in the JSON file and default
    // to empty.
    #[serde(default)]
    precedences: Vec<Vec<RuleJSON>>,
    #[serde(default)]
    conflicts: Vec<Vec<String>>,
    #[serde(default)]
    externals: Vec<RuleJSON>,
    #[serde(default)]
    extras: Vec<RuleJSON>,
    #[serde(default)]
    inline: Vec<String>,
    #[serde(default)]
    supertypes: Vec<String>,
    // Optional name of the grammar's "word" token.
    word: Option<String>,
}
/// Parse the text of a `grammar.json` file into an `InputGrammar`.
///
/// Returns an error if the JSON is malformed, if any individual rule fails to
/// deserialize, or if an entry in the `precedences` array is neither a string
/// nor a symbol.
pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
    let grammar_json: GrammarJSON = serde_json::from_str(input)?;

    // Every top-level rule becomes a named variable. Rule bodies are
    // deserialized individually so a malformed rule fails on its own entry.
    // (`name` is owned here, so it is moved directly — no clone needed.)
    let mut variables = Vec::with_capacity(grammar_json.rules.len());
    for (name, value) in grammar_json.rules {
        variables.push(Variable {
            name,
            kind: VariableType::Named,
            rule: parse_rule(serde_json::from_value(value)?),
        });
    }

    // Each inner list of `precedences` is an ordering of precedence names
    // and/or symbols.
    let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
    for list in grammar_json.precedences {
        let mut ordering = Vec::with_capacity(list.len());
        for entry in list {
            ordering.push(match entry {
                RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
                RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
                _ => {
                    return Err(anyhow!(
                        "Invalid rule in precedences array. Only strings and symbols are allowed"
                    ))
                }
            });
        }
        precedence_orderings.push(ordering);
    }

    let extra_symbols = grammar_json.extras.into_iter().map(parse_rule).collect();
    let external_tokens = grammar_json.externals.into_iter().map(parse_rule).collect();

    Ok(InputGrammar {
        name: grammar_json.name,
        word_token: grammar_json.word,
        expected_conflicts: grammar_json.conflicts,
        supertype_symbols: grammar_json.supertypes,
        variables_to_inline: grammar_json.inline,
        precedence_orderings,
        variables,
        extra_symbols,
        external_tokens,
    })
}
/// Convert one deserialized `RuleJSON` node (and, recursively, its children)
/// into the generator's internal `Rule` representation.
fn parse_rule(json: RuleJSON) -> Rule {
    match json {
        RuleJSON::ALIAS {
            content,
            value,
            named,
        } => Rule::alias(parse_rule(*content), value, named),
        RuleJSON::BLANK => Rule::Blank,
        RuleJSON::STRING { value } => Rule::String(value),
        RuleJSON::PATTERN { value, flags } => Rule::Pattern(
            value,
            flags.map_or(String::new(), |f| {
                // Keep only the regex flags the generator understands.
                // Previously the check was `if *c == 'i' { *c != 'u' }`, whose
                // inner condition was vacuously true, so the `u` flag fell
                // into the warning branch despite being intended as silently
                // ignored.
                f.chars()
                    .filter(|&c| match c {
                        // Case-insensitive matching is supported.
                        'i' => true,
                        // Unicode mode: silently ignored (dropped from the
                        // flag string without a warning).
                        'u' => false,
                        _ => {
                            eprintln!("Warning: unsupported flag {c}");
                            false
                        }
                    })
                    .collect()
            }),
        ),
        RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
        RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
        RuleJSON::FIELD { content, name } => Rule::field(name, parse_rule(*content)),
        RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()),
        RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)),
        // `Rule::repeat` is one-or-more, so JSON's zero-or-more REPEAT becomes
        // "one-or-more, or blank".
        RuleJSON::REPEAT { content } => {
            Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank])
        }
        RuleJSON::PREC { value, content } => Rule::prec(value.into(), parse_rule(*content)),
        RuleJSON::PREC_LEFT { value, content } => {
            Rule::prec_left(value.into(), parse_rule(*content))
        }
        RuleJSON::PREC_RIGHT { value, content } => {
            Rule::prec_right(value.into(), parse_rule(*content))
        }
        RuleJSON::PREC_DYNAMIC { value, content } => {
            Rule::prec_dynamic(value, parse_rule(*content))
        }
        RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
        RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
    }
}
impl From<PrecedenceValueJSON> for Precedence {
    /// Map the loosely-typed JSON precedence (integer or string) onto the
    /// corresponding `Precedence` variant.
    fn from(val: PrecedenceValueJSON) -> Self {
        match val {
            PrecedenceValueJSON::Integer(level) => Self::Integer(level),
            PrecedenceValueJSON::Name(name) => Self::Name(name),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trip a minimal grammar through `parse_grammar` and check that
    /// rule names, kinds, and rule structure survive. Also exercises the
    /// REPEAT1 -> `Rule::repeat` mapping.
    #[test]
    fn test_parse_grammar() {
        let grammar = parse_grammar(
            r#"{
            "name": "my_lang",
            "rules": {
                "file": {
                    "type": "REPEAT1",
                    "content": {
                        "type": "SYMBOL",
                        "name": "statement"
                    }
                },
                "statement": {
                    "type": "STRING",
                    "value": "foo"
                }
            }
        }"#,
        )
        .unwrap();

        assert_eq!(grammar.name, "my_lang");
        assert_eq!(
            grammar.variables,
            vec![
                Variable {
                    name: "file".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
                },
                Variable {
                    name: "statement".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::String("foo".to_string())
                },
            ]
        );
    }
}

View file

@ -1,10 +1,8 @@
use std::{collections::HashMap, mem};
use super::ExtractedSyntaxGrammar; use super::ExtractedSyntaxGrammar;
use crate::{ use crate::generate::grammars::{Variable, VariableType};
grammars::{Variable, VariableType}, use crate::generate::rules::{Rule, Symbol};
rules::{Rule, Symbol}, use std::collections::HashMap;
}; use std::mem;
struct Expander { struct Expander {
variable_name: String, variable_name: String,
@ -59,7 +57,7 @@ impl Expander {
params: params.clone(), params: params.clone(),
}, },
// For repetitions, introduce an auxiliary rule that contains the // For repetitions, introduce an auxiliary rule that contains the the
// repeated content, but can also contain a recursive binary tree structure. // repeated content, but can also contain a recursive binary tree structure.
Rule::Repeat(content) => { Rule::Repeat(content) => {
let inner_rule = self.expand_rule(content); let inner_rule = self.expand_rule(content);

View file

@ -1,16 +1,35 @@
use regex_syntax::{
hir::{Class, Hir, HirKind},
ParserBuilder,
};
use serde::Serialize;
use thiserror::Error;
use super::ExtractedLexicalGrammar; use super::ExtractedLexicalGrammar;
use crate::{ use crate::generate::grammars::{LexicalGrammar, LexicalVariable};
grammars::{LexicalGrammar, LexicalVariable}, use crate::generate::nfa::{CharacterSet, Nfa, NfaState};
nfa::{CharacterSet, Nfa, NfaState}, use crate::generate::rules::{Precedence, Rule};
rules::{Precedence, Rule}, use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use regex_syntax::ast::{
parse, Ast, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem, ClassUnicodeKind,
RepetitionKind, RepetitionRange,
}; };
use std::collections::HashMap;
use std::i32;
lazy_static! {
static ref CURLY_BRACE_REGEX: Regex =
Regex::new(r"(^|[^\\pP])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}").unwrap();
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap();
static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap();
}
const UNICODE_CATEGORIES_JSON: &str = include_str!("./unicode-categories.json");
const UNICODE_PROPERTIES_JSON: &str = include_str!("./unicode-properties.json");
const UNICODE_CATEGORY_ALIASES_JSON: &str = include_str!("./unicode-category-aliases.json");
const UNICODE_PROPERTY_ALIASES_JSON: &str = include_str!("./unicode-property-aliases.json");
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
struct NfaBuilder { struct NfaBuilder {
nfa: Nfa, nfa: Nfa,
@ -18,40 +37,6 @@ struct NfaBuilder {
precedence_stack: Vec<i32>, precedence_stack: Vec<i32>,
} }
pub type ExpandTokensResult<T> = Result<T, ExpandTokensError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandTokensError {
#[error(
"The rule `{0}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error(transparent)]
Processing(ExpandTokensProcessingError),
#[error(transparent)]
ExpandRule(ExpandRuleError),
}
#[derive(Debug, Error, Serialize)]
pub struct ExpandTokensProcessingError {
rule: String,
error: ExpandRuleError,
}
impl std::fmt::Display for ExpandTokensProcessingError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(
f,
"Error processing rule {}: Grammar error: Unexpected rule {:?}",
self.rule, self.error
)?;
Ok(())
}
}
fn get_implicit_precedence(rule: &Rule) -> i32 { fn get_implicit_precedence(rule: &Rule) -> i32 {
match rule { match rule {
Rule::String(_) => 2, Rule::String(_) => 2,
@ -75,7 +60,30 @@ const fn get_completion_precedence(rule: &Rule) -> i32 {
0 0
} }
pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult<LexicalGrammar> { fn preprocess_regex(content: &str) -> String {
let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}");
let mut result = String::with_capacity(content.len());
let mut is_escaped = false;
for c in content.chars() {
if is_escaped {
if !ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) {
result.push('\\');
}
result.push(c);
is_escaped = false;
} else if c == '\\' {
is_escaped = true;
} else {
result.push(c);
}
}
if is_escaped {
result.push('\\');
}
result
}
pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder { let mut builder = NfaBuilder {
nfa: Nfa::new(), nfa: Nfa::new(),
is_sep: true, is_sep: true,
@ -89,12 +97,8 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
Rule::repeat(Rule::choice(grammar.separators)) Rule::repeat(Rule::choice(grammar.separators))
}; };
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() { for (i, variable) in grammar.variables.into_iter().enumerate() {
if variable.rule.is_empty() {
Err(ExpandTokensError::EmptyString(variable.name.clone()))?;
}
let is_immediate_token = match &variable.rule { let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token, Rule::Metadata { params, .. } => params.is_main_token,
_ => false, _ => false,
@ -108,19 +112,12 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
let last_state_id = builder.nfa.last_state_id(); let last_state_id = builder.nfa.last_state_id();
builder builder
.expand_rule(&variable.rule, last_state_id) .expand_rule(&variable.rule, last_state_id)
.map_err(|e| { .with_context(|| format!("Error processing rule {}", variable.name))?;
ExpandTokensError::Processing(ExpandTokensProcessingError {
rule: variable.name.clone(),
error: e,
})
})?;
if !is_immediate_token { if !is_immediate_token {
builder.is_sep = true; builder.is_sep = true;
let last_state_id = builder.nfa.last_state_id(); let last_state_id = builder.nfa.last_state_id();
builder builder.expand_rule(&separator_rule, last_state_id)?;
.expand_rule(&separator_rule, last_state_id)
.map_err(ExpandTokensError::ExpandRule)?;
} }
variables.push(LexicalVariable { variables.push(LexicalVariable {
@ -137,64 +134,23 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> ExpandTokensResult
}) })
} }
pub type ExpandRuleResult<T> = Result<T, ExpandRuleError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandRuleError {
#[error("Grammar error: Unexpected rule {0:?}")]
UnexpectedRule(Rule),
#[error("{0}")]
Parse(String),
#[error(transparent)]
ExpandRegex(ExpandRegexError),
}
pub type ExpandRegexResult<T> = Result<T, ExpandRegexError>;
#[derive(Debug, Error, Serialize)]
pub enum ExpandRegexError {
#[error("{0}")]
Utf8(String),
#[error("Regex error: Assertions are not supported")]
Assertion,
}
impl NfaBuilder { impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> ExpandRuleResult<bool> { fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule { match rule {
Rule::Pattern(s, f) => { Rule::Pattern(s, f) => {
// With unicode enabled, `\w`, `\s` and `\d` expand to character sets that are much let s = preprocess_regex(s);
// larger than intended, so we replace them with the actual let ast = parse::Parser::new().parse(&s)?;
// character sets they should represent. If the full unicode range self.expand_regex(&ast, next_state_id, f.contains('i'))
// of `\w`, `\s` or `\d` are needed then `\p{L}`, `\p{Z}` and `\p{N}` should be
// used.
let s = s
.replace(r"\w", r"[0-9A-Za-z_]")
.replace(r"\s", r"[\t-\r ]")
.replace(r"\d", r"[0-9]")
.replace(r"\W", r"[^0-9A-Za-z_]")
.replace(r"\S", r"[^\t-\r ]")
.replace(r"\D", r"[^0-9]");
let mut parser = ParserBuilder::new()
.case_insensitive(f.contains('i'))
.unicode(true)
.utf8(false)
.build();
let hir = parser
.parse(&s)
.map_err(|e| ExpandRuleError::Parse(e.to_string()))?;
self.expand_regex(&hir, next_state_id)
.map_err(ExpandRuleError::ExpandRegex)
} }
Rule::String(s) => { Rule::String(s) => {
for c in s.chars().rev() { for c in s.chars().rev() {
self.push_advance(CharacterSet::from_char(c), next_state_id); self.push_advance(CharacterSet::empty().add_char(c), next_state_id);
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
Ok(!s.is_empty()) Ok(!s.is_empty())
} }
Rule::Choice(elements) => { Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::with_capacity(elements.len()); let mut alternative_state_ids = Vec::new();
for element in elements { for element in elements {
if self.expand_rule(element, next_state_id)? { if self.expand_rule(element, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id()); alternative_state_ids.push(self.nfa.last_state_id());
@ -248,98 +204,129 @@ impl NfaBuilder {
result result
} }
Rule::Blank => Ok(false), Rule::Blank => Ok(false),
_ => Err(ExpandRuleError::UnexpectedRule(rule.clone()))?, _ => Err(anyhow!("Grammar error: Unexpected rule {rule:?}")),
} }
} }
fn expand_regex(&mut self, hir: &Hir, mut next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_regex(
match hir.kind() { &mut self,
HirKind::Empty => Ok(false), ast: &Ast,
HirKind::Literal(literal) => { mut next_state_id: u32,
for character in std::str::from_utf8(&literal.0) case_insensitive: bool,
.map_err(|e| ExpandRegexError::Utf8(e.to_string()))? ) -> Result<bool> {
.chars() const fn inverse_char(c: char) -> char {
.rev() match c {
{ 'a'..='z' => (c as u8 - b'a' + b'A') as char,
let char_set = CharacterSet::from_char(character); 'A'..='Z' => (c as u8 - b'A' + b'a') as char,
self.push_advance(char_set, next_state_id); c => c,
next_state_id = self.nfa.last_state_id(); }
} }
fn with_inverse_char(mut chars: CharacterSet) -> CharacterSet {
for char in chars.clone().chars() {
let inverted = inverse_char(char);
if char != inverted {
chars = chars.add_char(inverted);
}
}
chars
}
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(anyhow!("Regex error: Flags are not supported")),
Ast::Literal(literal) => {
let mut char_set = CharacterSet::from_char(literal.c);
if case_insensitive {
let inverted = inverse_char(literal.c);
if literal.c != inverted {
char_set = char_set.add_char(inverted);
}
}
self.push_advance(char_set, next_state_id);
Ok(true) Ok(true)
} }
HirKind::Class(class) => match class { Ast::Dot(_) => {
Class::Unicode(class) => { self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
let mut chars = CharacterSet::default(); Ok(true)
for c in class.ranges() { }
chars = chars.add_range(c.start(), c.end()); Ast::Assertion(_) => Err(anyhow!("Regex error: Assertions are not supported")),
} Ast::ClassUnicode(class) => {
let mut chars = self.expand_unicode_character_class(&class.kind)?;
// For some reason, the long s `ſ` is included if the letter `s` is in a if class.negated {
// pattern, so we remove it. chars = chars.negate();
if chars.range_count() == 3
&& chars
.ranges()
// exact check to ensure that `ſ` wasn't intentionally added.
.all(|r| ['s'..='s', 'S'..='S', 'ſ'..='ſ'].contains(&r))
{
chars = chars.difference(CharacterSet::from_char('ſ'));
}
self.push_advance(chars, next_state_id);
Ok(true)
} }
Class::Bytes(bytes_class) => { if case_insensitive {
let mut chars = CharacterSet::default(); chars = with_inverse_char(chars);
for c in bytes_class.ranges() {
chars = chars.add_range(c.start().into(), c.end().into());
}
self.push_advance(chars, next_state_id);
Ok(true)
} }
}, self.push_advance(chars, next_state_id);
HirKind::Look(_) => Err(ExpandRegexError::Assertion)?, Ok(true)
HirKind::Repetition(repetition) => match (repetition.min, repetition.max) { }
(0, Some(1)) => self.expand_zero_or_one(&repetition.sub, next_state_id), Ast::ClassPerl(class) => {
(1, None) => self.expand_one_or_more(&repetition.sub, next_state_id), let mut chars = self.expand_perl_character_class(&class.kind);
(0, None) => self.expand_zero_or_more(&repetition.sub, next_state_id), if class.negated {
(min, Some(max)) if min == max => { chars = chars.negate();
self.expand_count(&repetition.sub, min, next_state_id)
} }
(min, None) => { if case_insensitive {
if self.expand_zero_or_more(&repetition.sub, next_state_id)? { chars = with_inverse_char(chars);
self.expand_count(&repetition.sub, min, next_state_id) }
self.push_advance(chars, next_state_id);
Ok(true)
}
Ast::ClassBracketed(class) => {
let mut chars = self.translate_class_set(&class.kind)?;
if class.negated {
chars = chars.negate();
}
if case_insensitive {
chars = with_inverse_char(chars);
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id, case_insensitive)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id, case_insensitive)? {
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)
} else { } else {
Ok(false) Ok(false)
} }
} }
(min, Some(max)) => { RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.sub, min, next_state_id)?; let mut result =
self.expand_count(&repetition.ast, min, next_state_id, case_insensitive)?;
for _ in min..max { for _ in min..max {
if result { if result {
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
if self.expand_zero_or_one(&repetition.sub, next_state_id)? { if self.expand_zero_or_one(
&repetition.ast,
next_state_id,
case_insensitive,
)? {
result = true; result = true;
} }
} }
Ok(result) Ok(result)
} }
}, },
HirKind::Capture(capture) => self.expand_regex(&capture.sub, next_state_id), Ast::Group(group) => self.expand_regex(&group.ast, next_state_id, case_insensitive),
HirKind::Concat(concat) => { Ast::Alternation(alternation) => {
let mut result = false; let mut alternative_state_ids = Vec::new();
for hir in concat.iter().rev() { for ast in &alternation.asts {
if self.expand_regex(hir, next_state_id)? { if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
HirKind::Alternation(alternations) => {
let mut alternative_state_ids = Vec::with_capacity(alternations.len());
for hir in alternations {
if self.expand_regex(hir, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id()); alternative_state_ids.push(self.nfa.last_state_id());
} else { } else {
alternative_state_ids.push(next_state_id); alternative_state_ids.push(next_state_id);
@ -348,21 +335,58 @@ impl NfaBuilder {
alternative_state_ids.sort_unstable(); alternative_state_ids.sort_unstable();
alternative_state_ids.dedup(); alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id()); alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids { for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id); self.push_split(alternative_state_id);
} }
Ok(true) Ok(true)
} }
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
} }
} }
fn expand_one_or_more(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
match &class_set {
ClassSet::Item(item) => self.expand_character_class(item),
ClassSet::BinaryOp(binary_op) => {
let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
match binary_op.kind {
ClassSetBinaryOpKind::Intersection => {
Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
}
ClassSetBinaryOpKind::Difference => {
Ok(lhs_char_class.difference(rhs_char_class))
}
ClassSetBinaryOpKind::SymmetricDifference => {
Ok(lhs_char_class.symmetric_difference(rhs_char_class))
}
}
}
}
}
fn expand_one_or_more(
&mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
self.nfa.states.push(NfaState::Accept { self.nfa.states.push(NfaState::Accept {
variable_index: 0, variable_index: 0,
precedence: 0, precedence: 0,
}); // Placeholder for split }); // Placeholder for split
let split_state_id = self.nfa.last_state_id(); let split_state_id = self.nfa.last_state_id();
if self.expand_regex(hir, split_state_id)? { if self.expand_regex(ast, split_state_id, case_insensitive)? {
self.nfa.states[split_state_id as usize] = self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id); NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true) Ok(true)
@ -372,8 +396,13 @@ impl NfaBuilder {
} }
} }
fn expand_zero_or_one(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_zero_or_one(
if self.expand_regex(hir, next_state_id)? { &mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_regex(ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id); self.push_split(next_state_id);
Ok(true) Ok(true)
} else { } else {
@ -381,8 +410,13 @@ impl NfaBuilder {
} }
} }
fn expand_zero_or_more(&mut self, hir: &Hir, next_state_id: u32) -> ExpandRegexResult<bool> { fn expand_zero_or_more(
if self.expand_one_or_more(hir, next_state_id)? { &mut self,
ast: &Ast,
next_state_id: u32,
case_insensitive: bool,
) -> Result<bool> {
if self.expand_one_or_more(ast, next_state_id, case_insensitive)? {
self.push_split(next_state_id); self.push_split(next_state_id);
Ok(true) Ok(true)
} else { } else {
@ -392,13 +426,14 @@ impl NfaBuilder {
fn expand_count( fn expand_count(
&mut self, &mut self,
hir: &Hir, ast: &Ast,
count: u32, count: u32,
mut next_state_id: u32, mut next_state_id: u32,
) -> ExpandRegexResult<bool> { case_insensitive: bool,
) -> Result<bool> {
let mut result = false; let mut result = false;
for _ in 0..count { for _ in 0..count {
if self.expand_regex(hir, next_state_id)? { if self.expand_regex(ast, next_state_id, case_insensitive)? {
result = true; result = true;
next_state_id = self.nfa.last_state_id(); next_state_id = self.nfa.last_state_id();
} }
@ -406,6 +441,111 @@ impl NfaBuilder {
Ok(result) Ok(result)
} }
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::empty()),
ClassSetItem::Literal(literal) => Ok(CharacterSet::from_char(literal.c)),
ClassSetItem::Range(range) => Ok(CharacterSet::from_range(range.start.c, range.end.c)),
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
ClassSetItem::Unicode(class) => {
let mut set = self.expand_unicode_character_class(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
ClassSetItem::Bracketed(class) => {
let mut set = self.translate_class_set(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
ClassSetItem::Ascii(_) => Err(anyhow!(
"Regex error: Unsupported character class syntax {item:?}",
)),
}
}
fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
let mut chars = CharacterSet::empty();
let category_letter;
match class {
ClassUnicodeKind::OneLetter(le) => {
category_letter = le.to_string();
}
ClassUnicodeKind::Named(class_name) => {
let actual_class_name = UNICODE_CATEGORY_ALIASES
.get(class_name.as_str())
.or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str()))
.unwrap_or(class_name);
if actual_class_name.len() == 1 {
category_letter = actual_class_name.clone();
} else {
let code_points =
UNICODE_CATEGORIES
.get(actual_class_name.as_str())
.or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str()))
.ok_or_else(|| {
anyhow!(
"Regex error: Unsupported unicode character class {class_name}",
)
})?;
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
return Ok(chars);
}
}
ClassUnicodeKind::NamedValue { .. } => {
return Err(anyhow!(
"Regex error: Key-value unicode properties are not supported"
))
}
}
for (category, code_points) in UNICODE_CATEGORIES.iter() {
if category.starts_with(&category_letter) {
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
}
}
Ok(chars)
}
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n')
.add_char('\x0B')
.add_char('\x0C'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
}
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) { fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
let precedence = *self.precedence_stack.last().unwrap(); let precedence = *self.precedence_stack.last().unwrap();
self.nfa.states.push(NfaState::Advance { self.nfa.states.push(NfaState::Advance {
@ -427,10 +567,8 @@ impl NfaBuilder {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::grammars::Variable;
grammars::Variable, use crate::generate::nfa::{NfaCursor, NfaTransition};
nfa::{NfaCursor, NfaTransition},
};
fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> { fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
let start_states = grammar.variables.iter().map(|v| v.start_state).collect(); let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
@ -709,9 +847,11 @@ mod tests {
("\u{00df}", Some((3, "\u{00df}"))), ("\u{00df}", Some((3, "\u{00df}"))),
], ],
}, },
// allowing un-escaped curly braces
Row { Row {
rules: vec![ rules: vec![
Rule::pattern(r"u\{[0-9a-fA-F]+\}", ""), // Un-escaped curly braces
Rule::pattern(r"u{[0-9a-fA-F]+}", ""),
// Already-escaped curly braces // Already-escaped curly braces
Rule::pattern(r"\{[ab]{3}\}", ""), Rule::pattern(r"\{[ab]{3}\}", ""),
// Unicode codepoints // Unicode codepoints

View file

@ -1,7 +1,5 @@
use crate::{ use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
grammars::{LexicalGrammar, SyntaxGrammar}, use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
rules::{Alias, AliasMap, Symbol, SymbolType},
};
#[derive(Clone, Default)] #[derive(Clone, Default)]
struct SymbolStatus { struct SymbolStatus {
@ -16,8 +14,8 @@ struct SymbolStatus {
// This has two benefits: // This has two benefits:
// * It reduces the overhead of storing production-specific alias info in the parse table. // * It reduces the overhead of storing production-specific alias info in the parse table.
// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation // * Within an `ERROR` node, no context-specific aliases will be applied. This transformation
// ensures that the children of an `ERROR` node have symbols that are consistent with the way that // ensures that the children of an `ERROR` node have symbols that are consistent with the
// they would appear in a valid syntax tree. // way that they would appear in a valid syntax tree.
pub(super) fn extract_default_aliases( pub(super) fn extract_default_aliases(
syntax_grammar: &mut SyntaxGrammar, syntax_grammar: &mut SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
@ -69,7 +67,9 @@ pub(super) fn extract_default_aliases(
SymbolType::External => &mut external_status_list[symbol.index], SymbolType::External => &mut external_status_list[symbol.index],
SymbolType::NonTerminal => &mut non_terminal_status_list[symbol.index], SymbolType::NonTerminal => &mut non_terminal_status_list[symbol.index],
SymbolType::Terminal => &mut terminal_status_list[symbol.index], SymbolType::Terminal => &mut terminal_status_list[symbol.index],
SymbolType::End | SymbolType::EndOfNonTerminalExtra => panic!("Unexpected end token"), SymbolType::End | SymbolType::EndOfNonTerminalExtra => {
panic!("Unexpected end token")
}
}; };
status.appears_unaliased = true; status.appears_unaliased = true;
} }
@ -162,10 +162,10 @@ pub(super) fn extract_default_aliases(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::grammars::{
grammars::{LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType}, LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
nfa::Nfa,
}; };
use crate::generate::nfa::Nfa;
#[test] #[test]
fn test_extract_simple_aliases() { fn test_extract_simple_aliases() {

View file

@ -1,82 +1,35 @@
use std::collections::HashMap;
use serde::Serialize;
use thiserror::Error;
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::{ use crate::generate::grammars::{ExternalToken, Variable, VariableType};
grammars::{ExternalToken, ReservedWordContext, Variable, VariableType}, use crate::generate::rules::{MetadataParams, Rule, Symbol, SymbolType};
rules::{MetadataParams, Rule, Symbol, SymbolType}, use anyhow::{anyhow, Result};
}; use std::collections::HashMap;
use std::mem;
pub type ExtractTokensResult<T> = Result<T, ExtractTokensError>;
#[derive(Debug, Error, Serialize)]
pub enum ExtractTokensError {
#[error(
"The rule `{0}` contains an empty string.
Tree-sitter does not support syntactic rules that contain an empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error("Rule '{0}' cannot be used as both an external token and a non-terminal rule")]
ExternalTokenNonTerminal(String),
#[error("Non-symbol rules cannot be used as external tokens")]
NonSymbolExternalToken,
#[error(transparent)]
WordToken(NonTerminalWordTokenError),
#[error("Reserved word '{0}' must be a token")]
NonTokenReservedWord(String),
}
#[derive(Debug, Error, Serialize)]
pub struct NonTerminalWordTokenError {
pub symbol_name: String,
pub conflicting_symbol_name: Option<String>,
}
impl std::fmt::Display for NonTerminalWordTokenError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Non-terminal symbol '{}' cannot be used as the word token",
self.symbol_name
)?;
if let Some(conflicting_name) = &self.conflicting_symbol_name {
writeln!(
f,
", because its rule is duplicated in '{conflicting_name}'",
)
} else {
writeln!(f)
}
}
}
pub(super) fn extract_tokens( pub(super) fn extract_tokens(
mut grammar: InternedGrammar, mut grammar: InternedGrammar,
) -> ExtractTokensResult<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> { ) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> {
let mut extractor = TokenExtractor { let mut extractor = TokenExtractor {
current_variable_name: String::new(), current_variable_name: String::new(),
current_variable_token_count: 0, current_variable_token_count: 0,
is_first_rule: false,
extracted_variables: Vec::new(), extracted_variables: Vec::new(),
extracted_usage_counts: Vec::new(), extracted_usage_counts: Vec::new(),
}; };
for (i, variable) in &mut grammar.variables.iter_mut().enumerate() { for variable in &mut grammar.variables {
extractor.extract_tokens_in_variable(i == 0, variable)?; extractor.extract_tokens_in_variable(variable);
} }
for variable in &mut grammar.external_tokens { for variable in &mut grammar.external_tokens {
extractor.extract_tokens_in_variable(false, variable)?; extractor.extract_tokens_in_variable(variable);
} }
let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len()); let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len());
for variable in extractor.extracted_variables { for variable in extractor.extracted_variables {
lexical_variables.push(variable); lexical_variables.push(Variable {
name: variable.name,
kind: variable.kind,
rule: variable.rule,
});
} }
// If a variable's entire rule was extracted as a token and that token didn't // If a variable's entire rule was extracted as a token and that token didn't
@ -85,7 +38,7 @@ pub(super) fn extract_tokens(
// that pointed to that variable will need to be updated to point to the // that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables // variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented. // will need to have their indices decremented.
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::new();
let mut symbol_replacer = SymbolReplacer { let mut symbol_replacer = SymbolReplacer {
replacements: HashMap::new(), replacements: HashMap::new(),
}; };
@ -97,14 +50,10 @@ pub(super) fn extract_tokens(
{ {
if i > 0 && extractor.extracted_usage_counts[index] == 1 { if i > 0 && extractor.extracted_usage_counts[index] == 1 {
let lexical_variable = &mut lexical_variables[index]; let lexical_variable = &mut lexical_variables[index];
if lexical_variable.kind == VariableType::Auxiliary lexical_variable.kind = variable.kind;
|| variable.kind != VariableType::Hidden lexical_variable.name = variable.name;
{ symbol_replacer.replacements.insert(i, index);
lexical_variable.kind = variable.kind; continue;
lexical_variable.name = variable.name;
symbol_replacer.replacements.insert(i, index);
continue;
}
} }
} }
variables.push(variable); variables.push(variable);
@ -118,10 +67,10 @@ pub(super) fn extract_tokens(
.expected_conflicts .expected_conflicts
.into_iter() .into_iter()
.map(|conflict| { .map(|conflict| {
let mut result = conflict let mut result: Vec<_> = conflict
.iter() .iter()
.map(|symbol| symbol_replacer.replace_symbol(*symbol)) .map(|symbol| symbol_replacer.replace_symbol(*symbol))
.collect::<Vec<_>>(); .collect();
result.sort_unstable(); result.sort_unstable();
result.dedup(); result.dedup();
result result
@ -152,14 +101,15 @@ pub(super) fn extract_tokens(
} }
} }
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len()); let mut external_tokens = Vec::new();
for external_token in grammar.external_tokens { for external_token in grammar.external_tokens {
let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule); let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule);
if let Rule::Symbol(symbol) = rule { if let Rule::Symbol(symbol) = rule {
if symbol.is_non_terminal() { if symbol.is_non_terminal() {
Err(ExtractTokensError::ExternalTokenNonTerminal( return Err(anyhow!(
variables[symbol.index].name.clone(), "Rule '{}' cannot be used as both an external token and a non-terminal rule",
))?; &variables[symbol.index].name,
));
} }
if symbol.is_external() { if symbol.is_external() {
@ -176,59 +126,22 @@ pub(super) fn extract_tokens(
}); });
} }
} else { } else {
Err(ExtractTokensError::NonSymbolExternalToken)?; return Err(anyhow!(
"Non-symbol rules cannot be used as external tokens"
));
} }
} }
let word_token = if let Some(token) = grammar.word_token { let mut word_token = None;
if let Some(token) = grammar.word_token {
let token = symbol_replacer.replace_symbol(token); let token = symbol_replacer.replace_symbol(token);
if token.is_non_terminal() { if token.is_non_terminal() {
let word_token_variable = &variables[token.index]; return Err(anyhow!(
let conflicting_symbol_name = variables "Non-terminal symbol '{}' cannot be used as the word token",
.iter() &variables[token.index].name
.enumerate() ));
.find(|(i, v)| *i != token.index && v.rule == word_token_variable.rule)
.map(|(_, v)| v.name.clone());
Err(ExtractTokensError::WordToken(NonTerminalWordTokenError {
symbol_name: word_token_variable.name.clone(),
conflicting_symbol_name,
}))?;
} }
Some(token) word_token = Some(token);
} else {
None
};
let mut reserved_word_contexts = Vec::with_capacity(grammar.reserved_word_sets.len());
for reserved_word_context in grammar.reserved_word_sets {
let mut reserved_words = Vec::with_capacity(reserved_word_contexts.len());
for reserved_rule in reserved_word_context.reserved_words {
if let Rule::Symbol(symbol) = reserved_rule {
reserved_words.push(symbol_replacer.replace_symbol(symbol));
} else if let Some(index) = lexical_variables
.iter()
.position(|v| v.rule == reserved_rule)
{
reserved_words.push(Symbol::terminal(index));
} else {
let rule = if let Rule::Metadata { rule, .. } = &reserved_rule {
rule.as_ref()
} else {
&reserved_rule
};
let token_name = match rule {
Rule::String(s) => s.clone(),
Rule::Pattern(p, _) => p.clone(),
_ => "unknown".to_string(),
};
Err(ExtractTokensError::NonTokenReservedWord(token_name))?;
}
}
reserved_word_contexts.push(ReservedWordContext {
name: reserved_word_context.name,
reserved_words,
});
} }
Ok(( Ok((
@ -241,7 +154,6 @@ pub(super) fn extract_tokens(
external_tokens, external_tokens,
word_token, word_token,
precedence_orderings: grammar.precedence_orderings, precedence_orderings: grammar.precedence_orderings,
reserved_word_sets: reserved_word_contexts,
}, },
ExtractedLexicalGrammar { ExtractedLexicalGrammar {
variables: lexical_variables, variables: lexical_variables,
@ -253,7 +165,6 @@ pub(super) fn extract_tokens(
struct TokenExtractor { struct TokenExtractor {
current_variable_name: String, current_variable_name: String,
current_variable_token_count: usize, current_variable_token_count: usize,
is_first_rule: bool,
extracted_variables: Vec<Variable>, extracted_variables: Vec<Variable>,
extracted_usage_counts: Vec<usize>, extracted_usage_counts: Vec<usize>,
} }
@ -263,33 +174,28 @@ struct SymbolReplacer {
} }
impl TokenExtractor { impl TokenExtractor {
fn extract_tokens_in_variable( fn extract_tokens_in_variable(&mut self, variable: &mut Variable) {
&mut self,
is_first: bool,
variable: &mut Variable,
) -> ExtractTokensResult<()> {
self.current_variable_name.clear(); self.current_variable_name.clear();
self.current_variable_name.push_str(&variable.name); self.current_variable_name.push_str(&variable.name);
self.current_variable_token_count = 0; self.current_variable_token_count = 0;
self.is_first_rule = is_first; let mut rule = Rule::Blank;
variable.rule = self.extract_tokens_in_rule(&variable.rule)?; mem::swap(&mut rule, &mut variable.rule);
Ok(()) variable.rule = self.extract_tokens_in_rule(&rule);
} }
fn extract_tokens_in_rule(&mut self, input: &Rule) -> ExtractTokensResult<Rule> { fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule {
match input { match input {
Rule::String(name) => Ok(self.extract_token(input, Some(name))?.into()), Rule::String(name) => self.extract_token(input, Some(name)).into(),
Rule::Pattern(..) => Ok(self.extract_token(input, None)?.into()), Rule::Pattern(..) => self.extract_token(input, None).into(),
Rule::Metadata { params, rule } => { Rule::Metadata { params, rule } => {
if params.is_token { if params.is_token {
let mut params = params.clone(); let mut params = params.clone();
params.is_token = false; params.is_token = false;
let string_value = if let Rule::String(value) = rule.as_ref() { let mut string_value = None;
Some(value) if let Rule::String(value) = rule.as_ref() {
} else { string_value = Some(value);
None }
};
let rule_to_extract = if params == MetadataParams::default() { let rule_to_extract = if params == MetadataParams::default() {
rule.as_ref() rule.as_ref()
@ -297,56 +203,41 @@ impl TokenExtractor {
input input
}; };
Ok(self.extract_token(rule_to_extract, string_value)?.into()) self.extract_token(rule_to_extract, string_value).into()
} else { } else {
Ok(Rule::Metadata { Rule::Metadata {
params: params.clone(), params: params.clone(),
rule: Box::new(self.extract_tokens_in_rule(rule)?), rule: Box::new(self.extract_tokens_in_rule(rule)),
}) }
} }
} }
Rule::Repeat(content) => Ok(Rule::Repeat(Box::new( Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))),
self.extract_tokens_in_rule(content)?, Rule::Seq(elements) => Rule::Seq(
))),
Rule::Seq(elements) => Ok(Rule::Seq(
elements elements
.iter() .iter()
.map(|e| self.extract_tokens_in_rule(e)) .map(|e| self.extract_tokens_in_rule(e))
.collect::<ExtractTokensResult<Vec<_>>>()?, .collect(),
)), ),
Rule::Choice(elements) => Ok(Rule::Choice( Rule::Choice(elements) => Rule::Choice(
elements elements
.iter() .iter()
.map(|e| self.extract_tokens_in_rule(e)) .map(|e| self.extract_tokens_in_rule(e))
.collect::<ExtractTokensResult<Vec<_>>>()?, .collect(),
)), ),
Rule::Reserved { rule, context_name } => Ok(Rule::Reserved { _ => input.clone(),
rule: Box::new(self.extract_tokens_in_rule(rule)?),
context_name: context_name.clone(),
}),
_ => Ok(input.clone()),
} }
} }
fn extract_token( fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol {
&mut self,
rule: &Rule,
string_value: Option<&String>,
) -> ExtractTokensResult<Symbol> {
for (i, variable) in self.extracted_variables.iter_mut().enumerate() { for (i, variable) in self.extracted_variables.iter_mut().enumerate() {
if variable.rule == *rule { if variable.rule == *rule {
self.extracted_usage_counts[i] += 1; self.extracted_usage_counts[i] += 1;
return Ok(Symbol::terminal(i)); return Symbol::terminal(i);
} }
} }
let index = self.extracted_variables.len(); let index = self.extracted_variables.len();
let variable = if let Some(string_value) = string_value { let variable = if let Some(string_value) = string_value {
if string_value.is_empty() && !self.is_first_rule {
Err(ExtractTokensError::EmptyString(
self.current_variable_name.clone(),
))?;
}
Variable { Variable {
name: string_value.clone(), name: string_value.clone(),
kind: VariableType::Anonymous, kind: VariableType::Anonymous,
@ -357,7 +248,7 @@ impl TokenExtractor {
Variable { Variable {
name: format!( name: format!(
"{}_token{}", "{}_token{}",
self.current_variable_name, self.current_variable_token_count &self.current_variable_name, self.current_variable_token_count
), ),
kind: VariableType::Auxiliary, kind: VariableType::Auxiliary,
rule: rule.clone(), rule: rule.clone(),
@ -366,7 +257,7 @@ impl TokenExtractor {
self.extracted_variables.push(variable); self.extracted_variables.push(variable);
self.extracted_usage_counts.push(1); self.extracted_usage_counts.push(1);
Ok(Symbol::terminal(index)) Symbol::terminal(index)
} }
} }
@ -391,10 +282,6 @@ impl SymbolReplacer {
params: params.clone(), params: params.clone(),
rule: Box::new(self.replace_symbols_in_rule(rule)), rule: Box::new(self.replace_symbols_in_rule(rule)),
}, },
Rule::Reserved { rule, context_name } => Rule::Reserved {
rule: Box::new(self.replace_symbols_in_rule(rule)),
context_name: context_name.clone(),
},
_ => rule.clone(), _ => rule.clone(),
} }
} }
@ -422,6 +309,7 @@ impl SymbolReplacer {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
use crate::generate::grammars::VariableType;
#[test] #[test]
fn test_extraction() { fn test_extraction() {
@ -590,48 +478,14 @@ mod test {
]); ]);
grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))]; grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))];
let result = extract_tokens(grammar); match extract_tokens(grammar) {
assert!(result.is_err(), "Expected an error but got no error"); Err(e) => {
let err = result.err().unwrap(); assert_eq!(e.to_string(), "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule");
assert_eq!( }
err.to_string(), _ => {
"Rule 'rule_1' cannot be used as both an external token and a non-terminal rule" panic!("Expected an error but got no error");
); }
} }
#[test]
fn test_extraction_on_hidden_terminal() {
let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::hidden("_rule_1", Rule::string("a")),
]))
.unwrap();
// The rule `_rule_1` should not "absorb" the
// terminal "a", since it is hidden,
// so we expect two variables still
assert_eq!(
syntax_grammar.variables,
vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::hidden("_rule_1", Rule::terminal(0)),
]
);
// We should not have a hidden rule in our lexical grammar, only the terminal "a"
assert_eq!(
lexical_grammar.variables,
vec![Variable::anonymous("a", Rule::string("a"))]
);
}
#[test]
fn test_extraction_with_empty_string() {
assert!(extract_tokens(build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::hidden("_rule_1", Rule::string("")),
]))
.is_err());
} }
fn build_grammar(variables: Vec<Variable>) -> InternedGrammar { fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {

View file

@ -1,96 +1,46 @@
use std::collections::HashMap;
use serde::Serialize;
use thiserror::Error;
use super::ExtractedSyntaxGrammar; use super::ExtractedSyntaxGrammar;
use crate::{ use crate::generate::grammars::{
grammars::{ Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable,
Production, ProductionStep, ReservedWordSetId, SyntaxGrammar, SyntaxVariable, Variable,
},
rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet},
}; };
use crate::generate::rules::{Alias, Associativity, Precedence, Rule, Symbol};
pub type FlattenGrammarResult<T> = Result<T, FlattenGrammarError>; use anyhow::{anyhow, Result};
#[derive(Debug, Error, Serialize)]
pub enum FlattenGrammarError {
#[error("No such reserved word set: {0}")]
NoReservedWordSet(String),
#[error(
"The rule `{0}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"
)]
EmptyString(String),
#[error("Rule `{0}` cannot be inlined because it contains a reference to itself")]
RecursiveInline(String),
}
struct RuleFlattener { struct RuleFlattener {
production: Production, production: Production,
reserved_word_set_ids: HashMap<String, ReservedWordSetId>,
precedence_stack: Vec<Precedence>, precedence_stack: Vec<Precedence>,
associativity_stack: Vec<Associativity>, associativity_stack: Vec<Associativity>,
reserved_word_stack: Vec<ReservedWordSetId>,
alias_stack: Vec<Alias>, alias_stack: Vec<Alias>,
field_name_stack: Vec<String>, field_name_stack: Vec<String>,
} }
impl RuleFlattener { impl RuleFlattener {
const fn new(reserved_word_set_ids: HashMap<String, ReservedWordSetId>) -> Self { fn new() -> Self {
Self { Self {
production: Production { production: Production {
steps: Vec::new(), steps: Vec::new(),
dynamic_precedence: 0, dynamic_precedence: 0,
}, },
reserved_word_set_ids,
precedence_stack: Vec::new(), precedence_stack: Vec::new(),
associativity_stack: Vec::new(), associativity_stack: Vec::new(),
reserved_word_stack: Vec::new(),
alias_stack: Vec::new(), alias_stack: Vec::new(),
field_name_stack: Vec::new(), field_name_stack: Vec::new(),
} }
} }
fn flatten_variable(&mut self, variable: Variable) -> FlattenGrammarResult<SyntaxVariable> { fn flatten(mut self, rule: Rule) -> Production {
let choices = extract_choices(variable.rule); self.apply(rule, true);
let mut productions = Vec::with_capacity(choices.len()); self.production
for rule in choices {
let production = self.flatten_rule(rule)?;
if !productions.contains(&production) {
productions.push(production);
}
}
Ok(SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
})
} }
fn flatten_rule(&mut self, rule: Rule) -> FlattenGrammarResult<Production> { fn apply(&mut self, rule: Rule, at_end: bool) -> bool {
self.production = Production::default();
self.alias_stack.clear();
self.reserved_word_stack.clear();
self.precedence_stack.clear();
self.associativity_stack.clear();
self.field_name_stack.clear();
self.apply(rule, true)?;
Ok(self.production.clone())
}
fn apply(&mut self, rule: Rule, at_end: bool) -> FlattenGrammarResult<bool> {
match rule { match rule {
Rule::Seq(members) => { Rule::Seq(members) => {
let mut result = false; let mut result = false;
let last_index = members.len() - 1; let last_index = members.len() - 1;
for (i, member) in members.into_iter().enumerate() { for (i, member) in members.into_iter().enumerate() {
result |= self.apply(member, i == last_index && at_end)?; result |= self.apply(member, i == last_index && at_end);
} }
Ok(result) result
} }
Rule::Metadata { rule, params } => { Rule::Metadata { rule, params } => {
let mut has_precedence = false; let mut has_precedence = false;
@ -121,7 +71,7 @@ impl RuleFlattener {
self.production.dynamic_precedence = params.dynamic_precedence; self.production.dynamic_precedence = params.dynamic_precedence;
} }
let did_push = self.apply(*rule, at_end)?; let did_push = self.apply(*rule, at_end);
if has_precedence { if has_precedence {
self.precedence_stack.pop(); self.precedence_stack.pop();
@ -150,20 +100,7 @@ impl RuleFlattener {
self.field_name_stack.pop(); self.field_name_stack.pop();
} }
Ok(did_push) did_push
}
Rule::Reserved { rule, context_name } => {
self.reserved_word_stack.push(
self.reserved_word_set_ids
.get(&context_name)
.copied()
.ok_or_else(|| {
FlattenGrammarError::NoReservedWordSet(context_name.clone())
})?,
);
let did_push = self.apply(*rule, at_end)?;
self.reserved_word_stack.pop();
Ok(did_push)
} }
Rule::Symbol(symbol) => { Rule::Symbol(symbol) => {
self.production.steps.push(ProductionStep { self.production.steps.push(ProductionStep {
@ -174,17 +111,12 @@ impl RuleFlattener {
.cloned() .cloned()
.unwrap_or(Precedence::None), .unwrap_or(Precedence::None),
associativity: self.associativity_stack.last().copied(), associativity: self.associativity_stack.last().copied(),
reserved_word_set_id: self
.reserved_word_stack
.last()
.copied()
.unwrap_or(ReservedWordSetId::default()),
alias: self.alias_stack.last().cloned(), alias: self.alias_stack.last().cloned(),
field_name: self.field_name_stack.last().cloned(), field_name: self.field_name_stack.last().cloned(),
}); });
Ok(true) true
} }
_ => Ok(false), _ => false,
} }
} }
} }
@ -195,7 +127,7 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
let mut result = vec![Rule::Blank]; let mut result = vec![Rule::Blank];
for element in elements { for element in elements {
let extraction = extract_choices(element); let extraction = extract_choices(element);
let mut next_result = Vec::with_capacity(result.len()); let mut next_result = Vec::new();
for entry in result { for entry in result {
for extraction_entry in &extraction { for extraction_entry in &extraction {
next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()])); next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()]));
@ -206,7 +138,7 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
result result
} }
Rule::Choice(elements) => { Rule::Choice(elements) => {
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::new();
for element in elements { for element in elements {
for rule in extract_choices(element) { for rule in extract_choices(element) {
result.push(rule); result.push(rule);
@ -221,17 +153,25 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
params: params.clone(), params: params.clone(),
}) })
.collect(), .collect(),
Rule::Reserved { rule, context_name } => extract_choices(*rule)
.into_iter()
.map(|rule| Rule::Reserved {
rule: Box::new(rule),
context_name: context_name.clone(),
})
.collect(),
_ => vec![rule], _ => vec![rule],
} }
} }
fn flatten_variable(variable: Variable) -> SyntaxVariable {
let mut productions = Vec::new();
for rule in extract_choices(variable.rule) {
let production = RuleFlattener::new().flatten(rule);
if !productions.contains(&production) {
productions.push(production);
}
}
SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
}
}
fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool { fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
for variable in variables { for variable in variables {
for production in &variable.productions { for production in &variable.productions {
@ -245,48 +185,25 @@ fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
false false
} }
pub(super) fn flatten_grammar( pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
grammar: ExtractedSyntaxGrammar, let mut variables = Vec::new();
) -> FlattenGrammarResult<SyntaxGrammar> { for variable in grammar.variables {
let mut reserved_word_set_ids_by_name = HashMap::new(); variables.push(flatten_variable(variable));
for (ix, set) in grammar.reserved_word_sets.iter().enumerate() {
reserved_word_set_ids_by_name.insert(set.name.clone(), ReservedWordSetId(ix));
} }
let mut flattener = RuleFlattener::new(reserved_word_set_ids_by_name);
let variables = grammar
.variables
.into_iter()
.map(|variable| flattener.flatten_variable(variable))
.collect::<FlattenGrammarResult<Vec<_>>>()?;
for (i, variable) in variables.iter().enumerate() { for (i, variable) in variables.iter().enumerate() {
let symbol = Symbol::non_terminal(i);
let used = symbol_is_used(&variables, symbol);
for production in &variable.productions { for production in &variable.productions {
if used && production.steps.is_empty() { if production.steps.is_empty() && symbol_is_used(&variables, Symbol::non_terminal(i)) {
Err(FlattenGrammarError::EmptyString(variable.name.clone()))?; return Err(anyhow!(
} "The rule `{}` matches the empty string.
if grammar.variables_to_inline.contains(&symbol) Tree-sitter does not support syntactic rules that match the empty string
&& production.steps.iter().any(|step| step.symbol == symbol) unless they are used only as the grammar's start rule.
{ ",
Err(FlattenGrammarError::RecursiveInline(variable.name.clone()))?; variable.name
));
} }
} }
} }
let mut reserved_word_sets = grammar
.reserved_word_sets
.into_iter()
.map(|set| set.reserved_words.into_iter().collect())
.collect::<Vec<_>>();
// If no default reserved word set is specified, there are no reserved words.
if reserved_word_sets.is_empty() {
reserved_word_sets.push(TokenSet::default());
}
Ok(SyntaxGrammar { Ok(SyntaxGrammar {
extra_symbols: grammar.extra_symbols, extra_symbols: grammar.extra_symbols,
expected_conflicts: grammar.expected_conflicts, expected_conflicts: grammar.expected_conflicts,
@ -295,7 +212,6 @@ pub(super) fn flatten_grammar(
external_tokens: grammar.external_tokens, external_tokens: grammar.external_tokens,
supertype_symbols: grammar.supertype_symbols, supertype_symbols: grammar.supertype_symbols,
word_token: grammar.word_token, word_token: grammar.word_token,
reserved_word_sets,
variables, variables,
}) })
} }
@ -303,35 +219,33 @@ pub(super) fn flatten_grammar(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::grammars::VariableType; use crate::generate::grammars::VariableType;
use crate::generate::rules::Symbol;
#[test] #[test]
fn test_flatten_grammar() { fn test_flatten_grammar() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::non_terminal(1),
rule: Rule::seq(vec![ Rule::prec_left(
Rule::non_terminal(1), Precedence::Integer(101),
Rule::prec_left( Rule::seq(vec![
Precedence::Integer(101), Rule::non_terminal(2),
Rule::seq(vec![ Rule::choice(vec![
Rule::non_terminal(2), Rule::prec_right(
Rule::choice(vec![ Precedence::Integer(102),
Rule::prec_right( Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
Precedence::Integer(102), ),
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), Rule::non_terminal(5),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]), ]),
), Rule::non_terminal(6),
Rule::non_terminal(7), ]),
]), ),
}) Rule::non_terminal(7),
.unwrap(); ]),
});
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -368,31 +282,28 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_maximum_dynamic_precedence() { fn test_flatten_grammar_with_maximum_dynamic_precedence() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::non_terminal(1),
rule: Rule::seq(vec![ Rule::prec_dynamic(
Rule::non_terminal(1), 101,
Rule::prec_dynamic( Rule::seq(vec![
101, Rule::non_terminal(2),
Rule::seq(vec![ Rule::choice(vec![
Rule::non_terminal(2), Rule::prec_dynamic(
Rule::choice(vec![ 102,
Rule::prec_dynamic( Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
102, ),
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]), Rule::non_terminal(5),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]), ]),
), Rule::non_terminal(6),
Rule::non_terminal(7), ]),
]), ),
}) Rule::non_terminal(7),
.unwrap(); ]),
});
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -424,17 +335,14 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_final_precedence() { fn test_flatten_grammar_with_final_precedence() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::prec_left(
kind: VariableType::Named, Precedence::Integer(101),
rule: Rule::prec_left( Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
Precedence::Integer(101), ),
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]), });
),
})
.unwrap();
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -449,16 +357,14 @@ mod tests {
}] }]
); );
let result = flattener let result = flatten_variable(Variable {
.flatten_variable(Variable { name: "test".to_string(),
name: "test".to_string(), kind: VariableType::Named,
kind: VariableType::Named, rule: Rule::prec_left(
rule: Rule::prec_left( Precedence::Integer(101),
Precedence::Integer(101), Rule::seq(vec![Rule::non_terminal(1)]),
Rule::seq(vec![Rule::non_terminal(1)]), ),
), });
})
.unwrap();
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -472,21 +378,18 @@ mod tests {
#[test] #[test]
fn test_flatten_grammar_with_field_names() { fn test_flatten_grammar_with_field_names() {
let mut flattener = RuleFlattener::new(HashMap::default()); let result = flatten_variable(Variable {
let result = flattener name: "test".to_string(),
.flatten_variable(Variable { kind: VariableType::Named,
name: "test".to_string(), rule: Rule::seq(vec![
kind: VariableType::Named, Rule::field("first-thing".to_string(), Rule::terminal(1)),
rule: Rule::seq(vec![ Rule::terminal(2),
Rule::field("first-thing".to_string(), Rule::terminal(1)), Rule::choice(vec![
Rule::terminal(2), Rule::Blank,
Rule::choice(vec![ Rule::field("second-thing".to_string(), Rule::terminal(3)),
Rule::Blank,
Rule::field("second-thing".to_string(), Rule::terminal(3)),
]),
]), ]),
}) ]),
.unwrap(); });
assert_eq!( assert_eq!(
result.productions, result.productions,
@ -509,32 +412,4 @@ mod tests {
] ]
); );
} }
#[test]
fn test_flatten_grammar_with_recursive_inline_variable() {
let result = flatten_grammar(ExtractedSyntaxGrammar {
extra_symbols: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: vec![Symbol::non_terminal(0)],
precedence_orderings: Vec::new(),
external_tokens: Vec::new(),
supertype_symbols: Vec::new(),
word_token: None,
reserved_word_sets: Vec::new(),
variables: vec![Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::non_terminal(0),
Rule::non_terminal(1),
Rule::non_terminal(2),
]),
}],
});
assert_eq!(
result.unwrap_err().to_string(),
"Rule `test` cannot be inlined because it contains a reference to itself",
);
}
} }

View file

@ -1,34 +1,13 @@
use log::warn;
use serde::Serialize;
use thiserror::Error;
use super::InternedGrammar; use super::InternedGrammar;
use crate::{ use crate::generate::grammars::{InputGrammar, Variable, VariableType};
grammars::{InputGrammar, ReservedWordContext, Variable, VariableType}, use crate::generate::rules::{Rule, Symbol};
rules::{Rule, Symbol}, use anyhow::{anyhow, Result};
};
pub type InternSymbolsResult<T> = Result<T, InternSymbolsError>; pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar> {
#[derive(Debug, Error, Serialize)]
pub enum InternSymbolsError {
#[error("A grammar's start rule must be visible.")]
HiddenStartRule,
#[error("Undefined symbol `{0}`")]
Undefined(String),
#[error("Undefined symbol `{0}` in grammar's supertypes array")]
UndefinedSupertype(String),
#[error("Undefined symbol `{0}` in grammar's conflicts array")]
UndefinedConflict(String),
#[error("Undefined symbol `{0}` as grammar's word token")]
UndefinedWordToken(String),
}
pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<InternedGrammar> {
let interner = Interner { grammar }; let interner = Interner { grammar };
if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden { if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden {
Err(InternSymbolsError::HiddenStartRule)?; return Err(anyhow!("A grammar's start rule must be visible."));
} }
let mut variables = Vec::with_capacity(grammar.variables.len()); let mut variables = Vec::with_capacity(grammar.variables.len());
@ -36,13 +15,13 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
variables.push(Variable { variables.push(Variable {
name: variable.name.clone(), name: variable.name.clone(),
kind: variable_type_for_name(&variable.name), kind: variable_type_for_name(&variable.name),
rule: interner.intern_rule(&variable.rule, Some(&variable.name))?, rule: interner.intern_rule(&variable.rule)?,
}); });
} }
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len()); let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len());
for external_token in &grammar.external_tokens { for external_token in &grammar.external_tokens {
let rule = interner.intern_rule(external_token, None)?; let rule = interner.intern_rule(external_token)?;
let (name, kind) = if let Rule::NamedSymbol(name) = external_token { let (name, kind) = if let Rule::NamedSymbol(name) = external_token {
(name.clone(), variable_type_for_name(name)) (name.clone(), variable_type_for_name(name))
} else { } else {
@ -53,36 +32,26 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len()); let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
for extra_token in &grammar.extra_symbols { for extra_token in &grammar.extra_symbols {
extra_symbols.push(interner.intern_rule(extra_token, None)?); extra_symbols.push(interner.intern_rule(extra_token)?);
} }
let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len()); let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
for supertype_symbol_name in &grammar.supertype_symbols { for supertype_symbol_name in &grammar.supertype_symbols {
supertype_symbols.push(interner.intern_name(supertype_symbol_name).ok_or_else(|| { supertype_symbols.push(
InternSymbolsError::UndefinedSupertype(supertype_symbol_name.clone()) interner
})?); .intern_name(supertype_symbol_name)
.ok_or_else(|| anyhow!("Undefined symbol `{supertype_symbol_name}`"))?,
);
} }
let mut reserved_words = Vec::with_capacity(grammar.reserved_words.len()); let mut expected_conflicts = Vec::new();
for reserved_word_set in &grammar.reserved_words {
let mut interned_set = Vec::with_capacity(reserved_word_set.reserved_words.len());
for rule in &reserved_word_set.reserved_words {
interned_set.push(interner.intern_rule(rule, None)?);
}
reserved_words.push(ReservedWordContext {
name: reserved_word_set.name.clone(),
reserved_words: interned_set,
});
}
let mut expected_conflicts = Vec::with_capacity(grammar.expected_conflicts.len());
for conflict in &grammar.expected_conflicts { for conflict in &grammar.expected_conflicts {
let mut interned_conflict = Vec::with_capacity(conflict.len()); let mut interned_conflict = Vec::with_capacity(conflict.len());
for name in conflict { for name in conflict {
interned_conflict.push( interned_conflict.push(
interner interner
.intern_name(name) .intern_name(name)
.ok_or_else(|| InternSymbolsError::UndefinedConflict(name.clone()))?, .ok_or_else(|| anyhow!("Undefined symbol `{name}`"))?,
); );
} }
expected_conflicts.push(interned_conflict); expected_conflicts.push(interned_conflict);
@ -95,15 +64,14 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
} }
} }
let word_token = if let Some(name) = grammar.word_token.as_ref() { let mut word_token = None;
Some( if let Some(name) = grammar.word_token.as_ref() {
word_token = Some(
interner interner
.intern_name(name) .intern_name(name)
.ok_or_else(|| InternSymbolsError::UndefinedWordToken(name.clone()))?, .ok_or_else(|| anyhow!("Undefined symbol `{name}`"))?,
) );
} else { }
None
};
for (i, variable) in variables.iter_mut().enumerate() { for (i, variable) in variables.iter_mut().enumerate() {
if supertype_symbols.contains(&Symbol::non_terminal(i)) { if supertype_symbols.contains(&Symbol::non_terminal(i)) {
@ -120,7 +88,6 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> InternSymbolsResult<Inte
supertype_symbols, supertype_symbols,
word_token, word_token,
precedence_orderings: grammar.precedence_orderings.clone(), precedence_orderings: grammar.precedence_orderings.clone(),
reserved_word_sets: reserved_words,
}) })
} }
@ -128,38 +95,34 @@ struct Interner<'a> {
grammar: &'a InputGrammar, grammar: &'a InputGrammar,
} }
impl Interner<'_> { impl<'a> Interner<'a> {
fn intern_rule(&self, rule: &Rule, name: Option<&str>) -> InternSymbolsResult<Rule> { fn intern_rule(&self, rule: &Rule) -> Result<Rule> {
match rule { match rule {
Rule::Choice(elements) => { Rule::Choice(elements) => {
self.check_single(elements, name, "choice");
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::with_capacity(elements.len());
for element in elements { for element in elements {
result.push(self.intern_rule(element, name)?); result.push(self.intern_rule(element)?);
} }
Ok(Rule::Choice(result)) Ok(Rule::Choice(result))
} }
Rule::Seq(elements) => { Rule::Seq(elements) => {
self.check_single(elements, name, "seq");
let mut result = Vec::with_capacity(elements.len()); let mut result = Vec::with_capacity(elements.len());
for element in elements { for element in elements {
result.push(self.intern_rule(element, name)?); result.push(self.intern_rule(element)?);
} }
Ok(Rule::Seq(result)) Ok(Rule::Seq(result))
} }
Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content, name)?))), Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))),
Rule::Metadata { rule, params } => Ok(Rule::Metadata { Rule::Metadata { rule, params } => Ok(Rule::Metadata {
rule: Box::new(self.intern_rule(rule, name)?), rule: Box::new(self.intern_rule(rule)?),
params: params.clone(), params: params.clone(),
}), }),
Rule::Reserved { rule, context_name } => Ok(Rule::Reserved {
rule: Box::new(self.intern_rule(rule, name)?),
context_name: context_name.clone(),
}),
Rule::NamedSymbol(name) => self.intern_name(name).map_or_else( Rule::NamedSymbol(name) => self.intern_name(name).map_or_else(
|| Err(InternSymbolsError::Undefined(name.clone())), || Err(anyhow!("Undefined symbol `{name}`")),
|symbol| Ok(Rule::Symbol(symbol)), |symbol| Ok(Rule::Symbol(symbol)),
), ),
_ => Ok(rule.clone()), _ => Ok(rule.clone()),
} }
} }
@ -181,17 +144,6 @@ impl Interner<'_> {
None None
} }
// In the case of a seq or choice rule of 1 element in a hidden rule, weird
// inconsistent behavior with queries can occur. So we should warn the user about it.
fn check_single(&self, elements: &[Rule], name: Option<&str>, kind: &str) {
if elements.len() == 1 && matches!(elements[0], Rule::String(_) | Rule::Pattern(_, _)) {
warn!(
"rule {} contains a `{kind}` rule with a single element. This is unnecessary.",
name.unwrap_or_default()
);
}
}
} }
fn variable_type_for_name(name: &str) -> VariableType { fn variable_type_for_name(name: &str) -> VariableType {
@ -278,9 +230,10 @@ mod tests {
fn test_grammar_with_undefined_symbols() { fn test_grammar_with_undefined_symbols() {
let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))])); let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))]));
assert!(result.is_err(), "Expected an error but got none"); match result {
let e = result.err().unwrap(); Err(e) => assert_eq!(e.to_string(), "Undefined symbol `y`"),
assert_eq!(e.to_string(), "Undefined symbol `y`"); _ => panic!("Expected an error but got none"),
}
} }
fn build_grammar(variables: Vec<Variable>) -> InputGrammar { fn build_grammar(variables: Vec<Variable>) -> InputGrammar {

View file

@ -6,36 +6,26 @@ mod flatten_grammar;
mod intern_symbols; mod intern_symbols;
mod process_inlines; mod process_inlines;
pub use self::expand_tokens::expand_tokens;
use self::expand_repeats::expand_repeats;
use self::extract_default_aliases::extract_default_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
use self::process_inlines::process_inlines;
use super::grammars::{
ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, PrecedenceEntry,
SyntaxGrammar, Variable,
};
use super::rules::{AliasMap, Precedence, Rule, Symbol};
use anyhow::{anyhow, Result};
use std::{ use std::{
cmp::Ordering, cmp::Ordering,
collections::{hash_map, BTreeSet, HashMap, HashSet}, collections::{hash_map, HashMap, HashSet},
mem, mem,
}; };
pub use expand_tokens::ExpandTokensError;
pub use extract_tokens::ExtractTokensError;
pub use flatten_grammar::FlattenGrammarError;
use indexmap::IndexMap;
pub use intern_symbols::InternSymbolsError;
pub use process_inlines::ProcessInlinesError;
use serde::Serialize;
use thiserror::Error;
pub use self::expand_tokens::expand_tokens;
use self::{
expand_repeats::expand_repeats, extract_default_aliases::extract_default_aliases,
extract_tokens::extract_tokens, flatten_grammar::flatten_grammar,
intern_symbols::intern_symbols, process_inlines::process_inlines,
};
use super::{
grammars::{
ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, PrecedenceEntry,
SyntaxGrammar, Variable,
},
rules::{AliasMap, Precedence, Rule, Symbol},
};
use crate::grammars::ReservedWordContext;
pub struct IntermediateGrammar<T, U> { pub struct IntermediateGrammar<T, U> {
variables: Vec<Variable>, variables: Vec<Variable>,
extra_symbols: Vec<T>, extra_symbols: Vec<T>,
@ -45,7 +35,6 @@ pub struct IntermediateGrammar<T, U> {
variables_to_inline: Vec<Symbol>, variables_to_inline: Vec<Symbol>,
supertype_symbols: Vec<Symbol>, supertype_symbols: Vec<Symbol>,
word_token: Option<Symbol>, word_token: Option<Symbol>,
reserved_word_sets: Vec<ReservedWordContext<T>>,
} }
pub type InternedGrammar = IntermediateGrammar<Rule, Variable>; pub type InternedGrammar = IntermediateGrammar<Rule, Variable>;
@ -69,96 +58,21 @@ impl<T, U> Default for IntermediateGrammar<T, U> {
variables_to_inline: Vec::default(), variables_to_inline: Vec::default(),
supertype_symbols: Vec::default(), supertype_symbols: Vec::default(),
word_token: Option::default(), word_token: Option::default(),
reserved_word_sets: Vec::default(),
} }
} }
} }
pub type PrepareGrammarResult<T> = Result<T, PrepareGrammarError>;
#[derive(Debug, Error, Serialize)]
#[error(transparent)]
pub enum PrepareGrammarError {
ValidatePrecedences(#[from] ValidatePrecedenceError),
ValidateIndirectRecursion(#[from] IndirectRecursionError),
InternSymbols(#[from] InternSymbolsError),
ExtractTokens(#[from] ExtractTokensError),
FlattenGrammar(#[from] FlattenGrammarError),
ExpandTokens(#[from] ExpandTokensError),
ProcessInlines(#[from] ProcessInlinesError),
}
pub type ValidatePrecedenceResult<T> = Result<T, ValidatePrecedenceError>;
#[derive(Debug, Error, Serialize)]
#[error(transparent)]
pub enum ValidatePrecedenceError {
Undeclared(#[from] UndeclaredPrecedenceError),
Ordering(#[from] ConflictingPrecedenceOrderingError),
}
#[derive(Debug, Error, Serialize)]
pub struct IndirectRecursionError(pub Vec<String>);
impl std::fmt::Display for IndirectRecursionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Grammar contains an indirectly recursive rule: ")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, " -> ")?;
}
write!(f, "{symbol}")?;
}
Ok(())
}
}
#[derive(Debug, Error, Serialize)]
pub struct UndeclaredPrecedenceError {
pub precedence: String,
pub rule: String,
}
impl std::fmt::Display for UndeclaredPrecedenceError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Undeclared precedence '{}' in rule '{}'",
self.precedence, self.rule
)?;
Ok(())
}
}
#[derive(Debug, Error, Serialize)]
pub struct ConflictingPrecedenceOrderingError {
pub precedence_1: String,
pub precedence_2: String,
}
impl std::fmt::Display for ConflictingPrecedenceOrderingError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Conflicting orderings for precedences {} and {}",
self.precedence_1, self.precedence_2
)?;
Ok(())
}
}
/// Transform an input grammar into separate components that are ready /// Transform an input grammar into separate components that are ready
/// for parse table construction. /// for parse table construction.
pub fn prepare_grammar( pub fn prepare_grammar(
input_grammar: &InputGrammar, input_grammar: &InputGrammar,
) -> PrepareGrammarResult<( ) -> Result<(
SyntaxGrammar, SyntaxGrammar,
LexicalGrammar, LexicalGrammar,
InlinedProductionMap, InlinedProductionMap,
AliasMap, AliasMap,
)> { )> {
validate_precedences(input_grammar)?; validate_precedences(input_grammar)?;
validate_indirect_recursion(input_grammar)?;
let interned_grammar = intern_symbols(input_grammar)?; let interned_grammar = intern_symbols(input_grammar)?;
let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
@ -170,115 +84,10 @@ pub fn prepare_grammar(
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases)) Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
} }
/// Check for indirect recursion cycles in the grammar that can cause infinite loops while
/// parsing. An indirect recursion cycle occurs when a non-terminal can derive itself through
/// a chain of single-symbol productions (e.g., A -> B, B -> A).
fn validate_indirect_recursion(grammar: &InputGrammar) -> Result<(), IndirectRecursionError> {
let mut epsilon_transitions: IndexMap<&str, BTreeSet<String>> = IndexMap::new();
for variable in &grammar.variables {
let productions = get_single_symbol_productions(&variable.rule);
// Filter out rules that *directly* reference themselves, as this doesn't
// cause a parsing loop.
let filtered: BTreeSet<String> = productions
.into_iter()
.filter(|s| s != &variable.name)
.collect();
epsilon_transitions.insert(variable.name.as_str(), filtered);
}
for start_symbol in epsilon_transitions.keys() {
let mut visited = BTreeSet::new();
let mut path = Vec::new();
if let Some((start_idx, end_idx)) =
get_cycle(start_symbol, &epsilon_transitions, &mut visited, &mut path)
{
let cycle_symbols = path[start_idx..=end_idx]
.iter()
.map(|s| (*s).to_string())
.collect();
return Err(IndirectRecursionError(cycle_symbols));
}
}
Ok(())
}
fn get_single_symbol_productions(rule: &Rule) -> BTreeSet<String> {
match rule {
Rule::NamedSymbol(name) => BTreeSet::from([name.clone()]),
Rule::Choice(choices) => choices
.iter()
.flat_map(get_single_symbol_productions)
.collect(),
Rule::Metadata { rule, .. } => get_single_symbol_productions(rule),
_ => BTreeSet::new(),
}
}
/// Perform a depth-first search to detect cycles in single state transitions.
fn get_cycle<'a>(
current: &'a str,
transitions: &'a IndexMap<&'a str, BTreeSet<String>>,
visited: &mut BTreeSet<&'a str>,
path: &mut Vec<&'a str>,
) -> Option<(usize, usize)> {
if let Some(first_idx) = path.iter().position(|s| *s == current) {
path.push(current);
return Some((first_idx, path.len() - 1));
}
if visited.contains(current) {
return None;
}
path.push(current);
visited.insert(current);
if let Some(next_symbols) = transitions.get(current) {
for next in next_symbols {
if let Some(cycle) = get_cycle(next, transitions, visited, path) {
return Some(cycle);
}
}
}
path.pop();
None
}
/// Check that all of the named precedences used in the grammar are declared /// Check that all of the named precedences used in the grammar are declared
/// within the `precedences` lists, and also that there are no conflicting /// within the `precedences` lists, and also that there are no conflicting
/// precedence orderings declared in those lists. /// precedence orderings declared in those lists.
fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()> { fn validate_precedences(grammar: &InputGrammar) -> Result<()> {
// Check that no rule contains a named precedence that is not present in
// any of the `precedences` lists.
fn validate(
rule_name: &str,
rule: &Rule,
names: &HashSet<&String>,
) -> ValidatePrecedenceResult<()> {
match rule {
Rule::Repeat(rule) => validate(rule_name, rule, names),
Rule::Seq(elements) | Rule::Choice(elements) => elements
.iter()
.try_for_each(|e| validate(rule_name, e, names)),
Rule::Metadata { rule, params } => {
if let Precedence::Name(n) = &params.precedence {
if !names.contains(n) {
Err(UndeclaredPrecedenceError {
precedence: n.clone(),
rule: rule_name.to_string(),
})?;
}
}
validate(rule_name, rule, names)?;
Ok(())
}
_ => Ok(()),
}
}
// For any two precedence names `a` and `b`, if `a` comes before `b` // For any two precedence names `a` and `b`, if `a` comes before `b`
// in some list, then it cannot come *after* `b` in any list. // in some list, then it cannot come *after* `b` in any list.
let mut pairs = HashMap::new(); let mut pairs = HashMap::new();
@ -299,10 +108,9 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
} }
hash_map::Entry::Occupied(e) => { hash_map::Entry::Occupied(e) => {
if e.get() != &ordering { if e.get() != &ordering {
Err(ConflictingPrecedenceOrderingError { return Err(anyhow!(
precedence_1: entry1.to_string(), "Conflicting orderings for precedences {entry1} and {entry2}",
precedence_2: entry2.to_string(), ));
})?;
} }
} }
} }
@ -310,6 +118,27 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
} }
} }
// Check that no rule contains a named precedence that is not present in
// any of the `precedences` lists.
fn validate(rule_name: &str, rule: &Rule, names: &HashSet<&String>) -> Result<()> {
match rule {
Rule::Repeat(rule) => validate(rule_name, rule, names),
Rule::Seq(elements) | Rule::Choice(elements) => elements
.iter()
.try_for_each(|e| validate(rule_name, e, names)),
Rule::Metadata { rule, params } => {
if let Precedence::Name(n) = &params.precedence {
if !names.contains(n) {
return Err(anyhow!("Undeclared precedence '{n}' in rule '{rule_name}'"));
}
}
validate(rule_name, rule, names)?;
Ok(())
}
_ => Ok(()),
}
}
let precedence_names = grammar let precedence_names = grammar
.precedence_orderings .precedence_orderings
.iter() .iter()
@ -332,7 +161,7 @@ fn validate_precedences(grammar: &InputGrammar) -> ValidatePrecedenceResult<()>
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::grammars::VariableType; use crate::generate::grammars::{InputGrammar, Variable, VariableType};
#[test] #[test]
fn test_validate_precedences_with_undeclared_precedence() { fn test_validate_precedences_with_undeclared_precedence() {

View file

@ -1,17 +1,14 @@
use std::collections::HashMap; use crate::generate::{
use serde::Serialize;
use thiserror::Error;
use crate::{
grammars::{InlinedProductionMap, LexicalGrammar, Production, ProductionStep, SyntaxGrammar}, grammars::{InlinedProductionMap, LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
rules::SymbolType, rules::SymbolType,
}; };
use anyhow::{anyhow, Result};
use std::collections::HashMap;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct ProductionStepId { struct ProductionStepId {
// A `None` value here means that the production itself was produced via inlining, // A `None` value here means that the production itself was produced via inlining,
// and is stored in the builder's `productions` vector, as opposed to being // and is stored in the the builder's `productions` vector, as opposed to being
// stored in one of the grammar's variables. // stored in one of the grammar's variables.
variable_index: Option<usize>, variable_index: Option<usize>,
production_index: usize, production_index: usize,
@ -70,13 +67,12 @@ impl InlinedProductionMapBuilder {
let production_map = production_indices_by_step_id let production_map = production_indices_by_step_id
.into_iter() .into_iter()
.map(|(step_id, production_indices)| { .map(|(step_id, production_indices)| {
let production = let production = step_id.variable_index.map_or_else(
core::ptr::from_ref::<Production>(step_id.variable_index.map_or_else( || &productions[step_id.production_index],
|| &productions[step_id.production_index], |variable_index| {
|variable_index| { &grammar.variables[variable_index].productions[step_id.production_index]
&grammar.variables[variable_index].productions[step_id.production_index] },
}, ) as *const Production;
));
((production, step_id.step_index as u32), production_indices) ((production, step_id.step_index as u32), production_indices)
}) })
.collect(); .collect();
@ -156,7 +152,7 @@ impl InlinedProductionMapBuilder {
self.productions self.productions
.iter() .iter()
.position(|p| *p == production) .position(|p| *p == production)
.unwrap_or_else(|| { .unwrap_or({
self.productions.push(production); self.productions.push(production);
self.productions.len() - 1 self.productions.len() - 1
}) })
@ -189,38 +185,29 @@ impl InlinedProductionMapBuilder {
} }
} }
pub type ProcessInlinesResult<T> = Result<T, ProcessInlinesError>;
#[derive(Debug, Error, Serialize)]
pub enum ProcessInlinesError {
#[error("External token `{0}` cannot be inlined")]
ExternalToken(String),
#[error("Token `{0}` cannot be inlined")]
Token(String),
#[error("Rule `{0}` cannot be inlined because it is the first rule")]
FirstRule(String),
}
pub(super) fn process_inlines( pub(super) fn process_inlines(
grammar: &SyntaxGrammar, grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar, lexical_grammar: &LexicalGrammar,
) -> ProcessInlinesResult<InlinedProductionMap> { ) -> Result<InlinedProductionMap> {
for symbol in &grammar.variables_to_inline { for symbol in &grammar.variables_to_inline {
match symbol.kind { match symbol.kind {
SymbolType::External => { SymbolType::External => {
Err(ProcessInlinesError::ExternalToken( return Err(anyhow!(
grammar.external_tokens[symbol.index].name.clone(), "External token `{}` cannot be inlined",
))?; grammar.external_tokens[symbol.index].name
))
} }
SymbolType::Terminal => { SymbolType::Terminal => {
Err(ProcessInlinesError::Token( return Err(anyhow!(
lexical_grammar.variables[symbol.index].name.clone(), "Token `{}` cannot be inlined",
))?; lexical_grammar.variables[symbol.index].name,
))
} }
SymbolType::NonTerminal if symbol.index == 0 => { SymbolType::NonTerminal if symbol.index == 0 => {
Err(ProcessInlinesError::FirstRule( return Err(anyhow!(
grammar.variables[symbol.index].name.clone(), "Rule `{}` cannot be inlined because it is the first rule",
))?; grammar.variables[symbol.index].name,
))
} }
_ => {} _ => {}
} }
@ -236,10 +223,10 @@ pub(super) fn process_inlines(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::generate::grammars::{
grammars::{LexicalVariable, SyntaxVariable, VariableType}, LexicalVariable, ProductionStep, SyntaxVariable, VariableType,
rules::{Associativity, Precedence, Symbol},
}; };
use crate::generate::rules::{Associativity, Precedence, Symbol};
#[test] #[test]
fn test_basic_inlining() { fn test_basic_inlining() {
@ -377,10 +364,10 @@ mod tests {
let inline_map = process_inlines(&grammar, &LexicalGrammar::default()).unwrap(); let inline_map = process_inlines(&grammar, &LexicalGrammar::default()).unwrap();
let productions = inline_map let productions: Vec<&Production> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 1) .inlined_productions(&grammar.variables[0].productions[0], 1)
.unwrap() .unwrap()
.collect::<Vec<_>>(); .collect();
assert_eq!( assert_eq!(
productions.iter().copied().cloned().collect::<Vec<_>>(), productions.iter().copied().cloned().collect::<Vec<_>>(),
@ -476,10 +463,10 @@ mod tests {
let inline_map = process_inlines(&grammar, &LexicalGrammar::default()).unwrap(); let inline_map = process_inlines(&grammar, &LexicalGrammar::default()).unwrap();
let productions = inline_map let productions: Vec<_> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 0) .inlined_productions(&grammar.variables[0].productions[0], 0)
.unwrap() .unwrap()
.collect::<Vec<_>>(); .collect();
assert_eq!( assert_eq!(
productions.iter().copied().cloned().collect::<Vec<_>>(), productions.iter().copied().cloned().collect::<Vec<_>>(),
@ -549,9 +536,10 @@ mod tests {
..Default::default() ..Default::default()
}; };
let result = process_inlines(&grammar, &lexical_grammar); if let Err(error) = process_inlines(&grammar, &lexical_grammar) {
assert!(result.is_err(), "expected an error, but got none"); assert_eq!(error.to_string(), "Token `something` cannot be inlined");
let err = result.err().unwrap(); } else {
assert_eq!(err.to_string(), "Token `something` cannot be inlined",); panic!("expected an error, but got none");
}
} }
} }

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion",
"CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"}

File diff suppressed because it is too large Load diff

View file

@ -1,11 +1,9 @@
use std::{collections::BTreeMap, fmt};
use serde::Serialize;
use smallbitvec::SmallBitVec;
use super::grammars::VariableType; use super::grammars::VariableType;
use smallbitvec::SmallBitVec;
use std::iter::FromIterator;
use std::{collections::HashMap, fmt};
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum SymbolType { pub enum SymbolType {
External, External,
End, End,
@ -14,19 +12,19 @@ pub enum SymbolType {
NonTerminal, NonTerminal,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum Associativity { pub enum Associativity {
Left, Left,
Right, Right,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Alias { pub struct Alias {
pub value: String, pub value: String,
pub is_named: bool, pub is_named: bool,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub enum Precedence { pub enum Precedence {
#[default] #[default]
None, None,
@ -34,50 +32,48 @@ pub enum Precedence {
Name(String), Name(String),
} }
pub type AliasMap = BTreeMap<Symbol, Alias>; pub type AliasMap = HashMap<Symbol, Alias>;
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize)] #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
pub struct MetadataParams { pub struct MetadataParams {
pub precedence: Precedence, pub precedence: Precedence,
pub dynamic_precedence: i32, pub dynamic_precedence: i32,
pub associativity: Option<Associativity>, pub associativity: Option<Associativity>,
pub is_token: bool, pub is_token: bool,
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool, pub is_main_token: bool,
pub alias: Option<Alias>, pub alias: Option<Alias>,
pub field_name: Option<String>, pub field_name: Option<String>,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Symbol { pub struct Symbol {
pub kind: SymbolType, pub kind: SymbolType,
pub index: usize, pub index: usize,
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Rule { pub enum Rule {
Blank, Blank,
String(String), String(String),
Pattern(String, String), Pattern(String, String),
NamedSymbol(String), NamedSymbol(String),
Symbol(Symbol), Symbol(Symbol),
Choice(Vec<Self>), Choice(Vec<Rule>),
Metadata { Metadata {
params: MetadataParams, params: MetadataParams,
rule: Box<Self>, rule: Box<Rule>,
},
Repeat(Box<Self>),
Seq(Vec<Self>),
Reserved {
rule: Box<Self>,
context_name: String,
}, },
Repeat(Box<Rule>),
Seq(Vec<Rule>),
} }
// Because tokens are represented as small (~400 max) unsigned integers, // Because tokens are represented as small (~400 max) unsigned integers,
// sets of tokens can be efficiently represented as bit vectors with each // sets of tokens can be efficiently represented as bit vectors with each
// index corresponding to a token, and each value representing whether or not // index corresponding to a token, and each value representing whether or not
// the token is present in the set. // the token is present in the set.
#[derive(Default, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenSet { pub struct TokenSet {
terminal_bits: SmallBitVec, terminal_bits: SmallBitVec,
external_bits: SmallBitVec, external_bits: SmallBitVec,
@ -85,32 +81,6 @@ pub struct TokenSet {
end_of_nonterminal_extra: bool, end_of_nonterminal_extra: bool,
} }
impl fmt::Debug for TokenSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list().entries(self.iter()).finish()
}
}
impl PartialOrd for TokenSet {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for TokenSet {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.terminal_bits
.iter()
.cmp(other.terminal_bits.iter())
.then_with(|| self.external_bits.iter().cmp(other.external_bits.iter()))
.then_with(|| self.eof.cmp(&other.eof))
.then_with(|| {
self.end_of_nonterminal_extra
.cmp(&other.end_of_nonterminal_extra)
})
}
}
impl Rule { impl Rule {
pub fn field(name: String, content: Self) -> Self { pub fn field(name: String, content: Self) -> Self {
add_metadata(content, move |params| { add_metadata(content, move |params| {
@ -175,21 +145,9 @@ impl Rule {
Self::Choice(elements) Self::Choice(elements)
} }
pub const fn seq(rules: Vec<Self>) -> Self { pub fn seq(rules: Vec<Self>) -> Self {
Self::Seq(rules) Self::Seq(rules)
} }
pub fn is_empty(&self) -> bool {
match self {
Self::Blank | Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
Self::String(string) => string.is_empty(),
Self::Metadata { rule, .. } | Self::Repeat(rule) | Self::Reserved { rule, .. } => {
rule.is_empty()
}
Self::Choice(rules) => rules.iter().any(Self::is_empty),
Self::Seq(rules) => rules.iter().all(Self::is_empty),
}
}
} }
impl Alias { impl Alias {
@ -306,14 +264,14 @@ impl Symbol {
} }
impl From<Symbol> for Rule { impl From<Symbol> for Rule {
#[must_use]
fn from(symbol: Symbol) -> Self { fn from(symbol: Symbol) -> Self {
Self::Symbol(symbol) Self::Symbol(symbol)
} }
} }
impl TokenSet { impl TokenSet {
#[must_use] pub fn new() -> Self {
pub const fn new() -> Self {
Self { Self {
terminal_bits: SmallBitVec::new(), terminal_bits: SmallBitVec::new(),
external_bits: SmallBitVec::new(), external_bits: SmallBitVec::new(),
@ -424,9 +382,6 @@ impl TokenSet {
}; };
if other.index < vec.len() && vec[other.index] { if other.index < vec.len() && vec[other.index] {
vec.set(other.index, false); vec.set(other.index, false);
while vec.last() == Some(false) {
vec.pop();
}
return true; return true;
} }
false false
@ -439,13 +394,6 @@ impl TokenSet {
&& !self.external_bits.iter().any(|a| a) && !self.external_bits.iter().any(|a| a)
} }
pub fn len(&self) -> usize {
self.eof as usize
+ self.end_of_nonterminal_extra as usize
+ self.terminal_bits.iter().filter(|b| *b).count()
+ self.external_bits.iter().filter(|b| *b).count()
}
pub fn insert_all_terminals(&mut self, other: &Self) -> bool { pub fn insert_all_terminals(&mut self, other: &Self) -> bool {
let mut result = false; let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() { if other.terminal_bits.len() > self.terminal_bits.len() {

View file

@ -1,9 +1,6 @@
use super::nfa::CharacterSet;
use super::rules::{Alias, Symbol, TokenSet};
use std::collections::BTreeMap; use std::collections::BTreeMap;
use super::{
nfa::CharacterSet,
rules::{Alias, Symbol, TokenSet},
};
pub type ProductionInfoId = usize; pub type ProductionInfoId = usize;
pub type ParseStateId = usize; pub type ParseStateId = usize;
pub type LexStateId = usize; pub type LexStateId = usize;
@ -47,7 +44,6 @@ pub struct ParseState {
pub id: ParseStateId, pub id: ParseStateId,
pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>, pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>,
pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>, pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>,
pub reserved_words: TokenSet,
pub lex_state_id: usize, pub lex_state_id: usize,
pub external_lex_state_id: usize, pub external_lex_state_id: usize,
pub core_id: usize, pub core_id: usize,
@ -65,7 +61,7 @@ pub struct ProductionInfo {
pub field_map: BTreeMap<String, Vec<FieldLocation>>, pub field_map: BTreeMap<String, Vec<FieldLocation>>,
} }
#[derive(Debug, Default, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct ParseTable { pub struct ParseTable {
pub states: Vec<ParseState>, pub states: Vec<ParseState>,
pub symbols: Vec<Symbol>, pub symbols: Vec<Symbol>,
@ -93,7 +89,6 @@ pub struct LexTable {
} }
impl ParseTableEntry { impl ParseTableEntry {
#[must_use]
pub const fn new() -> Self { pub const fn new() -> Self {
Self { Self {
reusable: true, reusable: true,

View file

@ -0,0 +1,28 @@
#include "tree_sitter/parser.h"
#include <node.h>
#include "nan.h"
using namespace v8;
extern "C" TSLanguage * tree_sitter_PARSER_NAME();
namespace {
NAN_METHOD(New) {}
void Init(Local<Object> exports, Local<Object> module) {
Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
tpl->SetClassName(Nan::New("Language").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
Nan::SetInternalFieldPointer(instance, 0, tree_sitter_PARSER_NAME());
Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("PARSER_NAME").ToLocalChecked());
Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
}
NODE_MODULE(tree_sitter_PARSER_NAME_binding, Init)
} // namespace

View file

@ -0,0 +1,19 @@
{
  "targets": [
    {
      # Addon name; must match the NODE_MODULE declaration in
      # bindings/node/binding.cc and the require() paths in index.js.
      "target_name": "tree_sitter_PARSER_NAME_binding",
      "include_dirs": [
        # Resolve the header directory shipped with the `nan` npm package.
        "<!(node -e \"require('nan')\")",
        "src"
      ],
      "sources": [
        "bindings/node/binding.cc",
        "src/parser.c",
        # If your language uses an external scanner, add it here.
      ],
      "cflags_c": [
        "-std=c99",
      ]
    }
  ]
}

View file

@ -0,0 +1,40 @@
// Build script: compiles the generated C parser (and, if enabled below,
// an external scanner) so the Rust binding can link against it.
fn main() {
    let source_directory = std::path::Path::new("src");
    let parser_file = source_directory.join("parser.c");

    let mut builder = cc::Build::new();
    builder
        .include(source_directory)
        .flag_if_supported("-Wno-unused-parameter")
        .flag_if_supported("-Wno-unused-but-set-variable")
        .flag_if_supported("-Wno-trigraphs")
        .file(&parser_file);

    // If your language uses an external scanner written in C,
    // then include this block of code:
    /*
    let scanner_file = source_directory.join("scanner.c");
    builder.file(&scanner_file);
    println!("cargo:rerun-if-changed={}", scanner_file.to_str().unwrap());
    */

    builder.compile("parser");
    println!("cargo:rerun-if-changed={}", parser_file.to_str().unwrap());

    // If your language uses an external scanner written in C++,
    // then include this block of code:
    /*
    let scanner_file = source_directory.join("scanner.cc");
    let mut cpp_builder = cc::Build::new();
    cpp_builder
        .cpp(true)
        .include(source_directory)
        .flag_if_supported("-Wno-unused-parameter")
        .flag_if_supported("-Wno-unused-but-set-variable")
        .file(&scanner_file);
    cpp_builder.compile("scanner");
    println!("cargo:rerun-if-changed={}", scanner_file.to_str().unwrap());
    */
}

View file

@ -0,0 +1,26 @@
[package]
name = "tree-sitter-PARSER_NAME"
version = "0.0.1"
# Compiles src/parser.c (and an optional external scanner) before the
# Rust crate itself is built.
build = "bindings/rust/build.rs"
categories = ["parsing", "text-editors"]
edition = "2018"
# Limit the published crate to the files the Rust binding needs.
include = [
  "bindings/rust/*",
  "grammar.js",
  "queries/*",
  "src/*",
]
keywords = ["incremental", "parsing", "PARSER_NAME"]
license = "MIT"
repository = "https://github.com/tree-sitter/tree-sitter-PARSER_NAME"
description = "PARSER_NAME grammar for the tree-sitter parsing library"

[lib]
path = "bindings/rust/lib.rs"

[dependencies]
# Tilde requirement: the generated parser must stay on the same minor
# version of the tree-sitter runtime it was generated against.
# RUST_BINDING_VERSION is a template placeholder filled in on generation.
tree-sitter = "~RUST_BINDING_VERSION"

[build-dependencies]
cc = "1.0"

View file

@ -0,0 +1,19 @@
// Load the compiled native binding, preferring the release build and
// falling back to the debug build. Errors other than the module simply
// not having been built yet are re-thrown immediately.
try {
  module.exports = require("../../build/Release/tree_sitter_PARSER_NAME_binding");
} catch (releaseError) {
  if (releaseError.code !== 'MODULE_NOT_FOUND') {
    throw releaseError;
  }
  try {
    module.exports = require("../../build/Debug/tree_sitter_PARSER_NAME_binding");
  } catch (debugError) {
    if (debugError.code !== 'MODULE_NOT_FOUND') {
      throw debugError;
    }
    // Neither build exists; surface the original (release) failure.
    throw releaseError;
  }
}

// Node type metadata is optional — ignore it when the file is absent.
try {
  module.exports.nodeTypeInfo = require("../../src/node-types.json");
} catch (_) {}

Some files were not shown because too many files have changed in this diff Show more