diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..79134428 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,43 @@ +name: Deploy Docs +on: + push: + branches: + - "master" + +jobs: + deploy-docs: + runs-on: ubuntu-latest + + permissions: + contents: write + pages: write + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install mdbook + env: + GH_TOKEN: ${{ github.token }} + run: | + jq_expr='.assets[] | select(.name | contains("x86_64-unknown-linux-gnu")) | .browser_download_url' + url=$(gh api repos/rust-lang/mdbook/releases/latest --jq "$jq_expr") + mkdir mdbook + curl -sSL "$url" | tar -xz -C mdbook + printf '%s/mdbook\n' "$PWD" >> "$GITHUB_PATH" + + - name: Build Book + run: mdbook build docs + + - name: Setup Pages + uses: actions/configure-pages@v4 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/book + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/cli/src/playground.html b/cli/src/playground.html index 420cd28d..09328a19 100644 --- a/cli/src/playground.html +++ b/cli/src/playground.html @@ -1,10 +1,12 @@ tree-sitter THE_LANGUAGE_NAME - - - - + + + + @@ -19,6 +21,11 @@ +
+ + +
+
@@ -55,13 +62,11 @@
- - - + + @@ -103,7 +108,8 @@ flex-direction: column; } - #code-container, #query-container { + #code-container, + #query-container { flex: 1; position: relative; overflow: hidden; diff --git a/docs/.gitignore b/docs/.gitignore index 339efff8..7585238e 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1 @@ -vendor -_site -.bundle +book diff --git a/docs/Gemfile b/docs/Gemfile deleted file mode 100644 index ee114290..00000000 --- a/docs/Gemfile +++ /dev/null @@ -1,3 +0,0 @@ -source 'https://rubygems.org' -gem 'github-pages', group: :jekyll_plugins -gem "webrick" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock deleted file mode 100644 index 7204a9a7..00000000 --- a/docs/Gemfile.lock +++ /dev/null @@ -1,273 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (7.1.3) - base64 - bigdecimal - concurrent-ruby (~> 1.0, >= 1.0.2) - connection_pool (>= 2.2.5) - drb - i18n (>= 1.6, < 2) - minitest (>= 5.1) - mutex_m - tzinfo (~> 2.0) - addressable (2.8.1) - public_suffix (>= 2.0.2, < 6.0) - base64 (0.2.0) - bigdecimal (3.1.6) - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.23.10) - concurrent-ruby (1.2.3) - connection_pool (2.4.1) - dnsruby (1.61.9) - simpleidn (~> 0.1) - drb (2.2.0) - ruby2_keywords - em-websocket (0.5.3) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0) - ethon (0.16.0) - ffi (>= 1.15.0) - eventmachine (1.2.7) - execjs (2.8.1) - faraday (2.7.4) - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-net_http (3.0.2) - ffi (1.15.5) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (228) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.3) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.4.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-include-cache (= 0.2.1) - jekyll-mentions (= 1.6.0) - 
jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.8.0) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.2.0) - jekyll-theme-cayman (= 0.2.0) - jekyll-theme-dinky (= 0.2.0) - jekyll-theme-hacker (= 0.2.0) - jekyll-theme-leap-day (= 0.2.0) - jekyll-theme-merlot (= 0.2.0) - jekyll-theme-midnight (= 0.2.0) - jekyll-theme-minimal (= 0.2.0) - jekyll-theme-modernist (= 0.2.0) - jekyll-theme-primer (= 0.6.0) - jekyll-theme-slate (= 0.2.0) - jekyll-theme-tactile (= 0.2.0) - jekyll-theme-time-machine (= 0.2.0) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) - kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.4) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.13.6, < 2.0) - rouge (= 3.26.0) - terminal-table (~> 1.4) - github-pages-health-check (1.17.9) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.3) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.8.0) - i18n (1.14.1) - concurrent-ruby (~> 1.0) - jekyll (3.9.3) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (>= 0.7, < 2) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.4.0) - commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.4.0) - commonmarker (~> 0.23.7) - jekyll (~> 3.9.0) - jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 5.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) 
- jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-include-cache (0.2.1) - jekyll (>= 3.7, < 5.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll (>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.8.0) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.6.0) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 
5.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.4) - listen (3.8.0) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minitest (5.21.2) - mutex_m (0.2.0) - nokogiri (1.16.5-x86_64-linux) - racc (~> 1.4) - octokit (4.25.1) - faraday (>= 1, < 3) - sawyer (~> 0.9) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.7.3) - rb-fsevent (0.11.2) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.3.3) - strscan - rouge (3.26.0) - ruby2_keywords (0.0.5) - rubyzip (2.3.2) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.9.2) - addressable (>= 2.3.5) - faraday (>= 0.17.3, < 3) - simpleidn (0.2.1) - unf (~> 0.1.4) - strscan (3.1.0) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unf (0.1.4) - unf_ext - unf_ext (0.0.8.2) - unicode-display_width (1.8.0) - webrick (1.8.1) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - github-pages - webrick - -BUNDLED WITH - 2.4.8 diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 891551df..00000000 --- a/docs/_config.yml +++ /dev/null @@ -1,2 +0,0 @@ -markdown: kramdown -theme: jekyll-theme-cayman diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html deleted file mode 100644 index 587ab4f0..00000000 --- a/docs/_layouts/default.html +++ /dev/null @@ -1,137 +0,0 @@ - - - - - - - - Tree-sitter|{{ page.title }} - - - - - - - -
- - - - -
- {{ content }} -
-
- - - - - - - - diff --git a/docs/assets/css/playground.css b/docs/assets/css/playground.css new file mode 100644 index 00000000..71d373a4 --- /dev/null +++ b/docs/assets/css/playground.css @@ -0,0 +1,372 @@ +/* Base Variables */ +:root { + --light-bg: #f9f9f9; + --light-border: #e0e0e0; + --light-text: #333; + --light-hover-border: #c1c1c1; + --light-scrollbar-track: #f1f1f1; + --light-scrollbar-thumb: #c1c1c1; + --light-scrollbar-thumb-hover: #a8a8a8; + + --dark-bg: #1d1f21; + --dark-border: #2d2d2d; + --dark-text: #c5c8c6; + --dark-scrollbar-track: #25282c; + --dark-scrollbar-thumb: #4a4d51; + --dark-scrollbar-thumb-hover: #5a5d61; + + --primary-color: #0550ae; + --primary-color-alpha: rgba(5, 80, 174, 0.1); + --primary-color-alpha-dark: rgba(121, 192, 255, 0.1); + --selection-color: rgba(39, 95, 255, 0.3); +} + +/* Common Scrollbar Styles */ +::-webkit-scrollbar { + width: 8px; + height: 8px; +} + +::-webkit-scrollbar-track { + border-radius: 4px; +} + +::-webkit-scrollbar-thumb { + border-radius: 4px; +} + +/* Base Light Theme Scrollbars */ +::-webkit-scrollbar-track { + background: var(--light-scrollbar-track); +} + +::-webkit-scrollbar-thumb { + background: var(--light-scrollbar-thumb); +} + +::-webkit-scrollbar-thumb:hover { + background: var(--light-scrollbar-thumb-hover); +} + +/* Dropdown Styling */ +.custom-select { + position: relative; + display: inline-block; +} + +#language-select { + background-color: var(--light-bg); + border: 1px solid var(--light-border); + border-radius: 4px; + padding: 4px 24px 4px 8px; + font-size: 14px; + color: var(--light-text); + cursor: pointer; + min-width: 120px; + appearance: none; + background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23666' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'%3E%3C/polyline%3E%3C/svg%3E"); + background-repeat: no-repeat; + 
background-position: right 8px center; +} + +.select-button { + background-color: var(--light-bg); + border: 1px solid var(--light-border); + border-radius: 4px; + padding: 4px 8px; + font-size: 14px; + color: var(--light-text); + cursor: pointer; + min-width: 120px; + display: flex; + align-items: center; + justify-content: space-between; +} + +#language-select:hover, +.select-button:hover { + border-color: var(--light-hover-border); +} + +#language-select:focus, +.select-button:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 2px var(--primary-color-alpha); +} + +/* Custom Checkbox Styling */ +input[type="checkbox"] { + appearance: none; + width: 16px; + height: 16px; + border: 1px solid var(--light-border); + border-radius: 3px; + margin-right: 6px; + position: relative; + cursor: pointer; + vertical-align: middle; +} + +input[type="checkbox"]:checked { + background-color: var(--primary-color); + border-color: var(--primary-color); +} + +input[type="checkbox"]:checked::after { + content: ''; + position: absolute; + left: 5px; + top: 2px; + width: 4px; + height: 8px; + border: solid white; + border-width: 0 2px 2px 0; + transform: rotate(45deg); +} + +input[type="checkbox"]:hover { + border-color: var(--light-hover-border); +} + +input[type="checkbox"]:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 2px var(--primary-color-alpha); +} + +/* Select Dropdown */ +.select-dropdown { + position: absolute; + top: 100%; + left: 0; + right: 0; + background-color: var(--light-bg); + border: 1px solid var(--light-border); + border-radius: 4px; + margin-top: 4px; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + display: none; + z-index: 1000; + max-height: 300px; + overflow-y: auto; +} + +.select-dropdown.show { + display: block; +} + +.option { + padding: 8px 12px; + cursor: pointer; +} + +.option:hover { + background-color: var(--primary-color-alpha); +} + +.option.selected { + background-color: 
var(--primary-color-alpha); +} + +/* CodeMirror Base Styles */ +.ts-playground .CodeMirror { + border-radius: 6px; + background-color: var(--light-bg) !important; + color: #080808 !important; +} + +.ts-playground .CodeMirror-scroll { + padding: 8px; + border: 1px solid var(--light-border); + border-radius: 6px; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.1); +} + +.ayu .ts-playground .CodeMirror-scroll, +.coal .ts-playground .CodeMirror-scroll, +.navy .ts-playground .CodeMirror-scroll { + border-color: var(--dark-border); +} + +.ts-playground .CodeMirror-gutters { + background: #ebebeb !important; + border-right: 1px solid #e8e8e8 !important; +} + +.ts-playground .CodeMirror-cursor { + border-left: 2px solid #000 !important; +} + +.ts-playground .CodeMirror-selected { + background: var(--selection-color) !important; +} + +.ts-playground .CodeMirror-activeline-background { + background: rgba(36, 99, 180, 0.12) !important; +} + +/* Output Container Styles */ +#output-container { + color: #080808; + background-color: var(--light-bg); + margin: 0; + white-space: pre; + font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, "Liberation Mono", monospace; +} + +#output-container-scroll { + max-height: 400px; + overflow: auto; + padding: 8px; + border: 1px solid var(--light-border); + border-radius: 6px; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.1); + background-color: var(--light-bg); +} + +#output-container a { + color: var(--primary-color); + text-decoration: none; +} + +#output-container a.node-link.anonymous { + color: #116329; +} + +#output-container a.node-link.anonymous:before { + content: '"'; +} + +#output-container a.node-link.anonymous:after { + content: '"'; +} + +#output-container a.node-link.error { + color: #cf222e; +} + +#output-container a.highlighted { + background-color: var(--selection-color); +} + +/* Dark Theme Overrides */ +.ayu, .coal, .navy { + & #language-select, + & .select-button { + background-color: var(--dark-bg); + border-color: 
var(--dark-border); + color: var(--dark-text); + } + + & input[type="checkbox"] { + border-color: var(--dark-border); + background-color: var(--dark-bg); + } + + & input[type="checkbox"]:checked { + background-color: #79c0ff; + border-color: #79c0ff; + } + + & label { + color: var(--dark-text); + } + + & .select-dropdown { + background-color: var(--dark-bg); + border-color: var(--dark-border); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3); + } + + & .option:hover { + background-color: var(--primary-color-alpha-dark); + } + + & .option.selected { + background-color: var(--primary-color-alpha-dark); + } + + & .ts-playground .CodeMirror { + background-color: var(--dark-bg) !important; + color: var(--dark-text) !important; + } + + & .ts-playground .CodeMirror-gutters { + background: var(--dark-scrollbar-track) !important; + border-right-color: var(--dark-border) !important; + } + + & .ts-playground .CodeMirror-cursor { + border-left-color: #aeafad !important; + } + + & .ts-playground .CodeMirror-selected { + background: #373b41 !important; + } + + & .ts-playground .CodeMirror-activeline-background { + background: #282a2e !important; + } + + & #output-container { + color: var(--dark-text); + background-color: var(--dark-bg); + } + + & #output-container-scroll { + background-color: var(--dark-bg); + border-color: var(--dark-border); + } + + & #output-container a { + color: #79c0ff; + } + + & #output-container a.node-link.anonymous { + color: #7ee787; + } + + & #output-container a.node-link.error { + color: #ff7b72; + } + + & #output-container a.highlighted { + background-color: #373b41; + } + + /* Dark Theme Scrollbars */ + & ::-webkit-scrollbar-track { + background: var(--dark-scrollbar-track) !important; + } + + & ::-webkit-scrollbar-thumb { + background: var(--dark-scrollbar-thumb) !important; + } + + & ::-webkit-scrollbar-thumb:hover { + background: var(--dark-scrollbar-thumb-hover) !important; + } + + & * { + scrollbar-width: thin !important; + scrollbar-color: 
var(--dark-scrollbar-thumb) var(--dark-scrollbar-track) !important; + } +} + +/* Spacing Utilities */ +#language-select, +input[type="checkbox"], +label { + margin: 0 4px; +} + +#language-select { + margin-right: 16px; +} + +label { + font-size: 14px; + margin-right: 16px; + cursor: pointer; +} diff --git a/docs/assets/js/playground.js b/docs/assets/js/playground.js index 5864d979..9a5cf5b4 100644 --- a/docs/assets/js/playground.js +++ b/docs/assets/js/playground.js @@ -1,33 +1,87 @@ -let tree; +function initializeCustomSelect() { + const button = document.getElementById('language-button'); + const select = document.getElementById('language-select'); + if (!button || !select) return; -(async () => { - const CAPTURE_REGEX = /@\s*([\w._-]+)/g; - const COLORS_BY_INDEX = [ - 'blue', - 'chocolate', - 'darkblue', - 'darkcyan', - 'darkgreen', - 'darkred', - 'darkslategray', - 'dimgray', - 'green', - 'indigo', - 'navy', - 'red', - 'sienna', + const dropdown = button.nextElementSibling; + const selectedValue = button.querySelector('.selected-value'); + + selectedValue.textContent = select.options[select.selectedIndex].text; + + button.addEventListener('click', (e) => { + e.preventDefault(); // Prevent form submission + dropdown.classList.toggle('show'); + }); + + document.addEventListener('click', (e) => { + if (!button.contains(e.target)) { + dropdown.classList.remove('show'); + } + }); + + dropdown.querySelectorAll('.option').forEach(option => { + option.addEventListener('click', () => { + selectedValue.textContent = option.textContent; + select.value = option.dataset.value; + dropdown.classList.remove('show'); + + const event = new Event('change'); + select.dispatchEvent(event); + }); + }); +} + +window.initializePlayground = async function initializePlayground() { + initializeCustomSelect(); + + let tree; + + const CAPTURE_REGEX = /@\s*([\w\._-]+)/g; + const LIGHT_COLORS = [ + "#0550ae", // blue + "#ab5000", // rust brown + "#116329", // forest green + "#844708", // 
warm brown + "#6639ba", // purple + "#7d4e00", // orange brown + "#0969da", // bright blue + "#1a7f37", // green + "#cf222e", // red + "#8250df", // violet + "#6e7781", // gray + "#953800", // dark orange + "#1b7c83" // teal ]; - const codeInput = document.getElementById('code-input'); - const languageSelect = document.getElementById('language-select'); - const loggingCheckbox = document.getElementById('logging-checkbox'); - const outputContainer = document.getElementById('output-container'); - const outputContainerScroll = document.getElementById('output-container-scroll'); - const playgroundContainer = document.getElementById('playground-container'); - const queryCheckbox = document.getElementById('query-checkbox'); - const queryContainer = document.getElementById('query-container'); - const queryInput = document.getElementById('query-input'); - const updateTimeSpan = document.getElementById('update-time'); + const DARK_COLORS = [ + "#79c0ff", // light blue + "#ffa657", // orange + "#7ee787", // light green + "#ff7b72", // salmon + "#d2a8ff", // light purple + "#ffa198", // pink + "#a5d6ff", // pale blue + "#56d364", // bright green + "#ff9492", // light red + "#e0b8ff", // pale purple + "#9ca3af", // gray + "#ffb757", // yellow orange + "#80cbc4" // light teal + ]; + + const codeInput = document.getElementById("code-input"); + const languageSelect = document.getElementById("language-select"); + const loggingCheckbox = document.getElementById("logging-checkbox"); + const anonymousNodes = document.getElementById('anonymous-nodes-checkbox'); + const outputContainer = document.getElementById("output-container"); + const outputContainerScroll = document.getElementById( + "output-container-scroll", + ); + const playgroundContainer = document.getElementById("playground-container"); + const queryCheckbox = document.getElementById("query-checkbox"); + const queryContainer = document.getElementById("query-container"); + const queryInput = 
document.getElementById("query-input"); + const updateTimeSpan = document.getElementById("update-time"); const languagesByName = {}; loadState(); @@ -35,21 +89,36 @@ let tree; await TreeSitter.init(); const parser = new TreeSitter(); + + console.log(parser, codeInput, queryInput); + const codeEditor = CodeMirror.fromTextArea(codeInput, { lineNumbers: true, showCursorWhenSelecting: true }); + codeEditor.on('keydown', (_, event) => { + if (event.key === 'ArrowLeft' || event.key === 'ArrowRight') { + event.stopPropagation(); // Prevent mdBook from going back/forward + } + }); + const queryEditor = CodeMirror.fromTextArea(queryInput, { lineNumbers: true, - showCursorWhenSelecting: true + showCursorWhenSelecting: true, + }); + + queryEditor.on('keydown', (_, event) => { + if (event.key === 'ArrowLeft' || event.key === 'ArrowRight') { + event.stopPropagation(); // Prevent mdBook from going back/forward + } }); const cluster = new Clusterize({ rows: [], noDataText: null, contentElem: outputContainer, - scrollElem: outputContainerScroll + scrollElem: outputContainerScroll, }); const renderTreeOnCodeChange = debounce(renderTree, 50); const saveStateOnChange = debounce(saveState, 2000); @@ -62,32 +131,33 @@ let tree; let isRendering = 0; let query; - codeEditor.on('changes', handleCodeChange); - codeEditor.on('viewportChange', runTreeQueryOnChange); - codeEditor.on('cursorActivity', debounce(handleCursorMovement, 150)); - queryEditor.on('changes', debounce(handleQueryChange, 150)); + codeEditor.on("changes", handleCodeChange); + codeEditor.on("viewportChange", runTreeQueryOnChange); + codeEditor.on("cursorActivity", debounce(handleCursorMovement, 150)); + queryEditor.on("changes", debounce(handleQueryChange, 150)); - loggingCheckbox.addEventListener('change', handleLoggingChange); - queryCheckbox.addEventListener('change', handleQueryEnableChange); - languageSelect.addEventListener('change', handleLanguageChange); - outputContainer.addEventListener('click', handleTreeClick); 
+ loggingCheckbox.addEventListener("change", handleLoggingChange); + anonymousNodes.addEventListener('change', renderTree); + queryCheckbox.addEventListener("change", handleQueryEnableChange); + languageSelect.addEventListener("change", handleLanguageChange); + outputContainer.addEventListener("click", handleTreeClick); handleQueryEnableChange(); - await handleLanguageChange() + await handleLanguageChange(); - playgroundContainer.style.visibility = 'visible'; + playgroundContainer.style.visibility = "visible"; async function handleLanguageChange() { const newLanguageName = languageSelect.value; if (!languagesByName[newLanguageName]) { - const url = `${LANGUAGE_BASE_URL}/tree-sitter-${newLanguageName}.wasm` + const url = `${LANGUAGE_BASE_URL}/tree-sitter-${newLanguageName}.wasm`; languageSelect.disabled = true; try { languagesByName[newLanguageName] = await TreeSitter.Language.load(url); } catch (e) { console.error(e); languageSelect.value = languageName; - return + return; } finally { languageSelect.disabled = false; } @@ -100,8 +170,8 @@ let tree; handleQueryChange(); } - async function handleCodeChange(_editor, changes) { - const newText = `${codeEditor.getValue()}\n`; + async function handleCodeChange(editor, changes) { + const newText = codeEditor.getValue() + "\n"; const edits = tree && changes && changes.map(treeEditForEditorChange); const start = performance.now(); @@ -126,16 +196,16 @@ let tree; isRendering++; const cursor = tree.walk(); - const currentRenderCount = parseCount; - let row = ''; - const rows = []; + let currentRenderCount = parseCount; + let row = ""; + let rows = []; let finishedRow = false; let visitedChildren = false; let indentLevel = 0; - for (let i = 0;; i++) { + for (let i = 0; ; i++) { if (i > 0 && i % 10000 === 0) { - await new Promise(r => setTimeout(r, 0)); + await new Promise((r) => setTimeout(r, 0)); if (parseCount !== currentRenderCount) { cursor.delete(); isRendering--; @@ -144,10 +214,14 @@ let tree; } let displayName; + let 
displayClass = 'plain'; if (cursor.nodeIsMissing) { - displayName = `MISSING ${cursor.nodeType}` + const nodeTypeText = cursor.nodeIsNamed ? cursor.nodeType : `"${cursor.nodeType}"`; + displayName = `MISSING ${nodeTypeText}`; } else if (cursor.nodeIsNamed) { displayName = cursor.nodeType; + } else if (anonymousNodes.checked) { + displayName = cursor.nodeType } if (visitedChildren) { @@ -166,7 +240,7 @@ let tree; } else { if (displayName) { if (finishedRow) { - row += ''; + row += ""; rows.push(row); finishedRow = false; } @@ -175,11 +249,23 @@ let tree; const id = cursor.nodeId; let fieldName = cursor.currentFieldName; if (fieldName) { - fieldName += ': '; + fieldName += ": "; } else { - fieldName = ''; + fieldName = ""; } - row = `
${' '.repeat(indentLevel)}${fieldName}${displayName} [${start.row}, ${start.column}] - [${end.row}, ${end.column}]`; + + const nodeClass = + displayName === 'ERROR' || displayName.startsWith('MISSING') + ? 'node-link error' + : cursor.nodeIsNamed + ? 'node-link named' + : 'node-link anonymous'; + + row = `
${" ".repeat(indentLevel)}${fieldName}` + + `` + + `${displayName} ` + + `[${start.row}, ${start.column}] - [${end.row}, ${end.column}]`; finishedRow = true; } @@ -192,7 +278,7 @@ let tree; } } if (finishedRow) { - row += '
'; + row += "
"; rows.push(row); } @@ -212,33 +298,48 @@ let tree; codeEditor.operation(() => { const marks = codeEditor.getAllMarks(); - marks.forEach(m => m.clear()); + marks.forEach((m) => m.clear()); if (tree && query) { const captures = query.captures( tree.rootNode, - {row: startRow, column: 0}, - {row: endRow, column: 0}, + { row: startRow, column: 0 }, + { row: endRow, column: 0 }, ); let lastNodeId; - for (const {name, node} of captures) { + for (const { name, node } of captures) { if (node.id === lastNodeId) continue; lastNodeId = node.id; - const {startPosition, endPosition} = node; + const { startPosition, endPosition } = node; codeEditor.markText( - {line: startPosition.row, ch: startPosition.column}, - {line: endPosition.row, ch: endPosition.column}, + { line: startPosition.row, ch: startPosition.column }, + { line: endPosition.row, ch: endPosition.column }, { inclusiveLeft: true, inclusiveRight: true, - css: `color: ${colorForCaptureName(name)}` - } + css: `color: ${colorForCaptureName(name)}`, + }, ); } } }); } + // When we change from a dark theme to a light theme (and vice versa), the colors of the + // captures need to be updated. 
+ const observer = new MutationObserver((mutations) => { + mutations.forEach((mutation) => { + if (mutation.attributeName === 'class') { + handleQueryChange(); + } + }); + }); + + observer.observe(document.documentElement, { + attributes: true, + attributeFilter: ['class'] + }); + function handleQueryChange() { if (query) { query.delete(); @@ -247,7 +348,7 @@ let tree; } queryEditor.operation(() => { - queryEditor.getAllMarks().forEach(m => m.clear()); + queryEditor.getAllMarks().forEach((m) => m.clear()); if (!queryCheckbox.checked) return; const queryText = queryEditor.getValue(); @@ -258,15 +359,15 @@ let tree; let row = 0; queryEditor.eachLine((line) => { - while (match = CAPTURE_REGEX.exec(line.text)) { + while ((match = CAPTURE_REGEX.exec(line.text))) { queryEditor.markText( - {line: row, ch: match.index}, - {line: row, ch: match.index + match[0].length}, + { line: row, ch: match.index }, + { line: row, ch: match.index + match[0].length }, { inclusiveLeft: true, inclusiveRight: true, - css: `color: ${colorForCaptureName(match[1])}` - } + css: `color: ${colorForCaptureName(match[1])}`, + }, ); } row++; @@ -275,7 +376,7 @@ let tree; const startPosition = queryEditor.posFromIndex(error.index); const endPosition = { line: startPosition.line, - ch: startPosition.ch + (error.length || Infinity) + ch: startPosition.ch + (error.length || Infinity), }; if (error.index === queryText.length) { @@ -287,16 +388,12 @@ let tree; } } - queryEditor.markText( - startPosition, - endPosition, - { - className: 'query-error', - inclusiveLeft: true, - inclusiveRight: true, - attributes: {title: error.message} - } - ); + queryEditor.markText(startPosition, endPosition, { + className: "query-error", + inclusiveLeft: true, + inclusiveRight: true, + attributes: { title: error.message }, + }); } }); @@ -308,16 +405,13 @@ let tree; if (isRendering) return; const selection = codeEditor.getDoc().listSelections()[0]; - let start = {row: selection.anchor.line, column: selection.anchor.ch}; - 
let end = {row: selection.head.line, column: selection.head.ch}; + let start = { row: selection.anchor.line, column: selection.anchor.ch }; + let end = { row: selection.head.line, column: selection.head.ch }; if ( start.row > end.row || - ( - start.row === end.row && - start.column > end.column - ) + (start.row === end.row && start.column > end.column) ) { - const swap = end; + let swap = end; end = start; start = swap; } @@ -325,12 +419,22 @@ let tree; if (treeRows) { if (treeRowHighlightedIndex !== -1) { const row = treeRows[treeRowHighlightedIndex]; - if (row) treeRows[treeRowHighlightedIndex] = row.replace('highlighted', 'plain'); + if (row) + treeRows[treeRowHighlightedIndex] = row.replace( + "highlighted", + "plain", + ); } - treeRowHighlightedIndex = treeRows.findIndex(row => row.includes(`data-id=${node.id}`)); + treeRowHighlightedIndex = treeRows.findIndex((row) => + row.includes(`data-id=${node.id}`), + ); if (treeRowHighlightedIndex !== -1) { const row = treeRows[treeRowHighlightedIndex]; - if (row) treeRows[treeRowHighlightedIndex] = row.replace('plain', 'highlighted'); + if (row) + treeRows[treeRowHighlightedIndex] = row.replace( + "plain", + "highlighted", + ); } cluster.update(treeRows); const lineHeight = cluster.options.item_height; @@ -338,26 +442,25 @@ let tree; const containerHeight = outputContainerScroll.clientHeight; const offset = treeRowHighlightedIndex * lineHeight; if (scrollTop > offset - 20) { - $(outputContainerScroll).animate({scrollTop: offset - 20}, 150); + $(outputContainerScroll).animate({ scrollTop: offset - 20 }, 150); } else if (scrollTop < offset + lineHeight + 40 - containerHeight) { - $(outputContainerScroll).animate({scrollTop: offset - containerHeight + 40}, 150); + $(outputContainerScroll).animate( + { scrollTop: offset - containerHeight + 40 }, + 150, + ); } } } function handleTreeClick(event) { - if (event.target.tagName === 'A') { + if (event.target.tagName === "A") { event.preventDefault(); - const [startRow, 
startColumn, endRow, endColumn] = event - .target - .dataset - .range - .split(',') - .map(n => parseInt(n)); + const [startRow, startColumn, endRow, endColumn] = + event.target.dataset.range.split(",").map((n) => parseInt(n)); codeEditor.focus(); codeEditor.setSelection( - {line: startRow, ch: startColumn}, - {line: endRow, ch: endColumn} + { line: startRow, ch: startColumn }, + { line: endRow, ch: endColumn }, ); } } @@ -366,9 +469,9 @@ let tree; if (loggingCheckbox.checked) { parser.setLogger((message, lexing) => { if (lexing) { - console.log(" ", message) + console.log(" ", message); } else { - console.log(message) + console.log(message); } }); } else { @@ -378,11 +481,11 @@ let tree; function handleQueryEnableChange() { if (queryCheckbox.checked) { - queryContainer.style.visibility = ''; - queryContainer.style.position = ''; + queryContainer.style.visibility = ""; + queryContainer.style.position = ""; } else { - queryContainer.style.visibility = 'hidden'; - queryContainer.style.position = 'absolute'; + queryContainer.style.visibility = "hidden"; + queryContainer.style.position = "absolute"; } handleQueryChange(); } @@ -392,48 +495,62 @@ let tree; const newLineCount = change.text.length; const lastLineLength = change.text[newLineCount - 1].length; - const startPosition = {row: change.from.line, column: change.from.ch}; - const oldEndPosition = {row: change.to.line, column: change.to.ch}; + const startPosition = { row: change.from.line, column: change.from.ch }; + const oldEndPosition = { row: change.to.line, column: change.to.ch }; const newEndPosition = { row: startPosition.row + newLineCount - 1, - column: newLineCount === 1 - ? startPosition.column + lastLineLength - : lastLineLength + column: + newLineCount === 1 + ? 
startPosition.column + lastLineLength + : lastLineLength, }; const startIndex = codeEditor.indexFromPos(change.from); let newEndIndex = startIndex + newLineCount - 1; let oldEndIndex = startIndex + oldLineCount - 1; for (let i = 0; i < newLineCount; i++) newEndIndex += change.text[i].length; - for (let i = 0; i < oldLineCount; i++) oldEndIndex += change.removed[i].length; + for (let i = 0; i < oldLineCount; i++) + oldEndIndex += change.removed[i].length; return { - startIndex, oldEndIndex, newEndIndex, - startPosition, oldEndPosition, newEndPosition + startIndex, + oldEndIndex, + newEndIndex, + startPosition, + oldEndPosition, + newEndPosition, }; } function colorForCaptureName(capture) { const id = query.captureNames.indexOf(capture); - return COLORS_BY_INDEX[id % COLORS_BY_INDEX.length]; + const isDark = document.querySelector('html').classList.contains('ayu') || + document.querySelector('html').classList.contains('coal') || + document.querySelector('html').classList.contains('navy'); + + const colors = isDark ? 
DARK_COLORS : LIGHT_COLORS; + return colors[id % colors.length]; } function loadState() { const language = localStorage.getItem("language"); const sourceCode = localStorage.getItem("sourceCode"); + const anonNodes = localStorage.getItem("anonymousNodes"); const query = localStorage.getItem("query"); const queryEnabled = localStorage.getItem("queryEnabled"); if (language != null && sourceCode != null && query != null) { queryInput.value = query; codeInput.value = sourceCode; languageSelect.value = language; - queryCheckbox.checked = (queryEnabled === 'true'); + anonymousNodes.checked = anonNodes === "true"; + queryCheckbox.checked = queryEnabled === "true"; } } function saveState() { localStorage.setItem("language", languageSelect.value); localStorage.setItem("sourceCode", codeEditor.getValue()); + localStorage.setItem("anonymousNodes", anonymousNodes.checked); saveQueryState(); } @@ -443,17 +560,18 @@ let tree; } function debounce(func, wait, immediate) { - let timeout; - return function() { - const context = this, args = arguments; - const later = function() { + var timeout; + return function () { + var context = this, + args = arguments; + var later = function () { timeout = null; if (!immediate) func.apply(context, args); }; - const callNow = immediate && !timeout; + var callNow = immediate && !timeout; clearTimeout(timeout); timeout = setTimeout(later, wait); if (callNow) func.apply(context, args); }; } -})(); +}; diff --git a/docs/book.toml b/docs/book.toml new file mode 100644 index 00000000..da1674a7 --- /dev/null +++ b/docs/book.toml @@ -0,0 +1,24 @@ +[book] +authors = [ + "Max Brunsfeld ", + "Amaan Qureshi ", +] +language = "en" +multilingual = false +src = "src" +title = "Tree-sitter" + +[output.html] +additional-css = ["assets/css/playground.css"] +additional-js = ["assets/js/playground.js"] +git-repository-url = "https://github.com/tree-sitter/tree-sitter" +git-repository-icon = "fa-github" +edit-url-template = 
"https://github.com/tree-sitter/tree-sitter/edit/master/docs/{path}" + +[output.html.search] +limit-results = 20 +use-boolean-and = true +boost-title = 2 +boost-hierarchy = 2 +boost-paragraph = 1 +expand = true diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 9657b062..00000000 --- a/docs/index.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: Introduction ---- - -# Introduction - -Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be: - -* **General** enough to parse any programming language -* **Fast** enough to parse on every keystroke in a text editor -* **Robust** enough to provide useful results even in the presence of syntax errors -* **Dependency-free** so that the runtime library (which is written in pure [C](https://github.com/tree-sitter/tree-sitter/tree/master/lib)) can be embedded in any application - -### Language Bindings - -There are currently bindings that allow Tree-sitter to be used from the following languages: - -#### Official - -* [C#](https://github.com/tree-sitter/csharp-tree-sitter) -* [Go](https://github.com/tree-sitter/go-tree-sitter) -* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) -* [Java (JDK 22)](https://github.com/tree-sitter/java-tree-sitter) -* [JavaScript (Node.js)](https://github.com/tree-sitter/node-tree-sitter) -* [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) -* [Kotlin](https://github.com/tree-sitter/kotlin-tree-sitter) -* [Python](https://github.com/tree-sitter/py-tree-sitter) -* [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) - -#### Third-party - -* [Delphi](https://github.com/modersohn/delphi-tree-sitter) -* [ELisp](https://www.gnu.org/software/emacs/manual/html_node/elisp/Parsing-Program-Source.html) -* 
[Go](https://github.com/smacker/go-tree-sitter) -* [Guile](https://github.com/Z572/guile-ts) -* [Java (Android)](https://github.com/AndroidIDEOfficial/android-tree-sitter) -* [Java (JDK 8+)](https://github.com/bonede/tree-sitter-ng) -* [Java (JDK 11+)](https://github.com/seart-group/java-tree-sitter) -* [Julia](https://github.com/MichaelHatherly/TreeSitter.jl) -* [Lua](https://github.com/euclidianAce/ltreesitter) -* [Lua](https://github.com/xcb-xwii/lua-tree-sitter) -* [OCaml](https://github.com/semgrep/ocaml-tree-sitter-core) -* [Odin](https://github.com/laytan/odin-tree-sitter) -* [Perl](https://metacpan.org/pod/Text::Treesitter) -* [R](https://github.com/DavisVaughan/r-tree-sitter) -* [Ruby](https://github.com/Faveod/ruby-tree-sitter) -* [Swift](https://github.com/ChimeHQ/SwiftTreeSitter) - -### Parsers - -A list of known parsers can be found in the [wiki](https://github.com/tree-sitter/tree-sitter/wiki/List-of-parsers). - -### Talks on Tree-sitter - -* [Strange Loop 2018](https://www.thestrangeloop.com/2018/tree-sitter---a-new-parsing-system-for-programming-tools.html) -* [FOSDEM 2018](https://www.youtube.com/watch?v=0CGzC_iss-8) -* [GitHub Universe 2017](https://www.youtube.com/watch?v=a1rC79DHpmY) - -### Underlying Research - -The design of Tree-sitter was greatly influenced by the following research papers: - -* [Practical Algorithms for Incremental Software Development Environments](https://www2.eecs.berkeley.edu/Pubs/TechRpts/1997/CSD-97-946.pdf) -* [Context Aware Scanning for Parsing Extensible Languages](https://www-users.cse.umn.edu/~evw/pubs/vanwyk07gpce/vanwyk07gpce.pdf) -* [Efficient and Flexible Incremental Parsing](https://harmonia.cs.berkeley.edu/papers/twagner-parsing.pdf) -* [Incremental Analysis of Real Programming Languages](https://harmonia.cs.berkeley.edu/papers/twagner-glr.pdf) -* [Error Detection and Recovery in LR 
Parsers](https://web.archive.org/web/20240302031213/https://what-when-how.com/compiler-writing/bottom-up-parsing-compiler-writing-part-13/) -* [Error Recovery for LR Parsers](https://apps.dtic.mil/sti/pdfs/ADA043470.pdf) diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md deleted file mode 100644 index 7b54a44e..00000000 --- a/docs/section-2-using-parsers.md +++ /dev/null @@ -1,996 +0,0 @@ ---- -title: Using Parsers -permalink: using-parsers ---- - -# Using Parsers - -All of Tree-sitter's parsing functionality is exposed through C APIs. Applications written in higher-level languages can use Tree-sitter via binding libraries like [node-tree-sitter](https://github.com/tree-sitter/node-tree-sitter) or the [tree-sitter rust crate](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust), which have their own documentation. - -This document will describe the general concepts of how to use Tree-sitter, which should be relevant regardless of what language you're using. It also goes into some C-specific details that are useful if you're using the C API directly or are building a new binding to a different language. - -All of the API functions shown here are declared and documented in the [`tree_sitter/api.h`](https://github.com/tree-sitter/tree-sitter/blob/master/lib/include/tree_sitter/api.h) header file. You may also want to browse the [online Rust API docs](https://docs.rs/tree-sitter), which correspond to the C APIs closely. - -## Getting Started - -### Building the Library - -To build the library on a POSIX system, just run `make` in the Tree-sitter directory. This will create a static library called `libtree-sitter.a` as well as dynamic libraries. - -Alternatively, you can incorporate the library in a larger project's build system by adding one source file to the build. 
This source file needs two directories to be in the include path when compiled: - -**source file:** - -- `tree-sitter/lib/src/lib.c` - -**include directories:** - -- `tree-sitter/lib/src` -- `tree-sitter/lib/include` - -### The Basic Objects - -There are four main types of objects involved when using Tree-sitter: languages, parsers, syntax trees, and syntax nodes. In C, these are called `TSLanguage`, `TSParser`, `TSTree`, and `TSNode`. - -- A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. Many languages are already available in separate git repositories within the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next page](./creating-parsers) for how to create new languages. -- A `TSParser` is a stateful object that can be assigned a `TSLanguage` and used to produce a `TSTree` based on some source code. -- A `TSTree` represents the syntax tree of an entire source code file. It contains `TSNode` instances that indicate the structure of the source code. It can also be edited and used to produce a new `TSTree` in the event that the source code changes. -- A `TSNode` represents a single node in the syntax tree. It tracks its start and end positions in the source code, as well as its relation to other nodes like its parent, siblings and children. - -### An Example Program - -Here's an example of a simple C program that uses the Tree-sitter [JSON parser](https://github.com/tree-sitter/tree-sitter-json). - -```c -// Filename - test-json-parser.c - -#include -#include -#include -#include - -// Declare the `tree_sitter_json` function, which is -// implemented by the `tree-sitter-json` library. -const TSLanguage *tree_sitter_json(void); - -int main() { - // Create a parser. - TSParser *parser = ts_parser_new(); - - // Set the parser's language (JSON in this case). 
- ts_parser_set_language(parser, tree_sitter_json()); - - // Build a syntax tree based on source code stored in a string. - const char *source_code = "[1, null]"; - TSTree *tree = ts_parser_parse_string( - parser, - NULL, - source_code, - strlen(source_code) - ); - - // Get the root node of the syntax tree. - TSNode root_node = ts_tree_root_node(tree); - - // Get some child nodes. - TSNode array_node = ts_node_named_child(root_node, 0); - TSNode number_node = ts_node_named_child(array_node, 0); - - // Check that the nodes have the expected types. - assert(strcmp(ts_node_type(root_node), "document") == 0); - assert(strcmp(ts_node_type(array_node), "array") == 0); - assert(strcmp(ts_node_type(number_node), "number") == 0); - - // Check that the nodes have the expected child counts. - assert(ts_node_child_count(root_node) == 1); - assert(ts_node_child_count(array_node) == 5); - assert(ts_node_named_child_count(array_node) == 2); - assert(ts_node_child_count(number_node) == 0); - - // Print the syntax tree as an S-expression. - char *string = ts_node_string(root_node); - printf("Syntax tree: %s\n", string); - - // Free all of the heap-allocated memory. - free(string); - ts_tree_delete(tree); - ts_parser_delete(parser); - return 0; -} -``` - -This program uses the Tree-sitter C API, which is declared in the header file `tree-sitter/api.h`, so we need to add the `tree-sitter/lib/include` directory to the include path. We also need to link `libtree-sitter.a` into the binary. We compile the source code of the JSON language directly into the binary as well. 
- -```sh -clang \ - -I tree-sitter/lib/include \ - test-json-parser.c \ - tree-sitter-json/src/parser.c \ - tree-sitter/libtree-sitter.a \ - -o test-json-parser - -./test-json-parser -``` - -## Basic Parsing - -### Providing the Code - -In the example above, we parsed source code stored in a simple string using the `ts_parser_parse_string` function: - -```c -TSTree *ts_parser_parse_string( - TSParser *self, - const TSTree *old_tree, - const char *string, - uint32_t length -); -``` - -You may want to parse source code that's stored in a custom data structure, like a [piece table](https://en.wikipedia.org/wiki/Piece_table) or a [rope](). In this case, you can use the more general `ts_parser_parse` function: - -```c -TSTree *ts_parser_parse( - TSParser *self, - const TSTree *old_tree, - TSInput input -); -``` - -The `TSInput` structure lets you provide your own function for reading a chunk of text at a given byte offset and row/column position. The function can return text encoded in either UTF8 or UTF16. This interface allows you to efficiently parse text that is stored in your own data structure. - -```c -typedef struct { - void *payload; - const char *(*read)( - void *payload, - uint32_t byte_offset, - TSPoint position, - uint32_t *bytes_read - ); - TSInputEncoding encoding; - DecodeFunction decode; -} TSInput; -``` - -In the event that you want to decode text that is not encoded in UTF-8 or UTF16, then you can set the `decode` field of the input to your function that will decode text. The signature of the `DecodeFunction` is as follows: - -```c -typedef uint32_t (*DecodeFunction)( - const uint8_t *string, - uint32_t length, - int32_t *code_point -); -``` - -The `string` argument is a pointer to the text to decode, which comes from the `read` function, and the `length` argument is the length of the `string`. The `code_point` argument is a pointer to an integer that represents the decoded code point, and should be written to in your `decode` callback. 
The function should return the number of bytes decoded. - -### Syntax Nodes - -Tree-sitter provides a [DOM](https://en.wikipedia.org/wiki/Document_Object_Model)-style interface for inspecting syntax trees. A syntax node's _type_ is a string that indicates which grammar rule the node represents. - -```c -const char *ts_node_type(TSNode); -``` - -Syntax nodes store their position in the source code both in terms of raw bytes and row/column coordinates. -In a point, rows and columns are zero-based. The `row` field represents the number of newlines before a given -position, while `column` represents the number of bytes between the position and beginning of the line. - -```c -uint32_t ts_node_start_byte(TSNode); -uint32_t ts_node_end_byte(TSNode); - -typedef struct { - uint32_t row; - uint32_t column; -} TSPoint; - -TSPoint ts_node_start_point(TSNode); -TSPoint ts_node_end_point(TSNode); -``` - -### Retrieving Nodes - -Every tree has a _root node_: - -```c -TSNode ts_tree_root_node(const TSTree *); -``` - -Once you have a node, you can access the node's children: - -```c -uint32_t ts_node_child_count(TSNode); -TSNode ts_node_child(TSNode, uint32_t); -``` - -You can also access its siblings and parent: - -```c -TSNode ts_node_next_sibling(TSNode); -TSNode ts_node_prev_sibling(TSNode); -TSNode ts_node_parent(TSNode); -``` - -These methods may all return a _null node_ to indicate, for example, that a node does not _have_ a next sibling. You can check if a node is null: - -```c -bool ts_node_is_null(TSNode); -``` - -### Named vs Anonymous Nodes - -Tree-sitter produces [_concrete_ syntax trees](https://en.wikipedia.org/wiki/Parse_tree) - trees that contain nodes for every individual token in the source code, including things like commas and parentheses. This is important for use-cases that deal with individual tokens, like [syntax highlighting](https://en.wikipedia.org/wiki/Syntax_highlighting). 
But some types of code analysis are easier to perform using an [_abstract_ syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) - a tree in which the less important details have been removed. Tree-sitter's trees support these use cases by making a distinction between _named_ and _anonymous_ nodes. - -Consider a grammar rule like this: - -```js -if_statement: ($) => seq("if", "(", $._expression, ")", $._statement); -``` - -A syntax node representing an `if_statement` in this language would have 5 children: the condition expression, the body statement, as well as the `if`, `(`, and `)` tokens. The expression and the statement would be marked as _named_ nodes, because they have been given explicit names in the grammar. But the `if`, `(`, and `)` nodes would _not_ be named nodes, because they are represented in the grammar as simple strings. - -You can check whether any given node is named: - -```c -bool ts_node_is_named(TSNode); -``` - -When traversing the tree, you can also choose to skip over anonymous nodes by using the `_named_` variants of all of the methods described above: - -```c -TSNode ts_node_named_child(TSNode, uint32_t); -uint32_t ts_node_named_child_count(TSNode); -TSNode ts_node_next_named_sibling(TSNode); -TSNode ts_node_prev_named_sibling(TSNode); -``` - -If you use this group of methods, the syntax tree functions much like an abstract syntax tree. - -### Node Field Names - -To make syntax nodes easier to analyze, many grammars assign unique _field names_ to particular child nodes. The next page [explains](./creating-parsers#using-fields) how to do this on your own grammars. If a syntax node has fields, you can access its children using their field name: - -```c -TSNode ts_node_child_by_field_name( - TSNode self, - const char *field_name, - uint32_t field_name_length -); -``` - -Fields also have numeric ids that you can use, if you want to avoid repeated string comparisons. 
You can convert between strings and ids using the `TSLanguage`: - -```c -uint32_t ts_language_field_count(const TSLanguage *); -const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId); -TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t); -``` - -The field ids can be used in place of the name: - -```c -TSNode ts_node_child_by_field_id(TSNode, TSFieldId); -``` - -## Advanced Parsing - -### Editing - -In applications like text editors, you often need to re-parse a file after its source code has changed. Tree-sitter is designed to support this use case efficiently. There are two steps required. First, you must _edit_ the syntax tree, which adjusts the ranges of its nodes so that they stay in sync with the code. - -```c -typedef struct { - uint32_t start_byte; - uint32_t old_end_byte; - uint32_t new_end_byte; - TSPoint start_point; - TSPoint old_end_point; - TSPoint new_end_point; -} TSInputEdit; - -void ts_tree_edit(TSTree *, const TSInputEdit *); -``` - -Then, you can call `ts_parser_parse` again, passing in the old tree. This will create a new tree that internally shares structure with the old tree. - -When you edit a syntax tree, the positions of its nodes will change. If you have stored any `TSNode` instances outside of the `TSTree`, you must update their positions separately, using the same `TSInput` value, in order to update their cached positions. - -```c -void ts_node_edit(TSNode *, const TSInputEdit *); -``` - -This `ts_node_edit` function is _only_ needed in the case where you have retrieved `TSNode` instances _before_ editing the tree, and then _after_ editing the tree, you want to continue to use those specific node instances. Often, you'll just want to re-fetch nodes from the edited tree, in which case `ts_node_edit` is not needed. - -### Multi-language Documents - -Sometimes, different parts of a file may be written in different languages. 
For example, templating languages like [EJS](https://ejs.co) and [ERB](https://ruby-doc.org/stdlib-2.5.1/libdoc/erb/rdoc/ERB.html) allow you to generate HTML by writing a mixture of HTML and another language like JavaScript or Ruby. - -Tree-sitter handles these types of documents by allowing you to create a syntax tree based on the text in certain _ranges_ of a file. - -```c -typedef struct { - TSPoint start_point; - TSPoint end_point; - uint32_t start_byte; - uint32_t end_byte; -} TSRange; - -void ts_parser_set_included_ranges( - TSParser *self, - const TSRange *ranges, - uint32_t range_count -); -``` - -For example, consider this ERB document: - -```erb -
    - <% people.each do |person| %> -
  • <%= person.name %>
  • - <% end %> -
-``` - -Conceptually, it can be represented by three syntax trees with overlapping ranges: an ERB syntax tree, a Ruby syntax tree, and an HTML syntax tree. You could generate these syntax trees with the following code: - -```c -#include -#include - -// These functions are each implemented in their own repo. -const TSLanguage *tree_sitter_embedded_template(void); -const TSLanguage *tree_sitter_html(void); -const TSLanguage *tree_sitter_ruby(void); - -int main(int argc, const char **argv) { - const char *text = argv[1]; - unsigned len = strlen(text); - - // Parse the entire text as ERB. - TSParser *parser = ts_parser_new(); - ts_parser_set_language(parser, tree_sitter_embedded_template()); - TSTree *erb_tree = ts_parser_parse_string(parser, NULL, text, len); - TSNode erb_root_node = ts_tree_root_node(erb_tree); - - // In the ERB syntax tree, find the ranges of the `content` nodes, - // which represent the underlying HTML, and the `code` nodes, which - // represent the interpolated Ruby. - TSRange html_ranges[10]; - TSRange ruby_ranges[10]; - unsigned html_range_count = 0; - unsigned ruby_range_count = 0; - unsigned child_count = ts_node_child_count(erb_root_node); - - for (unsigned i = 0; i < child_count; i++) { - TSNode node = ts_node_child(erb_root_node, i); - if (strcmp(ts_node_type(node), "content") == 0) { - html_ranges[html_range_count++] = (TSRange) { - ts_node_start_point(node), - ts_node_end_point(node), - ts_node_start_byte(node), - ts_node_end_byte(node), - }; - } else { - TSNode code_node = ts_node_named_child(node, 0); - ruby_ranges[ruby_range_count++] = (TSRange) { - ts_node_start_point(code_node), - ts_node_end_point(code_node), - ts_node_start_byte(code_node), - ts_node_end_byte(code_node), - }; - } - } - - // Use the HTML ranges to parse the HTML. 
- ts_parser_set_language(parser, tree_sitter_html()); - ts_parser_set_included_ranges(parser, html_ranges, html_range_count); - TSTree *html_tree = ts_parser_parse_string(parser, NULL, text, len); - TSNode html_root_node = ts_tree_root_node(html_tree); - - // Use the Ruby ranges to parse the Ruby. - ts_parser_set_language(parser, tree_sitter_ruby()); - ts_parser_set_included_ranges(parser, ruby_ranges, ruby_range_count); - TSTree *ruby_tree = ts_parser_parse_string(parser, NULL, text, len); - TSNode ruby_root_node = ts_tree_root_node(ruby_tree); - - // Print all three trees. - char *erb_sexp = ts_node_string(erb_root_node); - char *html_sexp = ts_node_string(html_root_node); - char *ruby_sexp = ts_node_string(ruby_root_node); - printf("ERB: %s\n", erb_sexp); - printf("HTML: %s\n", html_sexp); - printf("Ruby: %s\n", ruby_sexp); - return 0; -} -``` - -This API allows for great flexibility in how languages can be composed. Tree-sitter is not responsible for mediating the interactions between languages. Instead, you are free to do that using arbitrary application-specific logic. - -### Concurrency - -Tree-sitter supports multi-threaded use cases by making syntax trees very cheap to copy. - -```c -TSTree *ts_tree_copy(const TSTree *); -``` - -Internally, copying a syntax tree just entails incrementing an atomic reference count. Conceptually, it provides you a new tree which you can freely query, edit, reparse, or delete on a new thread while continuing to use the original tree on a different thread. Note that individual `TSTree` instances are _not_ thread safe; you must copy a tree if you want to use it on multiple threads simultaneously. - -## Other Tree Operations - -### Walking Trees with Tree Cursors - -You can access every node in a syntax tree using the `TSNode` APIs [described above](#retrieving-nodes), but if you need to access a large number of nodes, the fastest way to do so is with a _tree cursor_. 
A cursor is a stateful object that allows you to walk a syntax tree with maximum efficiency. - -Note that the given input node is considered the root of the cursor, and the -cursor cannot walk outside this node, so going to the parent or any sibling -of the root node will return `false`. This has no unexpected effects if the given -input node is the actual `root` node of the tree, but is something to keep in mind -when using nodes that are not the `root` node. - -You can initialize a cursor from any node: - -```c -TSTreeCursor ts_tree_cursor_new(TSNode); -``` - -You can move the cursor around the tree: - -```c -bool ts_tree_cursor_goto_first_child(TSTreeCursor *); -bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); -bool ts_tree_cursor_goto_parent(TSTreeCursor *); -``` - -These methods return `true` if the cursor successfully moved and `false` if there was no node to move to. - -You can always retrieve the cursor's current node, as well as the [field name](#node-field-names) that is associated with the current node. - -```c -TSNode ts_tree_cursor_current_node(const TSTreeCursor *); -const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); -TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *); -``` - -## Pattern Matching with Queries - -Many code analysis tasks involve searching for patterns in syntax trees. Tree-sitter provides a small declarative language for expressing these patterns and searching for matches. The language is similar to the format of Tree-sitter's [unit test system](./creating-parsers#command-test). - -### Query Syntax - -A _query_ consists of one or more _patterns_, where each pattern is an [S-expression](https://en.wikipedia.org/wiki/S-expression) that matches a certain set of nodes in a syntax tree. The expression to match a given node consists of a pair of parentheses containing two things: the node's type, and optionally, a series of other S-expressions that match the node's children. 
For example, this pattern would match any `binary_expression` node whose children are both `number_literal` nodes: - -```scheme -(binary_expression (number_literal) (number_literal)) -``` - -Children can also be omitted. For example, this would match any `binary_expression` where at least _one_ of child is a `string_literal` node: - -```scheme -(binary_expression (string_literal)) -``` - -#### Fields - -In general, it's a good idea to make patterns more specific by specifying [field names](#node-field-names) associated with child nodes. You do this by prefixing a child pattern with a field name followed by a colon. For example, this pattern would match an `assignment_expression` node where the `left` child is a `member_expression` whose `object` is a `call_expression`. - -```scheme -(assignment_expression - left: (member_expression - object: (call_expression))) -``` - -#### Negated Fields - -You can also constrain a pattern so that it only matches nodes that _lack_ a certain field. To do this, add a field name prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters: - -```scheme -(class_declaration - name: (identifier) @class_name - !type_parameters) -``` - -#### Anonymous Nodes - -The parenthesized syntax for writing nodes only applies to [named nodes](#named-vs-anonymous-nodes). To match specific anonymous nodes, you write their name between double quotes. For example, this pattern would match any `binary_expression` where the operator is `!=` and the right side is `null`: - -```scheme -(binary_expression - operator: "!=" - right: (null)) -``` - -#### Capturing Nodes - -When matching patterns, you may want to process specific nodes within the pattern. Captures allow you to associate names with specific nodes in a pattern, so that you can later refer to those nodes by those names. Capture names are written _after_ the nodes that they refer to, and start with an `@` character. 
- -For example, this pattern would match any assignment of a `function` to an `identifier`, and it would associate the name `the-function-name` with the identifier: - -```scheme -(assignment_expression - left: (identifier) @the-function-name - right: (function)) -``` - -And this pattern would match all method definitions, associating the name `the-method-name` with the method name, `the-class-name` with the containing class name: - -```scheme -(class_declaration - name: (identifier) @the-class-name - body: (class_body - (method_definition - name: (property_identifier) @the-method-name))) -``` - -#### Quantification Operators - -You can match a repeating sequence of sibling nodes using the postfix `+` and `*` _repetition_ operators, which work analogously to the `+` and `*` operators [in regular expressions](https://en.wikipedia.org/wiki/Regular_expression#Basic_concepts). The `+` operator matches _one or more_ repetitions of a pattern, and the `*` operator matches _zero or more_. - -For example, this pattern would match a sequence of one or more comments: - -```scheme -(comment)+ -``` - -This pattern would match a class declaration, capturing all of the decorators if any were present: - -```scheme -(class_declaration - (decorator)* @the-decorator - name: (identifier) @the-name) -``` - -You can also mark a node as optional using the `?` operator. For example, this pattern would match all function calls, capturing a string argument if one was present: - -```scheme -(call_expression - function: (identifier) @the-function - arguments: (arguments (string)? @the-string-arg)) -``` - -#### Grouping Sibling Nodes - -You can also use parentheses for grouping a sequence of _sibling_ nodes. For example, this pattern would match a comment followed by a function declaration: - -```scheme -( - (comment) - (function_declaration) -) -``` - -Any of the quantification operators mentioned above (`+`, `*`, and `?`) can also be applied to groups. 
For example, this pattern would match a comma-separated series of numbers: - -```scheme -( - (number) - ("," (number))* -) -``` - -#### Alternations - -An alternation is written as a pair of square brackets (`[]`) containing a list of alternative patterns. -This is similar to _character classes_ from regular expressions (`[abc]` matches either a, b, or c). - -For example, this pattern would match a call to either a variable or an object property. -In the case of a variable, capture it as `@function`, and in the case of a property, capture it as `@method`: - -```scheme -(call_expression - function: [ - (identifier) @function - (member_expression - property: (property_identifier) @method) - ]) -``` - -This pattern would match a set of possible keyword tokens, capturing them as `@keyword`: - -```scheme -[ - "break" - "delete" - "else" - "for" - "function" - "if" - "return" - "try" - "while" -] @keyword -``` - -#### Wildcard Node - -A wildcard node is represented with an underscore (`_`), it matches any node. -This is similar to `.` in regular expressions. -There are two types, `(_)` will match any named node, -and `_` will match any named or anonymous node. - -For example, this pattern would match any node inside a call: - -```scheme -(call (_) @call.inner) -``` - -#### Special Nodes - -When the parser encounters text it does not recognize, it represents this node -as `(ERROR)` in the syntax tree. These error nodes can be queried just like -normal nodes: - -```scheme -(ERROR) @error-node -``` - -Similarly, if a parser is able to recover from erroneous text by inserting a missing token and then reducing, it will insert that missing node in the final tree so long as that tree has the lowest error cost. These missing nodes appear as seemingly normal nodes in the tree, but they are zero tokens wide, and are a property of the actual terminal node that was inserted, instead of being its own kind of node. 
These special missing nodes can be queried using `(MISSING)`:

```scheme
(MISSING) @missing-node
```

This is useful when attempting to detect all syntax errors in a given parse tree, since these missing nodes are not captured by `(ERROR)` queries. Specific missing node types can also be queried:

```scheme
(MISSING identifier) @missing-identifier
(MISSING ";") @missing-semicolon
```

#### Anchors

The anchor operator, `.`, is used to constrain the ways in which child patterns are matched. It has different behaviors depending on where it's placed inside a query.

When `.` is placed before the _first_ child within a parent pattern, the child will only match when it is the first named node in the parent. For example, the below pattern matches a given `array` node at most once, assigning the `@the-element` capture to the first `identifier` node in the parent `array`:

```scheme
(array . (identifier) @the-element)
```

Without this anchor, the pattern would match once for every identifier in the array, with `@the-element` bound to each matched identifier.

Similarly, an anchor placed after a pattern's _last_ child will cause that child pattern to only match nodes that are the last named child of their parent. The below pattern matches only nodes that are the last named child within a `block`.

```scheme
(block (_) @last-expression .)
```

Finally, an anchor _between_ two child patterns will cause the patterns to only match nodes that are immediate siblings. The pattern below, given a long dotted name like `a.b.c.d`, will only match pairs of consecutive identifiers: `a, b`, `b, c`, and `c, d`.

```scheme
(dotted_name
  (identifier) @prev-id
  .
  (identifier) @next-id)
```

Without the anchor, non-consecutive pairs like `a, c` and `b, d` would also be matched.

The restrictions placed on a pattern by an anchor operator ignore anonymous nodes.
- -#### Predicates - -You can also specify arbitrary metadata and conditions associated with a pattern -by adding _predicate_ S-expressions anywhere within your pattern. Predicate S-expressions -start with a _predicate name_ beginning with a `#` character. After that, they can -contain an arbitrary number of `@`-prefixed capture names or strings. - -Tree-Sitter's CLI supports the following predicates by default: - -##### eq?, not-eq?, any-eq?, any-not-eq? - -This family of predicates allows you to match against a single capture or string -value. - -The first argument must be a capture, but the second can be either a capture to -compare the two captures' text, or a string to compare first capture's text -against. - -The base predicate is "#eq?", but its complement "#not-eq?" can be used to _not_ -match a value. - -Consider the following example targeting C: - -```scheme -((identifier) @variable.builtin - (#eq? @variable.builtin "self")) -``` - -This pattern would match any identifier that is `self`. - -And this pattern would match key-value pairs where the `value` is an identifier -with the same name as the key: - -```scheme -( - (pair - key: (property_identifier) @key-name - value: (identifier) @value-name) - (#eq? @key-name @value-name) -) -``` - -The prefix "any-" is meant for use with quantified captures. Here's -an example finding a segment of empty comments - -```scheme -((comment)+ @comment.empty - (#any-eq? @comment.empty "//")) -``` - -Note that "#any-eq?" will match a quantified capture if -_any_ of the nodes match the predicate, while by default a quantified capture -will only match if _all_ the nodes match the predicate. - -##### match?, not-match?, any-match?, any-not-match? - -These predicates are similar to the eq? predicates, but they use regular expressions -to match against the capture's text. - -The first argument must be a capture, and the second must be a string containing -a regular expression. 

For example, this pattern would match any identifier whose name is written in `SCREAMING_SNAKE_CASE`:

```scheme
((identifier) @constant
  (#match? @constant "^[A-Z][A-Z_]+"))
```

Here's an example finding potential documentation comments in C:

```scheme
((comment)+ @comment.documentation
  (#match? @comment.documentation "^///\\s+.*"))
```

Here's another example finding Cgo comments to potentially inject with C:

```scheme
((comment)+ @injection.content
  .
  (import_declaration
    (import_spec path: (interpreted_string_literal) @_import_c))
  (#eq? @_import_c "\"C\"")
  (#match? @injection.content "^//"))
```

##### any-of?, not-any-of?

The "any-of?" predicate allows you to match a capture against multiple strings,
and will match if the capture's text is equal to any of the strings.

Consider this example that targets JavaScript:

```scheme
((identifier) @variable.builtin
  (#any-of? @variable.builtin
    "arguments"
    "module"
    "console"
    "window"
    "document"))
```

This will match any of the builtin variables in JavaScript.

_Note_ — Predicates are not handled directly by the Tree-sitter C library.
They are just exposed in a structured form so that higher-level code can perform
the filtering. However, higher-level bindings to Tree-sitter like
[the Rust Crate](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust)
or the [WebAssembly binding](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web)
do implement a few common predicates like the `#eq?`, `#match?`, and `#any-of?`
predicates explained above.
- -To recap about the predicates Tree-Sitter's bindings support: - -- `#eq?` checks for a direct match against a capture or string -- `#match?` checks for a match against a regular expression -- `#any-of?` checks for a match against a list of strings -- Adding `not-` to the beginning of any of these predicates will negate the match -- By default, a quantified capture will only match if _all_ of the nodes match the predicate -- Adding `any-` before the `eq` or `match` predicates will instead match if any of the nodes match the predicate - - -### The Query API - -Create a query by specifying a string containing one or more patterns: - -```c -TSQuery *ts_query_new( - const TSLanguage *language, - const char *source, - uint32_t source_len, - uint32_t *error_offset, - TSQueryError *error_type -); -``` - -If there is an error in the query, then the `error_offset` argument will be set to the byte offset of the error, and the `error_type` argument will be set to a value that indicates the type of error: - -```c -typedef enum { - TSQueryErrorNone = 0, - TSQueryErrorSyntax, - TSQueryErrorNodeType, - TSQueryErrorField, - TSQueryErrorCapture, -} TSQueryError; -``` - -The `TSQuery` value is immutable and can be safely shared between threads. To execute the query, create a `TSQueryCursor`, which carries the state needed for processing the queries. The query cursor should not be shared between threads, but can be reused for many query executions. 
- -```c -TSQueryCursor *ts_query_cursor_new(void); -``` - -You can then execute the query on a given syntax node: - -```c -void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode); -``` - -You can then iterate over the matches: - -```c -typedef struct { - TSNode node; - uint32_t index; -} TSQueryCapture; - -typedef struct { - uint32_t id; - uint16_t pattern_index; - uint16_t capture_count; - const TSQueryCapture *captures; -} TSQueryMatch; - -bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match); -``` - -This function will return `false` when there are no more matches. Otherwise, it will populate the `match` with data about which pattern matched and which nodes were captured. - -## Static Node Types - -In languages with static typing, it can be helpful for syntax trees to provide specific type information about individual syntax nodes. Tree-sitter makes this information available via a generated file called `node-types.json`. This _node types_ file provides structured data about every possible syntax node in a grammar. - -You can use this data to generate type declarations in statically-typed programming languages. For example, GitHub's [Semantic](https://github.com/github/semantic) uses these node types files to [generate Haskell data types](https://github.com/github/semantic/tree/master/semantic-ast) for every possible syntax node, which allows for code analysis algorithms to be structurally verified by the Haskell type system. - -The node types file contains an array of objects, each of which describes a particular type of syntax node using the following entries: - -#### Basic Info - -Every object in this array has these two entries: - -- `"type"` - A string that indicates which grammar rule the node represents. This corresponds to the `ts_node_type` function described [above](#syntax-nodes). -- `"named"` - A boolean that indicates whether this kind of node corresponds to a rule name in the grammar or just a string literal. 
See [above](#named-vs-anonymous-nodes) for more info. - -Examples: - -```json -{ - "type": "string_literal", - "named": true -} -{ - "type": "+", - "named": false -} -``` - -Together, these two fields constitute a unique identifier for a node type; no two top-level objects in the `node-types.json` should have the same values for both `"type"` and `"named"`. - -#### Internal Nodes - -Many syntax nodes can have _children_. The node type object describes the possible children that a node can have using the following entries: - -- `"fields"` - An object that describes the possible [fields](#node-field-names) that the node can have. The keys of this object are field names, and the values are _child type_ objects, described below. -- `"children"` - Another _child type_ object that describes all of the node's possible _named_ children _without_ fields. - -A _child type_ object describes a set of child nodes using the following entries: - -- `"required"` - A boolean indicating whether there is always _at least one_ node in this set. -- `"multiple"` - A boolean indicating whether there can be _multiple_ nodes in this set. -- `"types"`- An array of objects that represent the possible types of nodes in this set. Each object has two keys: `"type"` and `"named"`, whose meanings are described above. 
- -Example with fields: - -```json -{ - "type": "method_definition", - "named": true, - "fields": { - "body": { - "multiple": false, - "required": true, - "types": [{ "type": "statement_block", "named": true }] - }, - "decorator": { - "multiple": true, - "required": false, - "types": [{ "type": "decorator", "named": true }] - }, - "name": { - "multiple": false, - "required": true, - "types": [ - { "type": "computed_property_name", "named": true }, - { "type": "property_identifier", "named": true } - ] - }, - "parameters": { - "multiple": false, - "required": true, - "types": [{ "type": "formal_parameters", "named": true }] - } - } -} -``` - -Example with children: - -```json -{ - "type": "array", - "named": true, - "fields": {}, - "children": { - "multiple": true, - "required": false, - "types": [ - { "type": "_expression", "named": true }, - { "type": "spread_element", "named": true } - ] - } -} -``` - -#### Supertype Nodes - -In Tree-sitter grammars, there are usually certain rules that represent abstract _categories_ of syntax nodes (e.g. "expression", "type", "declaration"). In the `grammar.js` file, these are often written as [hidden rules](./creating-parsers#hiding-rules) whose definition is a simple [`choice`](./creating-parsers#the-grammar-dsl) where each member is just a single symbol. - -Normally, hidden rules are not mentioned in the node types file, since they don't appear in the syntax tree. But if you add a hidden rule to the grammar's [`supertypes` list](./creating-parsers#the-grammar-dsl), then it _will_ show up in the node types file, with the following special entry: - -- `"subtypes"` - An array of objects that specify the _types_ of nodes that this 'supertype' node can wrap. 
- -Example: - -```json -{ - "type": "_declaration", - "named": true, - "subtypes": [ - { "type": "class_declaration", "named": true }, - { "type": "function_declaration", "named": true }, - { "type": "generator_function_declaration", "named": true }, - { "type": "lexical_declaration", "named": true }, - { "type": "variable_declaration", "named": true } - ] -} -``` - -Supertype nodes will also appear elsewhere in the node types file, as children of other node types, in a way that corresponds with how the supertype rule was used in the grammar. This can make the node types much shorter and easier to read, because a single supertype will take the place of multiple subtypes. - -Example: - -```json -{ - "type": "export_statement", - "named": true, - "fields": { - "declaration": { - "multiple": false, - "required": false, - "types": [{ "type": "_declaration", "named": true }] - }, - "source": { - "multiple": false, - "required": false, - "types": [{ "type": "string", "named": true }] - } - } -} -``` diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md deleted file mode 100644 index f2fd43cb..00000000 --- a/docs/section-3-creating-parsers.md +++ /dev/null @@ -1,1127 +0,0 @@ ---- -title: Creating Parsers -permalink: creating-parsers ---- - -# Creating parsers - -Developing Tree-sitter grammars can have a difficult learning curve, but once you get the hang of it, it can be fun and even zen-like. This document will help you to get started and to develop a useful mental model. - -## Getting Started - -### Dependencies - -In order to develop a Tree-sitter parser, there are two dependencies that you need to install: - -* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. You'll need Node.js version 6.0 or greater. 

* **A C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform.

### Installation

To create a Tree-sitter parser, you need to use [the `tree-sitter` CLI][tree-sitter-cli]. You can install the CLI in a few different ways:

* Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager. This works on any platform. See [the contributing docs](./contributing#developing-tree-sitter) for more information.
* Install the `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. This approach is fast, but only works on certain platforms, because it relies on pre-built binaries.
* Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`.

### Project Setup

The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language.

```sh
mkdir tree-sitter-${YOUR_LANGUAGE_NAME}
cd tree-sitter-${YOUR_LANGUAGE_NAME}
```

You can use the `tree-sitter` CLI tool to set up your project, which allows your parser to be used from multiple languages.
- -```sh -# This will prompt you for input -tree-sitter init -``` - -Once you have installed the CLI and run through the `init` command's prompts, a file called `grammar.js` should exist with the following contents: - -```js -/// -// @ts-check - -module.exports = grammar({ - name: 'YOUR_LANGUAGE_NAME', - - rules: { - // TODO: add the actual grammar rules - source_file: $ => 'hello' - } -}); -``` - -Now, run the following command: - -```sh -tree-sitter generate -``` - -This will generate the C code required to parse this trivial language, as well as a few files that are needed to compile and load this native parser as a Node.js module. - -You can test this parser by creating a source file with the contents "hello" and parsing it: - -```sh -echo 'hello' > example-file -tree-sitter parse example-file -``` - -Alternatively, in Windows PowerShell: - -```pwsh -"hello" | Out-File example-file -Encoding utf8 -tree-sitter parse example-file -``` - -This should print the following: - -```text -(source_file [0, 0] - [1, 0]) -``` - -You now have a working parser. - -Finally, look back at the [triple-slash][] and [`@ts-check`][ts-check] comments in `grammar.js`; these tell your editor to provide documentation and type information as you edit your grammar. For these to work, you must download Tree-sitter's TypeScript API from npm into a `node_modules` directory in your project: - -```sh -npm install -``` - -## Tool Overview - -Let's go over all of the functionality of the `tree-sitter` command line tool. - -### Command: `init` - -The first command you will likely run is the `init` command. This command sets up an empty repository with everything you need to get going with a grammar repository. -It only has one optional argument, `--update`, which will update outdated generated files, if needed. - -The main file of interest for users to configure is `tree-sitter.json`, which tells the CLI information about your grammar, such as the queries. 
- -#### Structure of `tree-sitter.json` - -##### The `grammars` field - -This field is an array of objects, you typically only need one object in this array, unless your repo has multiple grammars (e.g. like `Typescript` and `TSX`) - -###### Basics - -These keys specify basic information about the parser: - -* `scope` (required) - A string like `"source.js"` that identifies the language. Currently, we strive to match the scope names used by popular [TextMate grammars](https://macromates.com/manual/en/language_grammars) and by the [Linguist](https://github.com/github/linguist) library. - -* `path` - A relative path from the directory containing `tree-sitter.json` to another directory containing the `src/` folder, which contains the actual generated parser. The default value is `"."` (so that `src/` is in the same folder as `tree-sitter.json`), and this very rarely needs to be overridden. - -* `external-files` - A list of relative paths from the root dir of a -parser to files that should be checked for modifications during recompilation. -This is useful during development to have changes to other files besides scanner.c -be picked up by the cli. - -###### Language Detection - -These keys help to decide whether the language applies to a given file: - -* `file-types` - An array of filename suffix strings. The grammar will be used for files whose names end with one of these suffixes. Note that the suffix may match an *entire* filename. - -* `first-line-regex` - A regex pattern that will be tested against the first line of a file in order to determine whether this language applies to the file. If present, this regex will be used for any file whose language does not match any grammar's `file-types`. - -* `content-regex` - A regex pattern that will be tested against the contents of the file in order to break ties in cases where multiple grammars matched the file using the above two criteria. 
If the regex matches, this grammar will be preferred over another grammar with no `content-regex`. If the regex does not match, a grammar with no `content-regex` will be preferred over this one.

* `injection-regex` - A regex pattern that will be tested against a *language name* in order to determine whether this language should be used for a potential *language injection* site. Language injection is described in more detail in [a later section](#language-injection).

###### Query Paths

These keys specify relative paths from the directory containing `tree-sitter.json` to the files that control syntax highlighting:

* `highlights` - Path to a *highlight query*. Default: `queries/highlights.scm`
* `locals` - Path to a *local variable query*. Default: `queries/locals.scm`.
* `injections` - Path to an *injection query*. Default: `queries/injections.scm`.
* `tags` - Path to a *tag query*. Default: `queries/tags.scm`.

The behaviors of these files are described in the next section.

##### The `metadata` field

This field contains information that tree-sitter will use to populate relevant bindings' files, especially their versions. A future
`bump-version` and `publish` subcommand will leverage this version information as well. Typically, this will all be set up when you
run `tree-sitter init`, but you are welcome to update it as you see fit.

* `version` (required) - The current version of your grammar, which should follow [semver](https://semver.org)
* `license` - The license of your grammar, which should be a valid [SPDX license](https://spdx.org/licenses)
* `description` - The brief description of your grammar
* `authors` (required) - An array of objects that contain a `name` field, and optionally an `email` and `url` field. Each field is a string
* `links` - An object that contains a `repository` field, and optionally a `homepage` field.
Each field is a string -* `namespace` - The namespace for the `Java` and `Kotlin` bindings, defaults to `io.github.tree-sitter` if not provided - -##### The `bindings` field - -This field controls what bindings are generated when the `init` command is run. Each key is a language name, and the value is a boolean. - -* `c` (default: `true`) -* `go` (default: `true`) -* `java` (default: `false`) -* `kotlin` (default: `false`) -* `node` (default: `true`) -* `python` (default: `true`) -* `rust` (default: `true`) -* `swift` (default: `false`) - -### Command: `version` - -The `version` command prints the version of the `tree-sitter` CLI tool that you have installed. - -```sh -tree-sitter version 1.0.0 -``` - -The only argument is the version itself, which is the first positional argument. -This will update the version in several files, if they exist: - -* tree-sitter.json -* Cargo.toml -* package.json -* Makefile -* CMakeLists.txt -* pyproject.toml - -As a grammar author, you should keep the version of your grammar in sync across -different bindings. However, doing so manually is error-prone and tedious, so -this command takes care of the burden. - -### Command: `generate` - -The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter generate` again. - -The first time you run `tree-sitter generate`, it will also generate a few other files for bindings for the following languages: - -#### C/C++ - -* `Makefile` - This file tells `make` how to compile your language. -* `bindings/c/tree-sitter-language.h` - This file provides the C interface of your language. -* `bindings/c/tree-sitter-language.pc` - This file provides pkg-config metadata about your language's C library. 
-* `src/tree_sitter/parser.h` - This file provides some basic C definitions that are used in your generated `parser.c` file. -* `src/tree_sitter/alloc.h` - This file provides some memory allocation macros that are to be used in your external scanner, if you have one. -* `src/tree_sitter/array.h` - This file provides some array macros that are to be used in your external scanner, if you have one. - -#### Go - -* `bindings/go/binding.go` - This file wraps your language in a Go module. -* `bindings/go/binding_test.go` - This file contains a test for the Go package. - -#### Node - -* `binding.gyp` - This file tells Node.js how to compile your language. -* `bindings/node/index.js` - This is the file that Node.js initially loads when using your language. -* `bindings/node/binding.cc` - This file wraps your language in a JavaScript module for Node.js. - -#### Python - -* `pyproject.toml` - This file is the manifest of the Python package. -* `setup.py` - This file tells Python how to compile your language. -* `bindings/python/binding.c` - This file wraps your language in a Python module. -* `bindings/python/tree_sitter_language/__init__.py` - This file tells Python how to load your language. -* `bindings/python/tree_sitter_language/__init__.pyi` - This file provides type hints for your parser when used in Python. -* `bindings/python/tree_sitter_language/py.typed` - This file provides type hints for your parser when used in Python. - -#### Rust - -* `Cargo.toml` - This file is the manifest of the Rust package. -* `bindings/rust/lib.rs` - This file wraps your language in a Rust crate when used in Rust. -* `bindings/rust/build.rs` - This file wraps the building process for the Rust crate. - -#### Swift - -* `Package.swift` - This file tells Swift how to compile your language. -* `bindings/swift/TreeSitterLanguage/language.h` - This file wraps your language in a Swift module when used in Swift. 
- -If there is an ambiguity or *local ambiguity* in your grammar, Tree-sitter will detect it during parser generation, and it will exit with a `Unresolved conflict` error message. See below for more information on these errors. - -### Command: `build` - -The `build` command compiles your parser into a dynamically-loadable library, either as a shared object (`.so`, `.dylib`, or `.dll`) or as a WASM module. - -You can change the compiler executable via the `CC` environment variable and add extra flags via `CFLAGS`. For macOS or iOS, you can set `MACOSX_DEPLOYMENT_TARGET` or `IPHONEOS_DEPLOYMENT_TARGET` respectively to define the minimum supported version. - -You can specify whether to compile it as a wasm module with the `--wasm`/`-w` flag, and you can opt to use docker or podman to supply emscripten with the `--docker`/`-d` flag. This removes the need to install emscripten on your machine locally. - -You can specify where to output the shared object file (native or WASM) with the `--output`/`-o` flag, which accepts either an absolute path or relative path. Note that if you don't supply this flag, the CLI will attempt to figure out what the language name is based on the parent directory (so building in `tree-sitter-javascript` will resolve to `javascript`) to use for the output file. If it can't figure it out, it will default to `parser`, thus generating `parser.so` or `parser.wasm` in the current working directory. - -Lastly, you can also specify a path to the actual grammar directory, in case you are not currently in one. This is done by providing a path as the first *positional* argument. - -Example: - -```sh -tree-sitter build --wasm --output ./build/parser.wasm tree-sitter-javascript -``` - -Notice how the `tree-sitter-javascript` argument is the first positional argument. - -### Command: `test` - -The `tree-sitter test` command allows you to easily test that your parser is working correctly. 
- -For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look when parsing that rule. These tests are written using specially-formatted text files in the `test/corpus/` directory within your parser's root folder. - -For example, you might have a file called `test/corpus/statements.txt` that contains a series of entries like this: - -```text -================== -Return statements -================== - -func x() int { - return 1; -} - ---- - -(source_file - (function_definition - (identifier) - (parameter_list) - (primitive_type) - (block - (return_statement (number))))) -``` - -* The **name** of each test is written between two lines containing only `=` (equal sign) characters. -* Then the **input source code** is written, followed by a line containing three or more `-` (dash) characters. -* Then, the **expected output syntax tree** is written as an [S-expression][s-exp]. The exact placement of whitespace in the S-expression doesn't matter, but ideally the syntax tree should be legible. Note that the S-expression does not show syntax nodes like `func`, `(` and `;`, which are expressed as strings and regexes in the grammar. It only shows the *named* nodes, as described in [this section][named-vs-anonymous-nodes-section] of the page on parser usage. - - The expected output section can also *optionally* show the [*field names*][field-names-section] associated with each child node. 
To include field names in your tests, you write a node's field name followed by a colon, before the node itself in the S-expression: - -```text -(source_file - (function_definition - name: (identifier) - parameters: (parameter_list) - result: (primitive_type) - body: (block - (return_statement (number))))) -``` - -* If your language's syntax conflicts with the `===` and `---` test separators, you can optionally add an arbitrary identical suffix (in the below example, `|||`) to disambiguate them: - -```text -==================||| -Basic module -==================||| - ----- MODULE Test ---- -increment(n) == n + 1 -==== - ----||| - -(source_file - (module (identifier) - (operator (identifier) - (parameter_list (identifier)) - (plus (identifier_ref) (number))))) -``` - -These tests are important. They serve as the parser's API documentation, and they can be run every time you change the grammar to verify that everything still parses correctly. - -By default, the `tree-sitter test` command runs all of the tests in your `test/corpus/` folder. To run a particular test, you can use the `-f` flag: - -```sh -tree-sitter test -f 'Return statements' -``` - -The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `test/corpus` directory. It's typically a good idea to test all of the permutations of each language construct. This increases test coverage, but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language. - -#### Attributes - -Tests can be annotated with a few `attributes`. Attributes must be put in the header, below the test name, and start with a `:`. -A couple of attributes also take in a parameter, which require the use of parenthesis. - -**Note**: If you'd like to supply in multiple parameters, e.g. to run tests on multiple platforms or to test multiple languages, you can repeat the attribute on a new line. 
- -The following attributes are available: - -* `:skip` — This attribute will skip the test when running `tree-sitter test`. - This is useful when you want to temporarily disable running a test without deleting it. -* `:error` — This attribute will assert that the parse tree contains an error. It's useful to just validate that a certain input is invalid without displaying the whole parse tree, as such you should omit the parse tree below the `---` line. -* `:fail-fast` — This attribute will stop the testing additional tests if the test marked with this attribute fails. -* `:language(LANG)` — This attribute will run the tests using the parser for the specified language. This is useful for multi-parser repos, such as XML and DTD, or Typescript and TSX. The default parser used will always be the first entry in the `grammars` field in the `tree-sitter.json` config file, so having a way to pick a second or even third parser is useful. -* `:platform(PLATFORM)` — This attribute specifies the platform on which the test should run. It is useful to test platform-specific behavior (e.g. Windows newlines are different from Unix). This attribute must match up with Rust's [`std::env::consts::OS`](https://doc.rust-lang.org/std/env/consts/constant.OS.html). 
- -Examples using attributes: - -```text -========================= -Test that will be skipped -:skip -========================= - -int main() {} - -------------------------- - -==================================== -Test that will run on Linux or macOS - -:platform(linux) -:platform(macos) -==================================== - -int main() {} - ------------------------------------- - -======================================================================== -Test that expects an error, and will fail fast if there's no parse error -:fail-fast -:error -======================================================================== - -int main ( {} - ------------------------------------------------------------------------- - -================================================= -Test that will parse with both Typescript and TSX -:language(typescript) -:language(tsx) -================================================= - -console.log('Hello, world!'); - -------------------------------------------------- -``` - -#### Automatic Compilation - -You might notice that the first time you run `tree-sitter test` after regenerating your parser, it takes some extra time. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. It recompiles your parser as-needed whenever you update it by re-running `tree-sitter generate`. - -#### Syntax Highlighting Tests - -The `tree-sitter test` command will *also* run any syntax highlighting tests in the `test/highlight` folder, if it exists. For more information about syntax highlighting tests, see [the syntax highlighting page][syntax-highlighting-tests]. - -### Command: `parse` - -You can run your parser on an arbitrary file using `tree-sitter parse`. 
This will print the resulting the syntax tree, including nodes' ranges and field names, like this: - -```text -(source_file [0, 0] - [3, 0] - (function_declaration [0, 0] - [2, 1] - name: (identifier [0, 5] - [0, 9]) - parameters: (parameter_list [0, 9] - [0, 11]) - result: (type_identifier [0, 12] - [0, 15]) - body: (block [0, 16] - [2, 1] - (return_statement [1, 2] - [1, 10] - (expression_list [1, 9] - [1, 10] - (int_literal [1, 9] - [1, 10])))))) -``` - -You can pass any number of file paths and glob patterns to `tree-sitter parse`, and it will parse all of the given files. The command will exit with a non-zero status code if any parse errors occurred. Passing the `--cst` flag will output a pretty-printed CST instead of the normal S-expression representation. You can also prevent the syntax trees from being printed using the `--quiet` flag. Additionally, the `--stat` flag prints out aggregated parse success/failure information for all processed files. This makes `tree-sitter parse` usable as a secondary testing strategy: you can check that a large number of files parse without error: - -```sh -tree-sitter parse 'examples/**/*.go' --quiet --stat -``` - -### Command: `highlight` - -You can run syntax highlighting on an arbitrary file using `tree-sitter highlight`. This can either output colors directly to your terminal using ansi escape codes, or produce HTML (if the `--html` flag is passed). For more information, see [the syntax highlighting page][syntax-highlighting]. - -### The Grammar DSL - -The following is a complete list of built-in functions you can use in your `grammar.js` to define rules. Use-cases for some of these functions will be explained in more detail in later sections. - -* **Symbols (the `$` object)** - Every grammar rule is written as a JavaScript function that takes a parameter conventionally called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule. 
Names starting with `$.MISSING` or `$.UNEXPECTED` should be avoided as they have special meaning for the `tree-sitter test` command. -* **String and Regex literals** - The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing regular expressions in your grammar. -* **Regex Limitations** - Currently, only a subset of the Regex engine is actually -supported. This is due to certain features like lookahead and lookaround assertions -not feasible to use in an LR(1) grammar, as well as certain flags being unnecessary -for tree-sitter. However, plenty of features are supported by default: - - * Character classes - * Character ranges - * Character sets - * Quantifiers - * Alternation - * Grouping - * Unicode character escapes - * Unicode property escapes - -* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf]. -* **Alternatives : `choice(rule1, rule2, ...)`** - This function creates a rule that matches *one* of a set of possible rules. The order of the arguments does not matter. This is analogous to the `|` (pipe) operator in EBNF notation. -* **Repetitions : `repeat(rule)`** - This function creates a rule that matches *zero-or-more* occurrences of a given rule. It is analogous to the `{x}` (curly brace) syntax in EBNF notation. -* **Repetitions : `repeat1(rule)`** - This function creates a rule that matches *one-or-more* occurrences of a given rule. The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used. 
-* **Options : `optional(rule)`** - This function creates a rule that matches *zero or one* occurrence of a given rule. It is analogous to the `[x]` (square bracket) syntax in EBNF notation. -* **Precedence : `prec(number, rule)`** - This function marks the given rule with a numerical precedence which will be used to resolve [*LR(1) Conflicts*][lr-conflict] at parser-generation time. When two rules overlap in a way that represents either a true ambiguity or a *local* ambiguity given one token of lookahead, Tree-sitter will try to resolve the conflict by matching the rule with the higher precedence. The default precedence of all rules is zero. This works similarly to the [precedence directives][yacc-prec] in Yacc grammars. -* **Left Associativity : `prec.left([number], rule)`** - This function marks the given rule as left-associative (and optionally applies a numerical precedence). When an LR(1) conflict arises in which all of the rules have the same numerical precedence, Tree-sitter will consult the rules' associativity. If there is a left-associative rule, Tree-sitter will prefer matching a rule that ends *earlier*. This works similarly to [associativity directives][yacc-prec] in Yacc grammars. -* **Right Associativity : `prec.right([number], rule)`** - This function is like `prec.left`, but it instructs Tree-sitter to prefer matching a rule that ends *later*. -* **Dynamic Precedence : `prec.dynamic(number, rule)`** - This function is similar to `prec`, but the given numerical precedence is applied at *runtime* instead of at parser generation time. This is only necessary when handling a conflict dynamically using the `conflicts` field in the grammar, and when there is a genuine *ambiguity*: multiple rules correctly match a given piece of code. In that event, Tree-sitter compares the total dynamic precedence associated with each rule, and selects the one with the highest total. This is similar to [dynamic precedence directives][bison-dprec] in Bison grammars. 
-* **Tokens : `token(rule)`** - This function marks the given rule as producing only -a single token. Tree-sitter's default is to treat each String or RegExp literal -in the grammar as a separate token. Each token is matched separately by the lexer -and returned as its own leaf node in the tree. The `token` function allows you to -express a complex rule using the functions described above (rather than as a single -regular expression) but still have Tree-sitter treat it as a single token. -The token function will only accept terminal rules, so `token($.foo)` will not work. -You can think of it as a shortcut for squashing complex rules of strings or regexes -down to a single token. -* **Immediate Tokens : `token.immediate(rule)`** - Usually, whitespace (and any other extras, such as comments) is optional before each token. This function means that the token will only match if there is no whitespace. -* **Aliases : `alias(rule, name)`** - This function causes the given rule to *appear* with an alternative name in the syntax tree. If `name` is a *symbol*, as in `alias($.foo, $.bar)`, then the aliased rule will *appear* as a [named node][named-vs-anonymous-nodes-section] called `bar`. And if `name` is a *string literal*, as in `alias($.foo, 'bar')`, then the aliased rule will appear as an [anonymous node][named-vs-anonymous-nodes-section], as if the rule had been written as the simple string. -* **Field Names : `field(name, rule)`** - This function assigns a *field name* to the child node(s) matched by the given rule. In the resulting syntax tree, you can then use that field name to access specific children. -* **Reserved Keywords : `reserved(wordset, rule)`** - This function will override the global reserved word set with the one passed into the `wordset` parameter. This is useful for contextual keywords, such as `if` in JavaScript, which cannot be used as a variable name in most contexts, but can be used as a property name. 
- -In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior of the parser. - -* **`extras`** - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar. -* **`inline`** - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime. -* **`conflicts`** - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*. -* **`externals`** - an array of token names which can be returned by an [*external scanner*](#external-scanners). External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions. -* **`precedences`** - an array of arrays of strings, where each array of strings defines named precedence levels in descending order. These names can be used in the `prec` functions to define precedence relative only to other names in the array, rather than globally. Can only be used with parse precedence, not lexical precedence. -* **`word`** - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization. 
-* **`supertypes`** - an array of hidden rule names which should be considered to be 'supertypes' in the generated [*node types* file][static-node-types]. -* **`reserved`** - similar in structure to the main `rules` property, an object of reserved word sets associated with an array of reserved rules. The reserved rule in the array must be a terminal token - meaning it must be a string, regex, or token, or a terminal rule. The *first* reserved word set in the object is the global word set, meaning it applies to every rule in every parse state. However, certain keywords are contextual, depending on the rule. For example, in JavaScript, keywords are typically not allowed as ordinary variables, however, they *can* be used as a property name. In this situation, the `reserved` function would be used, and the word set to pass in would be the name of the word set that is declared in the `reserved` object that coreesponds an empty array, signifying *no* keywords are reserved. - -## Writing the Grammar - -Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties: - -1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. - -2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. 
This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications. - -It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth. - -### The First Few Rules - -It's usually a good idea to find a formal specification for the language you're trying to parse. This specification will most likely contain a context-free grammar. As you read through the rules of this CFG, you will probably discover a complex and cyclic graph of relationships. It might be unclear how you should navigate this graph as you define your grammar. - -Although languages have very different constructs, their constructs can often be categorized in to similar groups like *Declarations*, *Definitions*, *Statements*, *Expressions*, *Types*, and *Patterns*. In writing your grammar, a good first step is to create just enough structure to include all of these basic *groups* of symbols. For a language like Go, you might start with something like this: - -```js -{ - // ... 
- - rules: { - source_file: $ => repeat($._definition), - - _definition: $ => choice( - $.function_definition - // TODO: other kinds of definitions - ), - - function_definition: $ => seq( - 'func', - $.identifier, - $.parameter_list, - $._type, - $.block - ), - - parameter_list: $ => seq( - '(', - // TODO: parameters - ')' - ), - - _type: $ => choice( - 'bool' - // TODO: other kinds of types - ), - - block: $ => seq( - '{', - repeat($._statement), - '}' - ), - - _statement: $ => choice( - $.return_statement - // TODO: other kinds of statements - ), - - return_statement: $ => seq( - 'return', - $._expression, - ';' - ), - - _expression: $ => choice( - $.identifier, - $.number - // TODO: other kinds of expressions - ), - - identifier: $ => /[a-z]+/, - - number: $ => /\d+/ - } -} -``` - -One important fact to know up front is that the start rule for the grammar is the first property in the `rules` object. -In the example above, that would correspond to `source_file`, but it can be named anything. - -Some of the details of this grammar will be explained in more depth later on, but if you focus on the `TODO` comments, you can see that the overall strategy is *breadth-first*. Notably, this initial skeleton does not need to directly match an exact subset of the context-free grammar in the language specification. It just needs to touch on the major groupings of rules in as simple and obvious a way as possible. - -With this structure in place, you can now freely decide what part of the grammar to flesh out next. For example, you might decide to start with *types*. One-by-one, you could define the rules for writing basic types and composing them into more complex types: - -```js -{ - // ... 
- - _type: $ => choice( - $.primitive_type, - $.array_type, - $.pointer_type - ), - - primitive_type: $ => choice( - 'bool', - 'int' - ), - - array_type: $ => seq( - '[', - ']', - $._type - ), - - pointer_type: $ => seq( - '*', - $._type - ) -} -``` - -After developing the *type* sublanguage a bit further, you might decide to switch to working on *statements* or *expressions* instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`. - -**And remember to add tests for each rule in your `test/corpus` folder!** - -### Structuring Rules Well - -Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. Naively, you might try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. To illustrate the problem with this approach, consider the following line of code: - -```js -return x + y; -``` - -According to the specification, this line is a `ReturnStatement`, the fragment `x + y` is an `AdditiveExpression`, and `x` and `y` are both `IdentifierReferences`. 
The relationship between these constructs is captured by a complex series of production rules: - -```text -ReturnStatement -> 'return' Expression -Expression -> AssignmentExpression -AssignmentExpression -> ConditionalExpression -ConditionalExpression -> LogicalORExpression -LogicalORExpression -> LogicalANDExpression -LogicalANDExpression -> BitwiseORExpression -BitwiseORExpression -> BitwiseXORExpression -BitwiseXORExpression -> BitwiseANDExpression -BitwiseANDExpression -> EqualityExpression -EqualityExpression -> RelationalExpression -RelationalExpression -> ShiftExpression -ShiftExpression -> AdditiveExpression -AdditiveExpression -> MultiplicativeExpression -MultiplicativeExpression -> ExponentiationExpression -ExponentiationExpression -> UnaryExpression -UnaryExpression -> UpdateExpression -UpdateExpression -> LeftHandSideExpression -LeftHandSideExpression -> NewExpression -NewExpression -> MemberExpression -MemberExpression -> PrimaryExpression -PrimaryExpression -> IdentifierReference -``` - -The language spec encodes the twenty different precedence levels of JavaScript expressions using twenty levels of indirection between `IdentifierReference` and `Expression`. If we were to create a concrete syntax tree representing this statement according to the language spec, it would have twenty levels of nesting, and it would contain nodes with names like `BitwiseXORExpression`, which are unrelated to the actual code. - -### Using Precedence - -To produce a readable syntax tree, we'd like to model JavaScript expressions using a much flatter structure like this: - -```js -{ - // ... - - _expression: $ => choice( - $.identifier, - $.unary_expression, - $.binary_expression, - // ... - ), - - unary_expression: $ => choice( - seq('-', $._expression), - seq('!', $._expression), - // ... - ), - - binary_expression: $ => choice( - seq($._expression, '*', $._expression), - seq($._expression, '+', $._expression), - // ... 
- ), -} -``` - -Of course, this flat structure is highly ambiguous. If we try to generate a parser, Tree-sitter gives us an error message: - -```text -Error: Unresolved conflict for symbol sequence: - - '-' _expression • '*' … - -Possible interpretations: - - 1: '-' (binary_expression _expression • '*' _expression) - 2: (unary_expression '-' _expression) • '*' … - -Possible resolutions: - - 1: Specify a higher precedence in `binary_expression` than in the other rules. - 2: Specify a higher precedence in `unary_expression` than in the other rules. - 3: Specify a left or right associativity in `unary_expression` - 4: Add a conflict for these rules: `binary_expression` `unary_expression` -``` - -Note: The • character in the error message indicates where exactly during -parsing the conflict occurs, or in other words, where the parser is encountering -ambiguity. - -For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This is where the `prec` function [described above](#the-grammar-dsl) comes into play. By wrapping a rule with `prec`, we can indicate that certain sequence of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`: - -```js -{ - // ... - - unary_expression: $ => prec(2, choice( - seq('-', $._expression), - seq('!', $._expression), - // ... 
- )) -} -``` - -### Using Associativity - -Applying a higher precedence in `unary_expression` fixes that conflict, but there is still another conflict: - -```text -Error: Unresolved conflict for symbol sequence: - - _expression '*' _expression • '*' … - -Possible interpretations: - - 1: _expression '*' (binary_expression _expression • '*' _expression) - 2: (binary_expression _expression '*' _expression) • '*' … - -Possible resolutions: - - 1: Specify a left or right associativity in `binary_expression` - 2: Add a conflict for these rules: `binary_expression` -``` - -For an expression like `a * b * c`, it's not clear whether we mean `a * (b * c)` or `(a * b) * c`. This is where `prec.left` and `prec.right` come into use. We want to select the second interpretation, so we use `prec.left`. - -```js -{ - // ... - - binary_expression: $ => choice( - prec.left(2, seq($._expression, '*', $._expression)), - prec.left(1, seq($._expression, '+', $._expression)), - // ... - ), -} -``` - -### Hiding Rules - -You may have noticed in the above examples that some of the grammar rule name like `_expression` and `_type` began with an underscore. Starting a rule's name with an underscore causes the rule to be *hidden* in the syntax tree. This is useful for rules like `_expression` in the grammars above, which always just wrap a single child node. If these nodes were not hidden, they would add substantial depth and noise to the syntax tree without making it any easier to understand. - -### Using Fields - -Often, it's easier to analyze a syntax node if you can refer to its children by *name* instead of by their position in an ordered list. Tree-sitter grammars support this using the `field` function. 
This function allows you to assign unique names to some or all of a node's children: - -```js -function_definition: $ => seq( - 'func', - field('name', $.identifier), - field('parameters', $.parameter_list), - field('return_type', $._type), - field('body', $.block) -) -``` - -Adding fields like this allows you to retrieve nodes using the [field APIs][field-names-section]. - -## Lexical Analysis - -Tree-sitter's parsing process is divided into two phases: parsing (which is described above) and [lexing][lexing] - the process of grouping individual characters into the language's fundamental *tokens*. There are a few important things to know about how Tree-sitter's lexing works. - -### Conflicting Tokens - -Grammars often contain multiple tokens that can match the same characters. For example, a grammar might contain the tokens (`"if"` and `/[a-z]+/`). Tree-sitter differentiates between these conflicting tokens in a few ways. - -1. **Context-aware Lexing** - Tree-sitter performs lexing on-demand, during the parsing process. At any given position in a source document, the lexer only tries to recognize tokens that are *valid* at that position in the document. - -2. **Lexical Precedence** - When the precedence functions described [above](#the-grammar-dsl) are used *within* the `token` function, the given explicit precedence values serve as instructions to the lexer. If there are two valid tokens that match the characters at a given position in the document, Tree-sitter will select the one with the higher precedence. - -3. **Match Length** - If multiple valid tokens with the same precedence match the characters at a given position in a document, Tree-sitter will select the token that matches the [longest sequence of characters][longest-match]. - -4. 
**Match Specificity** - If there are two valid tokens with the same precedence and which both match the same number of characters, Tree-sitter will prefer a token that is specified in the grammar as a `String` over a token specified as a `RegExp`. - -5. **Rule Order** - If none of the above criteria can be used to select one token over another, Tree-sitter will prefer the token that appears earlier in the grammar. - -If there is an external scanner it may have [an additional impact](#other-external-scanner-details) over regular tokens defined in the grammar. - -### Lexical Precedence vs. Parse Precedence - -One common mistake involves not distinguishing *lexical precedence* from *parse precedence*. Parse precedence determines which rule is chosen to interpret a given sequence of tokens. *Lexical precedence* determines which token is chosen to interpret at a given position of text and it is a lower-level operation that is done first. The above list fully captures Tree-sitter's lexical precedence rules, and you will probably refer back to this section of the documentation more often than any other. Most of the time when you really get stuck, you're dealing with a lexical precedence problem. Pay particular attention to the difference in meaning between using `prec` inside of the `token` function versus outside of it. The *lexical precedence* syntax is `token(prec(N, ...))`. - -### Keywords - -Many languages have a set of *keyword* tokens (e.g. `if`, `for`, `return`), as well as a more general token (e.g. `identifier`) that matches any word, including many of the keyword strings. 
For example, JavaScript has a keyword `instanceof`, which is used as a binary operator, like this: - -```js -if (a instanceof Something) b(); -``` - -The following, however, is not valid JavaScript: - -```js -if (a instanceofSomething) b(); -``` - -A keyword like `instanceof` cannot be followed immediately by another letter, because then it would be tokenized as an `identifier`, **even though an identifier is not valid at that position**. Because Tree-sitter uses context-aware lexing, as described [above](#conflicting-tokens), it would not normally impose this restriction. By default, Tree-sitter would recognize `instanceofSomething` as two separate tokens: the `instanceof` keyword followed by an `identifier`. - -### Keyword Extraction - -Fortunately, Tree-sitter has a feature that allows you to fix this, so that you can match the behavior of other standard parsers: the `word` token. If you specify a `word` token in your grammar, Tree-sitter will find the set of *keyword* tokens that match strings also matched by the `word` token. Then, during lexing, instead of matching each of these keywords individually, Tree-sitter will match the keywords via a two-step process where it *first* matches the `word` token. - -For example, suppose we added `identifier` as the `word` token in our JavaScript grammar: - -```js -grammar({ - name: 'javascript', - - word: $ => $.identifier, - - rules: { - _expression: $ => choice( - $.identifier, - $.unary_expression, - $.binary_expression - // ... - ), - - binary_expression: $ => choice( - prec.left(1, seq($._expression, 'instanceof', $._expression)) - // ... - ), - - unary_expression: $ => choice( - prec.left(2, seq('typeof', $._expression)) - // ... - ), - - identifier: $ => /[a-z_]+/ - } -}); -``` - -Tree-sitter would identify `typeof` and `instanceof` as keywords. 
Then, when parsing the invalid code above, rather than scanning for the `instanceof` token individually, it would scan for an `identifier` first, and find `instanceofSomething`. It would then correctly recognize the code as invalid. - -Aside from improving error detection, keyword extraction also has performance benefits. It allows Tree-sitter to generate a smaller, simpler lexing function, which means that **the parser will compile much more quickly**. - -### External Scanners - -Many languages have some tokens whose structure is impossible or inconvenient to describe with a regular expression. Some examples: - -* [Indent and dedent][indent-tokens] tokens in Python -* [Heredocs][heredoc] in Bash and Ruby -* [Percent strings][percent-string] in Ruby - -Tree-sitter allows you to handle these kinds of tokens using *external scanners*. An external scanner is a set of C functions that you, the grammar author, can write by hand in order to add custom logic for recognizing certain tokens. - -To use an external scanner, there are a few steps. First, add an `externals` section to your grammar. This section should list the names of all of your external tokens. These names can then be used elsewhere in your grammar. - -```js -grammar({ - name: 'my_language', - - externals: $ => [ - $.indent, - $.dedent, - $.newline - ], - - // ... -}); -``` - -Then, add another C source file to your project. Currently, its path must be `src/scanner.c` for the CLI to recognize it. Be sure to add this file to the `sources` section of your `binding.gyp` file so that it will be included when your project is compiled by Node.js and uncomment the appropriate block in your `bindings/rust/build.rs` file so that it will be included in your Rust crate. - -In this new source file, define an [`enum`][enum] type containing the names of all of your external tokens. The ordering of this enum must match the order in your grammar's `externals` array; the actual names do not matter. 
- -```c -#include "tree_sitter/parser.h" -#include "tree_sitter/alloc.h" -#include "tree_sitter/array.h" - -enum TokenType { - INDENT, - DEDENT, - NEWLINE -} -``` - -Finally, you must define five functions with specific names, based on your language's name and five actions: *create*, *destroy*, *serialize*, *deserialize*, and *scan*. - -#### Create - -```c -void *tree_sitter_my_language_external_scanner_create(void) { - // ... -} -``` - -This function should create your scanner object. It will only be called once anytime your language is set on a parser. Often, you will want to allocate memory on the heap and return a pointer to it. If your external scanner doesn't need to maintain any state, it's ok to return `NULL`. - -#### Destroy - -```c -void tree_sitter_my_language_external_scanner_destroy(void *payload) { - // ... -} -``` - -This function should free any memory used by your scanner. It is called once when a parser is deleted or assigned a different language. It receives as an argument the same pointer that was returned from the *create* function. If your *create* function didn't allocate any memory, this function can be a noop. - -#### Serialize - -```c -unsigned tree_sitter_my_language_external_scanner_serialize( - void *payload, - char *buffer -) { - // ... -} -``` - -This function should copy the complete state of your scanner into a given byte buffer, and return the number of bytes written. The function is called every time the external scanner successfully recognizes a token. It receives a pointer to your scanner and a pointer to a buffer. The maximum number of bytes that you can write is given by the `TREE_SITTER_SERIALIZATION_BUFFER_SIZE` constant, defined in the `tree_sitter/parser.h` header file. - -The data that this function writes will ultimately be stored in the syntax tree so that the scanner can be restored to the right state when handling edits or ambiguities. 
For your parser to work correctly, the `serialize` function must store its entire state, and `deserialize` must restore the entire state. For good performance, you should design your scanner so that its state can be serialized as quickly and compactly as possible. - -#### Deserialize - -```c -void tree_sitter_my_language_external_scanner_deserialize( - void *payload, - const char *buffer, - unsigned length -) { - // ... -} -``` - -This function should *restore* the state of your scanner based the bytes that were previously written by the `serialize` function. It is called with a pointer to your scanner, a pointer to the buffer of bytes, and the number of bytes that should be read. -It is good practice to explicitly erase your scanner state variables at the start of this function, before restoring their values from the byte buffer. - -#### Scan - -```c -bool tree_sitter_my_language_external_scanner_scan( - void *payload, - TSLexer *lexer, - const bool *valid_symbols -) { - // ... -} -``` - -This function is responsible for recognizing external tokens. It should return `true` if a token was recognized, and `false` otherwise. It is called with a "lexer" struct with the following fields: - -* **`int32_t lookahead`** - The current next character in the input stream, represented as a 32-bit unicode code point. -* **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. -* **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace; whitespace won't be included in the text range associated with tokens emitted by the external scanner. -* **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. 
By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. -* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this function by reading from the start of the line. -* **`bool (*is_at_included_range_start)(const TSLexer *)`** - A function for checking whether the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), the scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. -* **`bool (*eof)(const TSLexer *)`** - A function for determining whether the lexer is at the end of the file. The value of `lookahead` will be `0` at the end of a file, but this function should be used instead of checking for that value because the `0` or "NUL" value is also a valid character that could be present in the file being parsed. -* **`void (*log)(const TSLexer *, const char * format, ...)`** - A `printf`-like function for logging. The log is viewable through e.g. `tree-sitter parse --debug` or the browser's console after checking the `log` option in the [Playground](./playground). - -The third argument to the `scan` function is an array of booleans that indicates which of external tokens are currently expected by the parser. 
You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic. - -```c -if (valid_symbols[INDENT] || valid_symbols[DEDENT]) { - - // ... logic that is common to both `INDENT` and `DEDENT` - - if (valid_symbols[INDENT]) { - - // ... logic that is specific to `INDENT` - - lexer->result_symbol = INDENT; - return true; - } -} -``` - -#### External Scanner Helpers - -##### Allocator - -Instead of using libc's `malloc`, `calloc`, `realloc`, and `free`, you should use the versions prefixed with `ts_` from `tree_sitter/alloc.h`. -These macros can allow a potential consumer to override the default allocator with their own implementation, but by default will use the libc functions. - -As a consumer of the tree-sitter core library as well as any parser libraries that might use allocations, you can enable overriding the default allocator and have it use the same one as the library allocator, of which you can set with `ts_set_allocator`. -To enable this overriding in scanners, you must compile them with the `TREE_SITTER_REUSE_ALLOCATOR` macro defined, and tree-sitter the library must be linked into your final app dynamically, since it needs to resolve the internal functions at runtime. If you are compiling -an executable binary that uses the core library, but want to load parsers dynamically at runtime, then you will have to use a special linker flag on Unix. For non-Darwin systems, that would be `--dynamic-list` and for Darwin systems, that would be `-exported_symbols_list`. -The CLI does exactly this, so you can use it as a reference (check out `cli/build.rs`). - -For example, assuming you wanted to allocate 100 bytes for your scanner, you'd do so like the following example: - -```c -#include "tree_sitter/parser.h" -#include "tree_sitter/alloc.h" - -// ... 
- -void *tree_sitter_my_language_external_scanner_create(void) { - return ts_calloc(100, 1); // or ts_malloc(100) -} - -// ... - -``` - -##### Arrays - -If you need to use array-like types in your scanner, such as tracking a stack of indentations or tags, you should use the array macros from `tree_sitter/array.h`. - -There are quite a few of them provided for you, but here's how you could get started tracking some . Check out the header itself for more detailed documentation. - -**NOTE**: Do not use any of the array functions or macros that are prefixed with an underscore and have comments saying that it is not what you are looking for. -These are internal functions used as helpers by other macros that are public. They are not meant to be used directly, nor are they what you want. - -```c -#include "tree_sitter/parser.h" -#include "tree_sitter/array.h" - -enum TokenType { - INDENT, - DEDENT, - NEWLINE, - STRING, -} - -// Create the array in your create function - -void *tree_sitter_my_language_external_scanner_create(void) { - return ts_calloc(1, sizeof(Array(int))); - - // or if you want to zero out the memory yourself - - Array(int) *stack = ts_malloc(sizeof(Array(int))); - array_init(&stack); - return stack; -} - -bool tree_sitter_my_language_external_scanner_scan( - void *payload, - TSLexer *lexer, - const bool *valid_symbols -) { - Array(int) *stack = payload; - if (valid_symbols[INDENT]) { - array_push(stack, lexer->get_column(lexer)); - lexer->result_symbol = INDENT; - return true; - } - if (valid_symbols[DEDENT]) { - array_pop(stack); // this returns the popped element by value, but we don't need it - lexer->result_symbol = DEDENT; - return true; - } - - // we can also use an array on the stack to keep track of a string - - Array(char) next_string = array_new(); - - if (valid_symbols[STRING] && lexer->lookahead == '"') { - lexer->advance(lexer, false); - while (lexer->lookahead != '"' && lexer->lookahead != '\n' && !lexer->eof(lexer)) { - 
array_push(&next_string, lexer->lookahead); - lexer->advance(lexer, false); - } - - // assume we have some arbitrary constraint of not having more than 100 characters in a string - if (lexer->lookahead == '"' && next_string.size <= 100) { - lexer->advance(lexer, false); - lexer->result_symbol = STRING; - return true; - } - } - - return false; -} - -``` - -#### Other External Scanner Details - -If a token in the `externals` array is valid at a given position in the parse, the external scanner will be called first before anything else is done. This means the external scanner functions as a powerful override of Tree-sitter's lexing behavior, and can be used to solve problems that can't be cracked with ordinary lexical, parse, or dynamic precedence. - -If a syntax error is encountered during regular parsing, Tree-sitter's first action during error recovery will be to call the external scanner's `scan` function with all tokens marked valid. The scanner should detect this case and handle it appropriately. One simple method of detection is to add an unused token to the end of the `externals` array, for example `externals: $ => [$.token1, $.token2, $.error_sentinel]`, then check whether that token is marked valid to determine whether Tree-sitter is in error correction mode. - -If you put terminal keywords in the `externals` array, for example `externals: $ => ['if', 'then', 'else']`, then any time those terminals are present in the grammar they will be tokenized by the external scanner. It is similar to writing `externals: [$.if_keyword, $.then_keyword, $.else_keyword]` then using `alias($.if_keyword, 'if')` in the grammar. - -If in the `externals` array use literal keywords then lexing works in two steps, the external scanner will be called first and if it sets a resulting token and returns `true` then the token considered as recognized and Tree-sitter moves to a next token. 
But the external scanner may return `false` and in this case Tree-sitter fallbacks to the internal lexing mechanism. - -In case of some keywords defined in the `externals` array in a rule referencing form like `$.if_keyword` and there is no additional definition of that rule in the grammar rules, e.g., `if_keyword: $ => 'if'` then fallback to the internal lexer isn't possible because Tree-sitter doesn't know the actual keyword and it's fully the external scanner responsibilty to recognize such tokens. - -External scanners are a common cause of infinite loops. -Be very careful when emitting zero-width tokens from your external scanner, and if you consume characters in a loop be sure use the `eof` function to check whether you are at the end of the file. - -[ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar -[antlr]: https://www.antlr.org -[bison-dprec]: https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html -[bison]: https://en.wikipedia.org/wiki/GNU_bison -[cargo]: https://doc.rust-lang.org/cargo/getting-started/installation.html -[crate]: https://crates.io/crates/tree-sitter-cli -[cst]: https://en.wikipedia.org/wiki/Parse_tree -[ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form -[ecmascript-spec]: https://262.ecma-international.org/6.0/ -[ejs]: https://ejs.co -[enum]: https://en.wikipedia.org/wiki/Enumerated_type#C -[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser -[heredoc]: https://en.wikipedia.org/wiki/Here_document -[indent-tokens]: https://en.wikipedia.org/wiki/Off-side_rule -[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification -[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis -[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch -[lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables -[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser -[multi-language-section]: ./using-parsers#multi-language-documents 
-[named-vs-anonymous-nodes-section]: ./using-parsers#named-vs-anonymous-nodes -[field-names-section]: ./using-parsers#node-field-names -[node-module]: https://www.npmjs.com/package/tree-sitter-cli -[node.js]: https://nodejs.org -[static-node-types]: ./using-parsers#static-node-types -[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols -[npm]: https://docs.npmjs.com -[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) -[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar -[percent-string]: https://docs.ruby-lang.org/en/2.5.0/doc/syntax/literals_rdoc.html#label-Percent+Strings -[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest -[s-exp]: https://en.wikipedia.org/wiki/S-expression -[syntax-highlighting]: ./syntax-highlighting -[syntax-highlighting-tests]: ./syntax-highlighting#unit-testing -[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter/tree/master/cli -[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript -[triple-slash]: https://www.typescriptlang.org/docs/handbook/triple-slash-directives.html -[ts-check]: https://www.typescriptlang.org/docs/handbook/intro-to-js-ts.html -[yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html -[yacc]: https://en.wikipedia.org/wiki/Yacc diff --git a/docs/section-4-syntax-highlighting.md b/docs/section-4-syntax-highlighting.md deleted file mode 100644 index 67c2bc25..00000000 --- a/docs/section-4-syntax-highlighting.md +++ /dev/null @@ -1,493 +0,0 @@ ---- -title: Syntax Highlighting -permalink: syntax-highlighting ---- - -# Syntax Highlighting - -Syntax highlighting is a very common feature in applications that deal with code. Tree-sitter has built-in support for syntax highlighting, via the [`tree-sitter-highlight`](https://github.com/tree-sitter/tree-sitter/tree/master/highlight) library, which is currently used on GitHub.com for highlighting code written in several languages. 
You can also perform syntax highlighting at the command line using the `tree-sitter highlight` command. - -This document explains how the Tree-sitter syntax highlighting system works, using the command line interface. If you are using `tree-sitter-highlight` library (either from C or from Rust), all of these concepts are still applicable, but the configuration data is provided using in-memory objects, rather than files. - -## Overview - -All of the files needed to highlight a given language are normally included in the same git repository as the Tree-sitter grammar for that language (for example, [`tree-sitter-javascript`](https://github.com/tree-sitter/tree-sitter-javascript), [`tree-sitter-ruby`](https://github.com/tree-sitter/tree-sitter-ruby)). In order to run syntax highlighting from the command-line, three types of files are needed: - -1. Per-user configuration in `~/.config/tree-sitter/config.json` -2. Language configuration in grammar repositories' `tree-sitter.json` files. -3. Tree queries in the grammars repositories' `queries` folders. - -For an example of the language-specific files, see the [`tree-sitter.json` file](https://github.com/tree-sitter/tree-sitter-ruby/blob/master/tree-sitter.json) and [`queries` directory](https://github.com/tree-sitter/tree-sitter-ruby/tree/master/queries) in the `tree-sitter-ruby` repository. The following sections describe the behavior of each file. - -## Per-user Configuration - -The Tree-sitter CLI automatically creates two directories in your home folder. One holds a JSON configuration file, that lets you customize the behavior of the CLI. The other holds any compiled language parsers that you use. 
- -These directories are created in the "normal" place for your platform: - -* On Linux, `~/.config/tree-sitter` and `~/.cache/tree-sitter` -* On Mac, `~/Library/Application Support/tree-sitter` and `~/Library/Caches/tree-sitter` -* On Windows, `C:\Users\[username]\AppData\Roaming\tree-sitter` and `C:\Users\[username]\AppData\Local\tree-sitter` - -The CLI will work if there's no config file present, falling back on default values for each configuration option. To create a config file that you can edit, run this command: - -```sh -tree-sitter init-config -``` - -(This will print out the location of the file that it creates so that you can easily find and modify it.) - -### Paths - -The `tree-sitter highlight` command takes one or more file paths, and tries to automatically determine which language should be used to highlight those files. In order to do this, it needs to know *where* to look for Tree-sitter grammars on your filesystem. You can control this using the `"parser-directories"` key in your configuration file: - -```json -{ - "parser-directories": [ - "/Users/my-name/code", - "/Users/my-name/other-code" - ] -} -``` - -Currently, any folder within one of these *parser directories* whose name begins with `tree-sitter-` will be treated as a Tree-sitter grammar repository. - -### Theme - -The Tree-sitter highlighting system works by annotating ranges of source code with logical "highlight names" like `function.method`, `type.builtin`, `keyword`, etc. In order to decide what *color* should be used for rendering each highlight, a *theme* is needed. - -In your config file, the `"theme"` value is an object whose keys are dot-separated highlight names like `function.builtin` or `keyword`, and whose values are JSON expressions that represent text styling parameters. - -### Parse Theme - -The Tree-sitter `parse` command will output a pretty-printed CST when the `--cst` option is used. 
You can control which colors are used for various parts of the tree in your configuration file. Note that omitting a field will cause the relevant text to be rendered with its default color. - -```json5 -{ - "parse-theme": { - // The color of node kinds - "node-kind": [20, 20, 20], - // The color of text associated with a node - "node-text": [255, 255, 255], - // The color of node fields - "field": [42, 42, 42], - // The color of the range information for unnamed nodes - "row-color": [255, 255, 255], - // The color of the range information for named nodes - "row-color-named": [255, 130, 0], - // The color of extra nodes - "extra": [255, 0, 255], - // The color of ERROR nodes - "error": [255, 0, 0], - // The color of MISSING nodes and their associated text - "missing": [153, 75, 0], - // The color of newline characters - "line-feed": [150, 150, 150], - // The color of backtick characters - "backtick": [0, 200, 0], - // The color of literals - "literal": [0, 0, 200], - } -} -``` - -#### Highlight Names - -A theme can contain multiple keys that share a common subsequence. Examples: - -* `variable` and `variable.parameter` -* `function`, `function.builtin`, and `function.method` - -For a given highlight produced, styling will be determined based on the **longest matching theme key**. For example, the highlight `function.builtin.static` would match the key `function.builtin` rather than `function`. - -#### Styling Values - -Styling values can be any of the following: - -* Integers from 0 to 255, representing ANSI terminal color ids. -* Strings like `"#e45649"` representing hexadecimal RGB colors. -* Strings naming basic ANSI colors like `"red"`, `"black"`, `"purple"`, or `"cyan"`. -* Objects with the following keys: - * `color` - An integer or string as described above. - * `underline` - A boolean indicating whether the text should be underlined. - * `italic` - A boolean indicating whether the text should be italicized. 
- * `bold` - A boolean indicating whether the text should be bold-face. - -## Language Configuration - -The `tree-sitter.json` file is used by the Tree-sitter CLI. Within this file, the CLI looks for data nested under the top-level `"grammars"` key. This key is expected to contain an array of objects with the following keys: - -### Basics - -These keys specify basic information about the parser: - -* `scope` (required) - A string like `"source.js"` that identifies the language. Currently, we strive to match the scope names used by popular [TextMate grammars](https://macromates.com/manual/en/language_grammars) and by the [Linguist](https://github.com/github/linguist) library. - -* `path` (optional) - A relative path from the directory containing `tree-sitter.json` to another directory containing the `src/` folder, which contains the actual generated parser. The default value is `"."` (so that `src/` is in the same folder as `tree-sitter.json`), and this very rarely needs to be overridden. - -* `external-files` (optional) - A list of relative paths from the root dir of a -parser to files that should be checked for modifications during recompilation. -This is useful during development to have changes to other files besides scanner.c -be picked up by the cli. - -### Language Detection - -These keys help to decide whether the language applies to a given file: - -* `file-types` - An array of filename suffix strings. The grammar will be used for files whose names end with one of these suffixes. Note that the suffix may match an *entire* filename. - -* `first-line-regex` - A regex pattern that will be tested against the first line of a file in order to determine whether this language applies to the file. If present, this regex will be used for any file whose language does not match any grammar's `file-types`. 
- -* `content-regex` - A regex pattern that will be tested against the contents of the file in order to break ties in cases where multiple grammars matched the file using the above two criteria. If the regex matches, this grammar will be preferred over another grammar with no `content-regex`. If the regex does not match, a grammar with no `content-regex` will be preferred over this one. - -* `injection-regex` - A regex pattern that will be tested against a *language name* in order to determine whether this language should be used for a potential *language injection* site. Language injection is described in more detail in [a later section](#language-injection). - -### Query Paths - -These keys specify relative paths from the directory containing `tree-sitter.json` to the files that control syntax highlighting: - -* `highlights` - Path to a *highlight query*. Default: `queries/highlights.scm` -* `locals` - Path to a *local variable query*. Default: `queries/locals.scm`. -* `injections` - Path to an *injection query*. Default: `queries/injections.scm`. - -The behaviors of these three files are described in the next section. - -### Example - -Typically, the `"tree-sitter"` array only needs to contain one object, which only needs to specify a few keys: - -```json -{ - "tree-sitter": [ - { - "scope": "source.ruby", - "file-types": [ - "rb", - "gemspec", - "Gemfile", - "Rakefile" - ], - "first-line-regex": "#!.*\\bruby$" - } - ] -} -``` - -## Queries - -Tree-sitter's syntax highlighting system is based on *tree queries*, which are a general system for pattern-matching on Tree-sitter's syntax trees. See [this section](./using-parsers#pattern-matching-with-queries) of the documentation for more information about tree queries. - -Syntax highlighting is controlled by *three* different types of query files that are usually included in the `queries` folder. The default names for the query files use the `.scm` file. 
We chose this extension because it commonly used for files written in [Scheme](https://en.wikipedia.org/wiki/Scheme_%28programming_language%29), a popular dialect of Lisp, and these query files use a Lisp-like syntax. - -Alternatively, you can think of `.scm` as an acronym for "Source Code Matching". - -### Highlights - -The most important query is called the highlights query. The highlights query uses *captures* to assign arbitrary *highlight names* to different nodes in the tree. Each highlight name can then be mapped to a color (as described [above](#theme)). Commonly used highlight names include `keyword`, `function`, `type`, `property`, and `string`. Names can also be dot-separated like `function.builtin`. - -#### Example Input - -For example, consider the following Go code: - -```go -func increment(a int) int { - return a + 1 -} -``` - -With this syntax tree: - -```scheme -(source_file - (function_declaration - name: (identifier) - parameters: (parameter_list - (parameter_declaration - name: (identifier) - type: (type_identifier))) - result: (type_identifier) - body: (block - (return_statement - (expression_list - (binary_expression - left: (identifier) - right: (int_literal))))))) -``` - -#### Example Query - -Suppose we wanted to render this code with the following colors: - -* keywords `func` and `return` in purple -* function `increment` in blue -* type `int` in green -* number `5` brown - -We can assign each of these categories a *highlight name* using a query like this: - -```scheme -; highlights.scm - -"func" @keyword -"return" @keyword -(type_identifier) @type -(int_literal) @number -(function_declaration name: (identifier) @function) -``` - -Then, in our config file, we could map each of these highlight names to a color: - -```json -{ - "theme": { - "keyword": "purple", - "function": "blue", - "type": "green", - "number": "brown" - } -} -``` - -#### Result - -Running `tree-sitter highlight` on this Go file would produce output like this: - -
-func increment(a int) int {
-    return a + 1
-}
-
- -### Local Variables - -Good syntax highlighting helps the reader to quickly distinguish between the different types of *entities* in their code. Ideally, if a given entity appears in *multiple* places, it should be colored the same in each place. The Tree-sitter syntax highlighting system can help you to achieve this by keeping track of local scopes and variables. - -The *local variables* query is different from the highlights query in that, while the highlights query uses *arbitrary* capture names which can then be mapped to colors, the locals variable query uses a fixed set of capture names, each of which has a special meaning. - -The capture names are as follows: - -* `@local.scope` - indicates that a syntax node introduces a new local scope. -* `@local.definition` - indicates that a syntax node contains the *name* of a definition within the current local scope. -* `@local.reference` - indicates that a syntax node contains the *name* which *may* refer to an earlier definition within some enclosing scope. - -When highlighting a file, Tree-sitter will keep track of the set of scopes that contains any given position, and the set of definitions within each scope. When processing a syntax node that is captured as a `local.reference`, Tree-sitter will try to find a definition for a name that matches the node's text. If it finds a match, Tree-sitter will ensure that the *reference* and the *definition* are colored the same. - -The information produced by this query can also be *used* by the highlights query. You can *disable* a pattern for nodes which have been identified as local variables by adding the predicate `(#is-not? local)` to the pattern. 
This is used in the example below: - -#### Example Input - -Consider this Ruby code: - -```ruby -def process_list(list) - context = current_context - list.map do |item| - process_item(item, context) - end -end - -item = 5 -list = [item] -``` - -With this syntax tree: - -```scheme -(program - (method - name: (identifier) - parameters: (method_parameters - (identifier)) - (assignment - left: (identifier) - right: (identifier)) - (method_call - method: (call - receiver: (identifier) - method: (identifier)) - block: (do_block - (block_parameters - (identifier)) - (method_call - method: (identifier) - arguments: (argument_list - (identifier) - (identifier)))))) - (assignment - left: (identifier) - right: (integer)) - (assignment - left: (identifier) - right: (array - (identifier)))) -``` - -There are several different types of names within this method: - -* `process_list` is a method. -* Within this method, `list` is a formal parameter -* `context` is a local variable. -* `current_context` is *not* a local variable, so it must be a method. -* Within the `do` block, `item` is a formal parameter -* Later on, `item` and `list` are both local variables (not formal parameters). - -#### Example Queries - -Let's write some queries that let us clearly distinguish between these types of names. First, set up the highlighting query, as described in the previous section. We'll assign distinct colors to method calls, method definitions, and formal parameters: - -```scheme -; highlights.scm - -(call method: (identifier) @function.method) -(method_call method: (identifier) @function.method) - -(method name: (identifier) @function.method) - -(method_parameters (identifier) @variable.parameter) -(block_parameters (identifier) @variable.parameter) - -((identifier) @function.method - (#is-not? local)) -``` - -Then, we'll set up a local variable query to keep track of the variables and scopes. 
Here, we're indicating that methods and blocks create local *scopes*, parameters and assignments create *definitions*, and other identifiers should be considered *references*: - -```scheme -; locals.scm - -(method) @local.scope -(do_block) @local.scope - -(method_parameters (identifier) @local.definition) -(block_parameters (identifier) @local.definition) - -(assignment left:(identifier) @local.definition) - -(identifier) @local.reference -``` - -#### Result - -Running `tree-sitter highlight` on this ruby file would produce output like this: - -
-def process_list(list)
-  context = current_context
-  list.map do |item|
-    process_item(item, context)
-  end
-end
-
-item = 5
-list = [item]
-
- -### Language Injection - -Some source files contain code written in multiple different languages. Examples include: - -* HTML files, which can contain JavaScript inside of ` - -{% if jekyll.environment == "development" %} - - -{% else %} - - -{% endif %} - - - diff --git a/docs/section-8-code-navigation-systems.md b/docs/section-8-code-navigation-systems.md deleted file mode 100644 index 04346e46..00000000 --- a/docs/section-8-code-navigation-systems.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -title: Code Navigation Systems -permalink: code-navigation-systems ---- - -# Code Navigation Systems - -Tree-sitter can be used in conjunction with its [tree query language](https://tree-sitter.github.io/tree-sitter/using-parsers#pattern-matching-with-queries) as a part of code navigation systems. An example of such a system can be seen in the `tree-sitter tags` command, which emits a textual dump of the interesting syntactic nodes in its file argument. A notable application of this is GitHub's support for [search-based code navigation](https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github#precise-and-search-based-navigation). This document exists to describe how to integrate with such systems, and how to extend this functionality to any language with a Tree-sitter grammar. - -## Tagging and captures - -_Tagging_ is the act of identifying the entities that can be named in a program. We use Tree-sitter queries to find those entities. Having found them, you use a syntax capture to label the entity and its name. - -The essence of a given tag lies in two pieces of data: the _role_ of the entity that is matched (i.e. whether it is a definition or a reference) and the _kind_ of that entity, which describes how the entity is used (i.e. whether it's a class definition, function call, variable reference, and so on). 
Our convention is to use a syntax capture following the `@role.kind` capture name format, and another inner capture, always called `@name`, that pulls out the name of a given identifier. - -You may optionally include a capture named `@doc` to bind a docstring. For convenience purposes, the tagging system provides two built-in functions, `#select-adjacent!` and `#strip!` that are convenient for removing comment syntax from a docstring. `#strip!` takes a capture as its first argument and a regular expression as its second, expressed as a quoted string. Any text patterns matched by the regular expression will be removed from the text associated with the passed capture. `#select-adjacent!`, when passed two capture names, filters the text associated with the first capture so that only nodes adjacent to the second capture are preserved. This can be useful when writing queries that would otherwise include too much information in matched comments. - -## Examples - -This [query](https://github.com/tree-sitter/tree-sitter-python/blob/78c4e9b6b2f08e1be23b541ffced47b15e2972ad/queries/tags.scm#L4-L5) recognizes Python function definitions and captures their declared name. The `function_definition` syntax node is defined in the [Python Tree-sitter grammar](https://github.com/tree-sitter/tree-sitter-python/blob/78c4e9b6b2f08e1be23b541ffced47b15e2972ad/grammar.js#L354). 
- -```scheme -(function_definition - name: (identifier) @name) @definition.function -``` - -A more sophisticated query can be found in the [JavaScript Tree-sitter repository](https://github.com/tree-sitter/tree-sitter-javascript/blob/fdeb68ac8d2bd5a78b943528bb68ceda3aade2eb/queries/tags.scm#L63-L70): - -```scheme -(assignment_expression - left: [ - (identifier) @name - (member_expression - property: (property_identifier) @name) - ] - right: [(arrow_function) (function)] -) @definition.function -``` - -An even more sophisticated query is in the [Ruby Tree-sitter repository](https://github.com/tree-sitter/tree-sitter-ruby/blob/1ebfdb288842dae5a9233e2509a135949023dd82/queries/tags.scm#L24-L43), which uses built-in functions to strip the Ruby comment character (`#`) from the docstrings associated with a class or singleton-class declaration, then selects only the docstrings adjacent to the node matched as `@definition.class`. - -```scheme -( - (comment)* @doc - . - [ - (class - name: [ - (constant) @name - (scope_resolution - name: (_) @name) - ]) @definition.class - (singleton_class - value: [ - (constant) @name - (scope_resolution - name: (_) @name) - ]) @definition.class - ] - (#strip! @doc "^#\\s*") - (#select-adjacent! @doc @definition.class) -) -``` - -The below table describes a standard vocabulary for kinds and roles during the tagging process. New applications may extend (or only recognize a subset of) these capture names, but it is desirable to standardize on the names below. 
- -| Category | Tag | -|--------------------------|-----------------------------| -| Class definitions | `@definition.class` | -| Function definitions | `@definition.function` | -| Interface definitions | `@definition.interface` | -| Method definitions | `@definition.method` | -| Module definitions | `@definition.module` | -| Function/method calls | `@reference.call` | -| Class reference | `@reference.class` | -| Interface implementation | `@reference.implementation` | - -## Command-line invocation - -You can use the `tree-sitter tags` command to test out a tags query file, passing as arguments one or more files to tag. We can run this tool from within the Tree-sitter Ruby repository, over code in a file called `test.rb`: - -```ruby -module Foo - class Bar - # won't be included - - # is adjacent, will be - def baz - end - end -end -``` - -Invoking `tree-sitter tags test.rb` produces the following console output, representing matched entities' name, role, location, first line, and docstring: - -```text - test.rb - Foo | module def (0, 7) - (0, 10) `module Foo` - Bar | class def (1, 8) - (1, 11) `class Bar` - baz | method def (2, 8) - (2, 11) `def baz` "is adjacent, will be" -``` - -It is expected that tag queries for a given language are located at `queries/tags.scm` in that language's repository. - -## Unit Testing - -Tags queries may be tested with `tree-sitter test`. Files under `test/tags/` are checked using the same comment system as [highlights queries](https://tree-sitter.github.io/tree-sitter/syntax-highlighting#unit-testing). 
For example, the above Ruby tags can be tested with these comments: - -```ruby -module Foo - # ^ definition.module - class Bar - # ^ definition.class - - def baz - # ^ definition.method - end - end -end -``` diff --git a/docs/src/3-syntax-highlighting.md b/docs/src/3-syntax-highlighting.md new file mode 100644 index 00000000..09db494e --- /dev/null +++ b/docs/src/3-syntax-highlighting.md @@ -0,0 +1,433 @@ +# Syntax Highlighting + +Syntax highlighting is a very common feature in applications that deal with code. Tree-sitter has built-in support for +syntax highlighting via the [`tree-sitter-highlight`][highlight crate] library, which is now used on GitHub.com for highlighting +code written in several languages. You can also perform syntax highlighting at the command line using the +`tree-sitter highlight` command. + +This document explains how the Tree-sitter syntax highlighting system works, using the command line interface. If you are +using `tree-sitter-highlight` library (either from C or from Rust), all of these concepts are still applicable, but the +configuration data is provided using in-memory objects, rather than files. + +## Overview + +All the files needed to highlight a given language are normally included in the same git repository as the Tree-sitter +grammar for that language (for example, [`tree-sitter-javascript`][js grammar], [`tree-sitter-ruby`][ruby grammar]). +To run syntax highlighting from the command-line, three types of files are needed: + +1. Per-user configuration in `~/.config/tree-sitter/config.json` (see the [init-config][init-config] page for more info). +2. Language configuration in grammar repositories' `tree-sitter.json` files (see the [init][init] page for more info). +3. Tree queries in the grammars repositories' `queries` folders. + +For an example of the language-specific files, see the [`tree-sitter.json` file][ts json] and [`queries` directory][queries] +in the `tree-sitter-ruby` repository. 
The following sections describe the behavior of each file. + +## Language Configuration + +The `tree-sitter.json` file is used by the Tree-sitter CLI. Within this file, the CLI looks for data nested under the +top-level `"grammars"` key. This key is expected to contain an array of objects with the following keys: + +### Basics + +These keys specify basic information about the parser: + +- `scope` (required) — A string like `"source.js"` that identifies the language. We strive to match the scope names used +by popular [TextMate grammars][textmate] and by the [Linguist][linguist] library. + +- `path` (optional) — A relative path from the directory containing `tree-sitter.json` to another directory containing +the `src/` folder, which contains the actual generated parser. The default value is `"."` (so that `src/` is in the same +folder as `tree-sitter.json`), and this very rarely needs to be overridden. + +- `external-files` (optional) — A list of relative paths from the root dir of a +parser to files that should be checked for modifications during recompilation. +This is useful during development to have changes to other files besides scanner.c +be picked up by the cli. + +### Language Detection + +These keys help to decide whether the language applies to a given file: + +- `file-types` — An array of filename suffix strings. The grammar will be used for files whose names end with one of these +suffixes. Note that the suffix may match an *entire* filename. + +- `first-line-regex` — A regex pattern that will be tested against the first line of a file to determine whether this language +applies to the file. If present, this regex will be used for any file whose language does not match any grammar's `file-types`. + +- `content-regex` — A regex pattern that will be tested against the contents of the file to break ties in cases where +multiple grammars matched the file using the above two criteria. 
If the regex matches, this grammar will be preferred over
+another grammar with no `content-regex`. If the regex does not match, a grammar with no `content-regex` will be preferred
+over this one.
+
+- `injection-regex` — A regex pattern that will be tested against a *language name* to determine whether this language
+should be used for a potential *language injection* site. Language injection is described in more detail in [a later section](#language-injection).
+
+### Query Paths
+
+These keys specify relative paths from the directory containing `tree-sitter.json` to the files that control syntax highlighting:
+
+- `highlights` — Path to a *highlight query*. Default: `queries/highlights.scm`.
+- `locals` — Path to a *local variable query*. Default: `queries/locals.scm`.
+- `injections` — Path to an *injection query*. Default: `queries/injections.scm`.
+
+The behaviors of these three files are described in the next section.
+
+## Queries
+
+Tree-sitter's syntax highlighting system is based on *tree queries*, which are a general system for pattern-matching on Tree-sitter's
+syntax trees. See [this section][pattern matching] of the documentation for more information
+about tree queries.
+
+Syntax highlighting is controlled by *three* different types of query files that are usually included in the `queries` folder.
+The default names for the query files use the `.scm` file extension. We chose this extension because it is commonly used for files
+written in [Scheme][scheme], a popular dialect of Lisp, and these query files use a Lisp-like syntax.
+
+### Highlights
+
+The most important query is called the highlights query. The highlights query uses *captures* to assign arbitrary
+*highlight names* to different nodes in the tree. Each highlight name can then be mapped to a color
+(as described in the [init-config command][theme]). Commonly used highlight names include
+`keyword`, `function`, `type`, `property`, and `string`. Names can also be dot-separated like `function.builtin`. 
+ +#### Example Go Snippet + +For example, consider the following Go code: + +```go +func increment(a int) int { + return a + 1 +} +``` + +With this syntax tree: + +```scheme +(source_file + (function_declaration + name: (identifier) + parameters: (parameter_list + (parameter_declaration + name: (identifier) + type: (type_identifier))) + result: (type_identifier) + body: (block + (return_statement + (expression_list + (binary_expression + left: (identifier) + right: (int_literal))))))) +``` + +#### Example Query + +Suppose we wanted to render this code with the following colors: + +- keywords `func` and `return` in purple +- function `increment` in blue +- type `int` in green +- number `5` brown + +We can assign each of these categories a *highlight name* using a query like this: + +```scheme +; highlights.scm + +"func" @keyword +"return" @keyword +(type_identifier) @type +(int_literal) @number +(function_declaration name: (identifier) @function) +``` + +Then, in our config file, we could map each of these highlight names to a color: + +```json +{ + "theme": { + "keyword": "purple", + "function": "blue", + "type": "green", + "number": "brown" + } +} +``` + +#### Highlights Result + +Running `tree-sitter highlight` on this Go file would produce output like this: + +
+func increment(a int) int {
+    return a + 1
+}
+
+
+### Local Variables
+
+Good syntax highlighting helps the reader to quickly distinguish between the different types of *entities* in their code.
+Ideally, if a given entity appears in *multiple* places, it should be colored the same in each place. The Tree-sitter syntax
+highlighting system can help you to achieve this by keeping track of local scopes and variables.
+
+The *local variables* query is different from the highlights query in that, while the highlights query uses *arbitrary*
+capture names, which can then be mapped to colors, the local variables query uses a fixed set of capture names, each of
+which has a special meaning.
+
+The capture names are as follows:
+
+- `@local.scope` — indicates that a syntax node introduces a new local scope.
+- `@local.definition` — indicates that a syntax node contains the *name* of a definition within the current local scope.
+- `@local.reference` — indicates that a syntax node contains the *name*, which *may* refer to an earlier definition within
+some enclosing scope.
+
+When highlighting a file, Tree-sitter will keep track of the set of scopes that contains any given position, and the set
+of definitions within each scope. When processing a syntax node that is captured as a `local.reference`, Tree-sitter will
+try to find a definition for a name that matches the node's text. If it finds a match, Tree-sitter will ensure that the
+*reference* and the *definition* are colored the same.
+
+The information produced by this query can also be *used* by the highlights query. You can *disable* a pattern for nodes
+that have been identified as local variables by adding the predicate `(#is-not? local)` to the pattern. 
This is used in +the example below: + +#### Example Ruby Snippet + +Consider this Ruby code: + +```ruby +def process_list(list) + context = current_context + list.map do |item| + process_item(item, context) + end +end + +item = 5 +list = [item] +``` + +With this syntax tree: + +```scheme +(program + (method + name: (identifier) + parameters: (method_parameters + (identifier)) + (assignment + left: (identifier) + right: (identifier)) + (method_call + method: (call + receiver: (identifier) + method: (identifier)) + block: (do_block + (block_parameters + (identifier)) + (method_call + method: (identifier) + arguments: (argument_list + (identifier) + (identifier)))))) + (assignment + left: (identifier) + right: (integer)) + (assignment + left: (identifier) + right: (array + (identifier)))) +``` + +There are several types of names within this method: + +- `process_list` is a method. +- Within this method, `list` is a formal parameter +- `context` is a local variable. +- `current_context` is *not* a local variable, so it must be a method. +- Within the `do` block, `item` is a formal parameter +- Later on, `item` and `list` are both local variables (not formal parameters). + +#### Example Queries + +Let's write some queries that let us clearly distinguish between these types of names. First, set up the highlighting query, +as described in the previous section. We'll assign distinct colors to method calls, method definitions, and formal parameters: + +```scheme +; highlights.scm + +(call method: (identifier) @function.method) +(method_call method: (identifier) @function.method) + +(method name: (identifier) @function.method) + +(method_parameters (identifier) @variable.parameter) +(block_parameters (identifier) @variable.parameter) + +((identifier) @function.method + (#is-not? local)) +``` + +Then, we'll set up a local variable query to keep track of the variables and scopes. 
Here, we're indicating that methods +and blocks create local *scopes*, parameters and assignments create *definitions*, and other identifiers should be considered +*references*: + +```scheme +; locals.scm + +(method) @local.scope +(do_block) @local.scope + +(method_parameters (identifier) @local.definition) +(block_parameters (identifier) @local.definition) + +(assignment left:(identifier) @local.definition) + +(identifier) @local.reference +``` + +#### Locals Result + +Running `tree-sitter highlight` on this ruby file would produce output like this: + +
+def process_list(list)
+  context = current_context
+  list.map do |item|
+    process_item(item, context)
+  end
+end
+
+item = 5
+list = [item]
+
+ +### Language Injection + +Some source files contain code written in multiple different languages. Examples include: + +- HTML files, which can contain JavaScript inside ` + + + + + + diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md new file mode 100644 index 00000000..f7d6331b --- /dev/null +++ b/docs/src/SUMMARY.md @@ -0,0 +1,54 @@ +# Summary + +Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source +file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be: + +General enough to parse any programming language +Fast enough to parse on every keystroke in a text editor +Robust enough to provide useful results even in the presence of syntax errors +Dependency-free so that the runtime library (which is written in pure C) can be embedded in any application + +[Introduction](./index.md) + +# User Guide + +- [Using Parsers](./using-parsers/index.md) + - [Getting Started](./using-parsers/1-getting-started.md) + - [Basic Parsing](./using-parsers/2-basic-parsing.md) + - [Advanced Parsing](./using-parsers/3-advanced-parsing.md) + - [Walking Trees](./using-parsers/4-walking-trees.md) + - [Queries](./using-parsers/queries/index.md) + - [Basic Syntax](./using-parsers/queries/1-syntax.md) + - [Operators](./using-parsers/queries/2-operators.md) + - [Predicates and Directives](./using-parsers/queries/3-predicates-and-directives.md) + - [API](./using-parsers/queries/4-api.md) + - [Static Node Types](./using-parsers/6-static-node-types.md) +- [Creating Parsers](./creating-parsers/index.md) + - [Getting Started](./creating-parsers/1-getting-started.md) + - [The Grammar DSL](./creating-parsers/2-the-grammar-dsl.md) + - [Writing the Grammar](./creating-parsers/3-writing-the-grammar.md) + - [External Scanners](./creating-parsers/4-external-scanners.md) + - [Writing Tests](./creating-parsers/5-writing-tests.md) +- [Syntax Highlighting](./3-syntax-highlighting.md) +- [Code 
Navigation](./4-code-navigation.md) +- [Implementation](./5-implementation.md) +- [Contributing](./6-contributing.md) +- [Playground](./7-playground.md) + +# Reference Guide + +- [Command Line Interface](./cli/index.md) + - [Init Config](./cli/init-config.md) + - [Init](./cli/init.md) + - [Generate](./cli/generate.md) + - [Build](./cli/build.md) + - [Parse](./cli/parse.md) + - [Test](./cli/test.md) + - [Version](./cli/version.md) + - [Fuzz](./cli/fuzz.md) + - [Query](./cli/query.md) + - [Highlight](./cli/highlight.md) + - [Tags](./cli/tags.md) + - [Playground](./cli/playground.md) + - [Dump Languages](./cli/dump-languages.md) + - [Complete](./cli/complete.md) diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md new file mode 100644 index 00000000..180e7f92 --- /dev/null +++ b/docs/src/cli/build.md @@ -0,0 +1,43 @@ +# `tree-sitter build` + +The `build` command compiles your parser into a dynamically-loadable library, +either as a shared object (`.so`, `.dylib`, or `.dll`) or as a WASM module. + +```bash +tree-sitter build [OPTIONS] [PATH] # Aliases: b +``` + +You can change the compiler executable via the `CC` environment variable and add extra flags via `CFLAGS`. +For macOS or iOS, you can set `MACOSX_DEPLOYMENT_TARGET` or `IPHONEOS_DEPLOYMENT_TARGET` respectively to define the +minimum supported version. + +The path argument allows you to specify the directory of the parser to build. If you don't supply this argument, the CLI +will attempt to build the parser in the current working directory. + +## Options + +### `-w/--wasm` + +Compile the parser as a WASM module. + +### `-d/--docker` + +Use Docker or Podman to supply Emscripten. This removes the need to install Emscripten on your machine locally. +Note that this flag is only available when compiling to WASM. + +### `-o/--output` + +Specify where to output the shared object file (native or WASM). This flag accepts either an absolute path or a relative +path. 
If you don't supply this flag, the CLI will attempt to figure out what the language name is based on the parent +directory name to use for the output file. If the CLI can't figure it out, it will default to `parser`, thus generating +`parser.so` or `parser.wasm` in the current working directory. + +### `--reuse-allocator` + +Reuse the allocator that's set in the core library for the parser's external scanner. This is useful in applications +where the author overrides the default allocator with their own, and wants to ensure every parser that allocates memory +in the external scanner does so using their allocator. + +### `-0/--debug` + +Compile the parser with debug flags enabled. This is useful when debugging issues that require a debugger like `gdb` or `lldb`. diff --git a/docs/src/cli/complete.md b/docs/src/cli/complete.md new file mode 100644 index 00000000..4c9aabfc --- /dev/null +++ b/docs/src/cli/complete.md @@ -0,0 +1,16 @@ +# `tree-sitter complete` + +The `complete` command generates a completion script for your shell. +This script can be used to enable autocompletion for the `tree-sitter` CLI. + +```bash +tree-sitter complete --shell # Aliases: comp +``` + +## Options + +### `--shell ` + +The shell for which to generate the completion script. + +Supported values: `bash`, `elvish`, `fish`, `power-shell`, `zsh`, and `nushell`. diff --git a/docs/src/cli/dump-languages.md b/docs/src/cli/dump-languages.md new file mode 100644 index 00000000..1d1a6aaa --- /dev/null +++ b/docs/src/cli/dump-languages.md @@ -0,0 +1,15 @@ +# `tree-sitter dump-languages` + +The `dump-languages` command prints out a list of all the languages that the CLI knows about. This can be useful for debugging purposes, or for scripting. The paths to search comes from the config file's [`parser-directories`][parser-directories] object. + +```bash +tree-sitter dump-languages [OPTIONS] # Aliases: langs +``` + +## Options + +### `--config-path` + +The path to the configuration file. 
Ordinarily, the CLI will use the default location as explained in the [init-config](./init-config.md) command. This flag allows you to explicitly override that default, and use a config defined elsewhere. + +[parser-directories]: ./init-config.md#parser-directories diff --git a/docs/src/cli/fuzz.md b/docs/src/cli/fuzz.md new file mode 100644 index 00000000..1f79bc00 --- /dev/null +++ b/docs/src/cli/fuzz.md @@ -0,0 +1,49 @@ +# `tree-sitter fuzz` + +The `fuzz` command is used to fuzz a parser by performing random edits and ensuring that undoing these edits results in +consistent parse trees. It will fail if the parse trees are not equal, or if the changed ranges are inconsistent. + +```bash +tree-sitter fuzz [OPTIONS] # Aliases: f +``` + +## Options + +### `-s/--skip ` + +A list of test names to skip fuzzing. + +### `--subdir ` + +The directory containing the parser. This is primarily useful in multi-language repositories. + +### `--edits ` + +The maximum number of edits to perform. The default is 3. + +### `--iterations ` + +The number of iterations to run. The default is 10. + +### `-i/--include ` + +Only run tests whose names match this regex. + +### `-e/--exclude ` + +Skip tests whose names match this regex. + +### `--log-graphs` + +Outputs logs of the graphs of the stack and parse trees during parsing, as well as the actual parsing and lexing message. +The graphs are constructed with [graphviz dot][dot], and the output is written to `log.html`. + +### `-l/--log` + +Outputs parsing and lexing logs. This logs to stderr. + +### `-r/--rebuild` + +Force a rebuild of the parser before running the fuzzer. + +[dot]: https://graphviz.org/doc/info/lang.html diff --git a/docs/src/cli/generate.md b/docs/src/cli/generate.md new file mode 100644 index 00000000..3b2a94d3 --- /dev/null +++ b/docs/src/cli/generate.md @@ -0,0 +1,62 @@ +# `tree-sitter generate` + +The most important command you'll use is `tree-sitter generate`. 
This command reads the `grammar.js` file in your current +working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, +just run `tree-sitter generate` again. + +```bash +tree-sitter generate [OPTIONS] [GRAMMAR_PATH] # Aliases: gen, g +``` + +The grammar path argument allows you to specify a path to a `grammar.js` JavaScript file, or `grammar.json` JSON file. +In case your `grammar.js` file is in a non-standard path, you can specify it yourself. But, if you are using a parser +where `grammar.json` was already generated, or it was hand-written, you can tell the CLI to generate the parser *based* +on this JSON file. This avoids relying on a JavaScript file and avoids the dependency on a JavaScript runtime. + +If there is an ambiguity or *local ambiguity* in your grammar, Tree-sitter will detect it during parser generation, and +it will exit with a `Unresolved conflict` error message. To learn more about conflicts and how to handle them, check out +the section on [`Structuring Rules Well`](../creating-parsers/3-writing-the-grammar.md#structuring-rules-well) +in the user guide. + +## Options + +### `-l/--log` + +Print the log of the parser generation process. This is really only useful if you know what you're doing, or are investigating +a bug in the CLI itself. It logs info such as what tokens are included in the error recovery state, +what keywords were extracted, what states were split and why, and the entry point state. + +### `--abi ` + +The ABI to use for parser generation. The default is ABI 15, with ABI 14 being a supported target. + +### `-b/--build` + +Compile all defined languages in the current directory. The cli will automatically compile the parsers after generation, +and place them in the cache dir. + +### `-0/--debug-build` + +Compile the parser with debug flags enabled. This is useful when debugging issues that require a debugger like `gdb` or `lldb`. 
+ +### `--libdir ` + +The directory to place the compiled parser(s) in. +On Unix systems, the default path is `$XDG_CACHE_HOME/tree-sitter` if `$XDG_CACHE_HOME` is set, +otherwise `$HOME/.config/tree-sitter` is used. On Windows, the default path is `%LOCALAPPDATA%\tree-sitter` if available, +otherwise `$HOME\AppData\Local\tree-sitter` is used. + +### `-o/--output` + +The directory to place the generated parser in. The default is `src/` in the current directory. + +### `--report-states-for-rule ` + +Print the overview of states from the given rule. This is useful for debugging and understanding the generated parser's +item sets for all given states in a given rule. To solely view state count numbers for rules, pass in `-` for the rule argument. +To view the overview of states for every rule, pass in `*` for the rule argument. + +### `--js-runtime ` + +The path to the JavaScript runtime executable to use when generating the parser. The default is `node`. +Note that you can also set this with `TREE_SITTER_JS_RUNTIME`. diff --git a/docs/src/cli/highlight.md b/docs/src/cli/highlight.md new file mode 100644 index 00000000..fdd661e3 --- /dev/null +++ b/docs/src/cli/highlight.md @@ -0,0 +1,51 @@ +# `tree-sitter highlight` + +You can run syntax highlighting on an arbitrary file using `tree-sitter highlight`. This can either output colors directly +to your terminal using ANSI escape codes, or produce HTML (if the `--html` flag is passed). For more information, see +[the syntax highlighting page](../3-syntax-highlighting.md). + +```bash +tree-sitter highlight [OPTIONS] [PATHS]... # Aliases: hi +``` + +## Options + +### `-H/--html` + +Output an HTML document with syntax highlighting. + +### `--css-classes` + +Output HTML with CSS classes instead of inline styles. + +### `--check` + +Check that the highlighting captures conform strictly to the standards. + +### `--captures-path ` + +The path to a file with captures. 
These captures would be considered the "standard" captures to compare against. + +### `--query-paths ` + +The paths to query files to use for syntax highlighting. These should end in `highlights.scm`. + +### `--scope ` + +The language scope to use for syntax highlighting. This is useful when the language is ambiguous. + +### `-t/--time` + +Print the time taken to highlight the file. + +### `-q/--quiet` + +Suppress main output. + +### `--paths ` + +The path to a file that contains paths to source files to highlight + +### `--config-path ` + +The path to an alternative configuration (`config.json`) file. See [the init-config command](./init-config.md) for more information. diff --git a/docs/src/cli/index.md b/docs/src/cli/index.md new file mode 100644 index 00000000..7c982b40 --- /dev/null +++ b/docs/src/cli/index.md @@ -0,0 +1,4 @@ +# CLI Overview + +Let's go over all of the functionality of the `tree-sitter` command line interface. +Once you feel that you have enough of a grasp on the CLI, you can move onto the grammar authoring section to learn more about writing your own parser. diff --git a/docs/src/cli/init-config.md b/docs/src/cli/init-config.md new file mode 100644 index 00000000..3af0e975 --- /dev/null +++ b/docs/src/cli/init-config.md @@ -0,0 +1,146 @@ +# `tree-sitter init-config` + +This command initializes a configuration file for the Tree-sitter CLI. + +```bash +tree-sitter init-config +``` + +These directories are created in the "default" location for your platform: + +* On Unix, `$XDG_CONFIG_HOME/tree-sitter` or `$HOME/.config/tree-sitter` +* On Windows, `%APPDATA%\tree-sitter` or `$HOME\AppData\Roaming\tree-sitter` + +> Note that the CLI will work if there's no config file present, falling back on default values > for each configuration +> option. + +When you run the `init-config` command, it will print out the location of the file that it creates so that you can easily +find and modify it. 
+ +The configuration file is a JSON file that contains the following fields: + +## `parser-directories` + +The [`tree-sitter highlight`](./highlight.md) command takes one or more file paths, and tries to automatically determine, +which language should be used to highlight those files. To do this, it needs to know *where* to look for Tree-sitter grammars +on your filesystem. You can control this using the `"parser-directories"` key in your configuration file: + +```json +{ + "parser-directories": [ + "/Users/my-name/code", + "/Users/my-name/other-code" + ] +} +``` + +Any folder within one of these *parser directories* whose name begins with `tree-sitter-` will be treated as a Tree-sitter +grammar repository. + +## `theme` + +The [Tree-sitter highlighting system](../3-syntax-highlighting.md) works by annotating ranges of source code with logical +"highlight names" like `function.method`, `type.builtin`, `keyword`, etc. To decide what *color* should be used for rendering +each highlight, a *theme* is needed. + +In your config file, the `"theme"` value is an object whose keys are dot-separated highlight names like +`function.builtin` or `keyword`, and whose values are JSON expressions that represent text styling parameters. + +### Highlight Names + +A theme can contain multiple keys that share a common subsequence. Examples: + +* `variable` and `variable.parameter` +* `function`, `function.builtin`, and `function.method` + +For a given highlight produced, styling will be determined based on the **longest matching theme key**. For example, the +highlight `function.builtin.static` would match the key `function.builtin` rather than `function`. + +### Styling Values + +Styling values can be any of the following: + +* Integers from 0 to 255, representing ANSI terminal color ids. +* Strings like `"#e45649"` representing hexadecimal RGB colors. +* Strings naming basic ANSI colors like `"red"`, `"black"`, `"purple"`, or `"cyan"`. 
+* Objects with the following keys: + * `color` — An integer or string as described above. + * `underline` — A boolean indicating whether the text should be underlined. + * `italic` — A boolean indicating whether the text should be italicized. + * `bold` — A boolean indicating whether the text should be bold-face. + +An example theme can be seen below: + +```json +{ + "function": 26, + "operator": { + "bold": true, + "color": 239 + }, + "variable.builtin": { + "bold": true + }, + "variable.parameter": { + "underline": true + }, + "type.builtin": { + "color": 23, + "bold": true + }, + "keyword": 56, + "type": 23, + "number": { + "bold": true, + "color": 94 + }, + "constant": 94, + "attribute": { + "color": 124, + "italic": true + }, + "comment": { + "color": 245, + "italic": true + }, + "constant.builtin": { + "color": 94, + "bold": true + }, +} +``` + +## `parse-theme` + +The [`tree-sitter parse`](./parse.md) command will output a pretty-printed CST when the `-c/--cst` option is used. You can +control what colors are used for various parts of the tree in your configuration file. Note that omitting a field will cause +the relevant text to be rendered with its default color. 
+
+```json
+{
+  "parse-theme": {
+    // The color of node kinds
+    "node-kind": [20, 20, 20],
+    // The color of text associated with a node
+    "node-text": [255, 255, 255],
+    // The color of node fields
+    "field": [42, 42, 42],
+    // The color of the range information for unnamed nodes
+    "row-color": [255, 255, 255],
+    // The color of the range information for named nodes
+    "row-color-named": [255, 130, 0],
+    // The color of extra nodes
+    "extra": [255, 0, 255],
+    // The color of ERROR nodes
+    "error": [255, 0, 0],
+    // The color of MISSING nodes and their associated text
+    "missing": [153, 75, 0],
+    // The color of newline characters
+    "line-feed": [150, 150, 150],
+    // The color of backtick characters
+    "backtick": [0, 200, 0],
+    // The color of literals
+    "literal": [0, 0, 200]
+  }
+}
+```
diff --git a/docs/src/cli/init.md b/docs/src/cli/init.md
new file mode 100644
index 00000000..44e6b380
--- /dev/null
+++ b/docs/src/cli/init.md
@@ -0,0 +1,190 @@
+# `tree-sitter init`
+
+The `init` command is your starting point for creating a new grammar. When you run it, it sets up a repository with all
+the essential files and structure needed for grammar development. Since the command includes git-related files by default,
+we recommend using git for version control of your grammar.
+
+```bash
+tree-sitter init [OPTIONS] # Aliases: i
+```
+
+## Options
+
+### `--update`
+
+Update outdated generated files, if needed.
+
+## Structure of `tree-sitter.json`
+
+The main file of interest for users to configure is `tree-sitter.json`, which tells the CLI information about your grammar,
+such as the location of queries.
+
+### The `grammars` field
+
+This field is an array of objects, though you typically only need one object in this array unless your repo has
+multiple grammars (for example, `Typescript` and `TSX`).
+
+### Example
+
+Typically, the objects in the `"tree-sitter"` array only need to specify a few keys:
+
+```json
+{
+  "tree-sitter": [
+    {
+      "scope": "source.ruby",
+      "file-types": [
+        "rb",
+        "gemspec",
+        "Gemfile",
+        "Rakefile"
+      ],
+      "first-line-regex": "#!.*\\bruby$"
+    }
+  ]
+}
+```
+
+#### Basic Fields
+
+These keys specify basic information about the parser:
+
+- `scope` (required) — A string like `"source.js"` that identifies the language.
+We strive to match the scope names used by popular [TextMate grammars][textmate] and by the [Linguist][linguist] library.
+
+- `path` — A relative path from the directory containing `tree-sitter.json` to another directory containing the `src/`
+folder, which contains the actual generated parser. The default value is `"."`
+(so that `src/` is in the same folder as `tree-sitter.json`), and this very rarely needs to be overridden.
+
+- `external-files` — A list of relative paths from the root dir of a
+parser to files that should be checked for modifications during recompilation.
+This is useful during development to have changes to other files besides scanner.c
+be picked up by the cli.
+
+#### Language Detection
+
+These keys help to decide whether the language applies to a given file:
+
+- `file-types` — An array of filename suffix strings. The grammar will be used for files whose names end with one of
+these suffixes. Note that the suffix may match an *entire* filename.
+
+- `first-line-regex` — A regex pattern that will be tested against the first line of a file
+to determine whether this language applies to the file. If present, this regex will be used for any file whose
+language does not match any grammar's `file-types`.
+
+- `content-regex` — A regex pattern that will be tested against the contents of the file
+to break ties in cases where multiple grammars matched the file using the above two criteria. If the regex matches,
+this grammar will be preferred over another grammar with no `content-regex`. 
If the regex does not match, a grammar with
+no `content-regex` will be preferred over this one.
+
+- `injection-regex` — A regex pattern that will be tested against a *language name* to determine whether this language
+should be used for a potential *language injection* site.
+Language injection is described in more detail in [the relevant section](../3-syntax-highlighting.md#language-injection).
+
+#### Query Paths
+
+These keys specify relative paths from the directory containing `tree-sitter.json` to the files that control syntax highlighting:
+
+- `highlights` — Path to a *highlight query*. Default: `queries/highlights.scm`.
+- `locals` — Path to a *local variable query*. Default: `queries/locals.scm`.
+- `injections` — Path to an *injection query*. Default: `queries/injections.scm`.
+- `tags` — Path to a *tag query*. Default: `queries/tags.scm`.
+
+### The `metadata` field
+
+This field contains information that tree-sitter will use to populate relevant bindings' files, especially their versions.
+Typically, this will all be set up when you run `tree-sitter init`, but you are welcome to update it as you see fit.
+
+- `version` (required) — The current version of your grammar, which should follow [semver][semver]
+- `license` — The license of your grammar, which should be a valid [SPDX license][spdx]
+- `description` — The brief description of your grammar
+- `authors` (required) — An array of objects that contain a `name` field, and optionally an `email` and `url` field.
+Each field is a string
+- `links` — An object that contains a `repository` field, and optionally a `homepage` field. Each field is a string
+- `namespace` — The namespace for the `Java` and `Kotlin` bindings, defaults to `io.github.tree-sitter` if not provided
+
+### The `bindings` field
+
+This field controls what bindings are generated when the `init` command is run.
+Each key is a language name, and the value is a boolean.
+
+- `c` (default: `true`)
+- `go` (default: `true`)
+- `java` (default: `false`)
+- `kotlin` (default: `false`)
+- `node` (default: `true`)
+- `python` (default: `true`)
+- `rust` (default: `true`)
+- `swift` (default: `false`)
+
+## Binding Files
+
+When you run `tree-sitter init`, the CLI will also generate a number of files in your repository that allow for your parser
+to be used from different languages. Here is a list of these bindings files that are generated, and what their purpose is:
+
+### C/C++
+
+- `Makefile` — This file tells [`make`][make] how to compile your language.
+- `CMakeLists.txt` — This file tells [`cmake`][cmake] how to compile your language.
+- `bindings/c/tree-sitter-language.h` — This file provides the C interface of your language.
+- `bindings/c/tree-sitter-language.pc` — This file provides [pkg-config][pkg-config] metadata about your language's C library.
+- `src/tree_sitter/parser.h` — This file provides some basic C definitions that are used in your generated `parser.c` file.
+- `src/tree_sitter/alloc.h` — This file provides some memory allocation macros that are to be used in your external scanner,
+if you have one.
+- `src/tree_sitter/array.h` — This file provides some array macros that are to be used in your external scanner,
+if you have one.
+
+### Go
+
+- `go.mod` — This file is the manifest of the Go module.
+- `bindings/go/binding.go` — This file wraps your language in a Go module.
+- `bindings/go/binding_test.go` — This file contains a test for the Go package.
+
+### Node
+
+- `binding.gyp` — This file tells Node.js how to compile your language.
+- `package.json` — This file is the manifest of the Node.js package.
+- `bindings/node/binding.cc` — This file wraps your language in a JavaScript module for Node.js.
+- `bindings/node/index.js` — This is the file that Node.js initially loads when using your language.
+- `bindings/node/index.d.ts` — This file provides type hints for your parser when used in TypeScript.
+- `bindings/node/binding_test.js` — This file contains a test for the Node.js package.
+
+### Python
+
+- `pyproject.toml` — This file is the manifest of the Python package.
+- `setup.py` — This file tells Python how to compile your language.
+- `bindings/python/tree_sitter_language/binding.c` — This file wraps your language in a Python module.
+- `bindings/python/tree_sitter_language/__init__.py` — This file tells Python how to load your language.
+- `bindings/python/tree_sitter_language/__init__.pyi` — This file provides type hints for your parser when used in Python.
+- `bindings/python/tree_sitter_language/py.typed` — This marker file tells type checkers that the package provides type information.
+- `bindings/python/tests/test_binding.py` — This file contains a test for the Python package.
+
+### Rust
+
+- `Cargo.toml` — This file is the manifest of the Rust package.
+- `bindings/rust/lib.rs` — This file wraps your language in a Rust crate when used in Rust.
+- `bindings/rust/build.rs` — This file wraps the building process for the Rust crate.
+
+### Swift
+
+- `Package.swift` — This file tells Swift how to compile your language.
+- `bindings/swift/TreeSitterLanguage/language.h` — This file wraps your language in a Swift module when used in Swift.
+- `bindings/swift/TreeSitterLanguageTests/TreeSitterLanguageTests.swift` — This file contains a test for the Swift package.
+
+### Additional Files
+
+Additionally, there are a few other files that are generated when you run `tree-sitter init`
+that aim to improve the development experience:
+
+- `.editorconfig` — This file tells your editor how to format your code. More information about this file can be found [here][editorconfig].
+- `.gitattributes` — This file tells Git how to handle line endings, and tells GitHub what files are generated.
+- `.gitignore` — This file tells Git what files to ignore when committing changes.
+ +[cmake]: https://cmake.org/cmake/help/latest +[editorconfig]: https://editorconfig.org +[linguist]: https://github.com/github/linguist +[make]: https://www.gnu.org/software/make/manual/make.html +[pkg-config]: https://www.freedesktop.org/wiki/Software/pkg-config +[semver]: https://semver.org +[spdx]: https://spdx.org/licenses +[textmate]: https://macromates.com/manual/en/language_grammars diff --git a/docs/src/cli/parse.md b/docs/src/cli/parse.md new file mode 100644 index 00000000..7f30cdb0 --- /dev/null +++ b/docs/src/cli/parse.md @@ -0,0 +1,97 @@ +# `tree-sitter parse` + +The `parse` command parses source files using a Tree-sitter parser. You can pass any number of file paths and glob patterns +to `tree-sitter parse`, and it will parse all the given files. The command will exit with a non-zero status code if any +parse errors occurred. + +```bash +tree-sitter parse [OPTIONS] [PATHS]... # Aliases: p +``` + +## Options + +### `--paths ` + +The path to a file that contains paths to source files to parse. + +### `--scope ` + +The language scope to use for parsing. This is useful when the language is ambiguous. + +### `-d/--debug` + +Outputs parsing and lexing logs. This logs to stderr. + +### `-0/--debug-build` + +Compile the parser with debug flags enabled. This is useful when debugging issues that require a debugger like `gdb` or `lldb`. + +### `-D/--debug-graph` + +Outputs logs of the graphs of the stack and parse trees during parsing, as well as the actual parsing and lexing message. +The graphs are constructed with [graphviz dot][dot], and the output is written to `log.html`. + +### `--wasm` + +Compile and run the parser as a WASM module. + +### `--dot` + +Output the parse tree with [graphviz dot][dot]. + +### `-x/--xml` + +Output the parse tree in XML format. + +### `-c/--cst` + +Output the parse tree in a pretty-printed CST format. + +### `-s/--stat` + +Show parsing statistics. 
+ +### `--timeout ` + +Set the timeout for parsing a single file, in microseconds. + +### `-t/--time` + +Print the time taken to parse the file. If edits are provided, this will also print the time taken to parse the file after +each edit. + +### `-q/--quiet` + +Suppress main output. + +### `--edits ...` + +Apply edits after parsing the file. Edits are in the form of `row, col delcount insert_text` where row and col are 0-indexed. + +### `--encoding ` + +Set the encoding of the input file. By default, the CLI will look for the [`BOM`][bom] to determine if the file is encoded +in `UTF-16BE` or `UTF-16LE`. If no `BOM` is present, `UTF-8` is the default. One of `utf8`, `utf16-le`, `utf16-be`. + +### `--open-log` + +When using the `--debug-graph` option, open the log file in the default browser. + +### `--config-path ` + +The path to an alternative configuration (`config.json`) file. See [the init-config command](./init-config.md) for more information. + +### `-n/--test-number ` + +Parse a specific test in the corpus. The test number is the same number that appears in the output of `tree-sitter test`. + +### `-r/--rebuild` + +Force a rebuild of the parser before running tests. + +### `--no-ranges` + +Omit the node's ranges from the default parse output. This is useful when copying S-Expressions to a test file. + +[dot]: https://graphviz.org/doc/info/lang.html +[bom]: https://en.wikipedia.org/wiki/Byte_order_mark diff --git a/docs/src/cli/playground.md b/docs/src/cli/playground.md new file mode 100644 index 00000000..75ff88e7 --- /dev/null +++ b/docs/src/cli/playground.md @@ -0,0 +1,20 @@ +# `tree-sitter playground` + +The `playground` command allows you to start a local playground to test your parser interactively. + +```bash +tree-sitter playground [OPTIONS] # Aliases: play, pg, web-ui +``` + +Note that you must have already built the parser as a WASM module. This can be done with the [`build`](./build.md) subcommand +(`tree-sitter build --wasm`). 
+ +## Options + +### `-q/--quiet` + +Don't automatically open the playground in the default browser. + +### `--grammar-path ` + +The path to the directory containing the grammar and wasm files. diff --git a/docs/src/cli/query.md b/docs/src/cli/query.md new file mode 100644 index 00000000..48144461 --- /dev/null +++ b/docs/src/cli/query.md @@ -0,0 +1,45 @@ +# `tree-sitter query` + +The `query` command is used to run a query on a parser, and view the results. + +```bash +tree-sitter query [OPTIONS] [PATHS]... # Aliases: q +``` + +## Options + +### `-t/--time` + +Print the time taken to execute the query on the file. + +### `-q/--quiet` + +Suppress main output. + +### `--paths ` + +The path to a file that contains paths to source files in which the query will be executed. + +### `--byte-range ` + +The range of byte offsets in which the query will be executed. The format is `start_byte:end_byte`. + +### `--row-range ` + +The range of rows in which the query will be executed. The format is `start_row:end_row`. + +### `--scope ` + +The language scope to use for parsing and querying. This is useful when the language is ambiguous. + +### `-c/--captures` + +Order the query results by captures instead of matches. + +### `--test` + +Whether to run query tests or not. + +### `--config-path ` + +The path to an alternative configuration (`config.json`) file. See [the init-config command](./init-config.md) for more information. diff --git a/docs/src/cli/tags.md b/docs/src/cli/tags.md new file mode 100644 index 00000000..75751346 --- /dev/null +++ b/docs/src/cli/tags.md @@ -0,0 +1,30 @@ +# `tree-sitter tags` + +You can run symbol tagging on an arbitrary file using `tree-sitter tags`. This will output a list of tags. +For more information, see [the code navigation page](../4-code-navigation.md#tagging-and-captures). + +```bash +tree-sitter tags [OPTIONS] [PATHS]... +``` + +## Options + +### `--scope ` + +The language scope to use for symbol tagging. 
This is useful when the language is ambiguous. + +### `-t/--time` + +Print the time taken to generate tags for the file. + +### `-q/--quiet` + +Suppress main output. + +### `--paths ` + +The path to a file that contains paths to source files to tag. + +### `--config-path ` + +The path to an alternative configuration (`config.json`) file. See [the init-config command](./init-config.md) for more information. diff --git a/docs/src/cli/test.md b/docs/src/cli/test.md new file mode 100644 index 00000000..187b9643 --- /dev/null +++ b/docs/src/cli/test.md @@ -0,0 +1,68 @@ +# `tree-sitter test` + +The `test` command is used to run the test suite for a parser. + +```bash +tree-sitter test [OPTIONS] # Aliases: t +``` + +## Options + +### `-i/--include ` + +Only run tests whose names match this regex. + +### `-e/--exclude ` + +Skip tests whose names match this regex. + +### `-u/--update` + +Update the expected output of tests. Note that tests containing `ERROR` nodes or `MISSING` nodes will not be updated. + +### `-d/--debug` + +Outputs parsing and lexing logs. This logs to stderr. + +### `-0/--debug-build` + +Compile the parser with debug flags enabled. This is useful when debugging issues that require a debugger like `gdb` or `lldb`. + +### `-D/--debug-graph` + +Outputs logs of the graphs of the stack and parse trees during parsing, as well as the actual parsing and lexing message. +The graphs are constructed with [graphviz dot][dot], and the output is written to `log.html`. + +### `--wasm` + +Compile and run the parser as a WASM module. + +### `--open-log` + +When using the `--debug-graph` option, open the log file in the default browser. + +### `--config-path ` + +The path to an alternative configuration (`config.json`) file. See [the init-config command](./init-config.md) for more information. + +### `--show-fields` + +Force showing fields in test diffs. + +### `--stat ` + +Show parsing statistics when tests are being run. 
One of `all`, `outliers-and-total`, or `total-only`. + +- `all`: Show statistics for every test. + +- `outliers-and-total`: Show statistics only for outliers, and total statistics. + +- `total-only`: Show only total statistics. + +### `-r/--rebuild` + +Force a rebuild of the parser before running tests. + +### `--overview-only` + +Only show the overview of the test results, and not the diff. diff --git a/docs/src/cli/version.md b/docs/src/cli/version.md new file mode 100644 index 00000000..464c98a4 --- /dev/null +++ b/docs/src/cli/version.md @@ -0,0 +1,24 @@ +# `tree-sitter version` + +The `version` command upgrades the version of your grammar. + +```bash +tree-sitter version # Aliases: publish +``` + +This will update the version in several files, if they exist: + +* tree-sitter.json +* Cargo.toml +* Cargo.lock +* package.json +* package-lock.json +* Makefile +* CMakeLists.txt +* pyproject.toml + +As a grammar author, you should keep the version of your grammar in sync across +different bindings. However, doing so manually is error-prone and tedious, so +this command takes care of the burden. If you are using a version control system, +it is recommended to commit the changes made by this command, and to tag the +commit with the new version. diff --git a/docs/src/creating-parsers/1-getting-started.md b/docs/src/creating-parsers/1-getting-started.md new file mode 100644 index 00000000..0dad6dd4 --- /dev/null +++ b/docs/src/creating-parsers/1-getting-started.md @@ -0,0 +1,132 @@ +# Getting Started + +## Dependencies + +To develop a Tree-sitter parser, there are two dependencies that you need to install: + +- **A JavaScript runtime** — Tree-sitter grammars are written in JavaScript, and Tree-sitter uses a JavaScript runtime +(the default being [Node.js][node.js]) to interpret JavaScript files. It requires this runtime command (default: `node`) +to be in one of the directories in your [`PATH`][path-env]. 
+
+- **A C Compiler** — Tree-sitter creates parsers that are written in C. To run and test these parsers with the
+`tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look
+for these compilers in the standard places for each platform.
+
+## Installation
+
+To create a Tree-sitter parser, you need to use [the `tree-sitter` CLI][tree-sitter-cli]. You can install the CLI in a few
+different ways:
+
+- Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager. This works
+on any platform. See [the contributing docs](../6-contributing.md#developing-tree-sitter) for more information.
+
+- Install the `tree-sitter-cli` [Rust crate][crate] from [crates.io][crates.io] using [`cargo`][cargo]. You can do so by
+running the following command: `cargo install tree-sitter-cli --locked`
+
+- Install the `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. This approach
+is fast, but only works on certain platforms, because it relies on pre-built binaries.
+
+- Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`.
+
+## Project Setup
+
+The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language, in lowercase.
+
+```sh
+mkdir tree-sitter-${LOWER_PARSER_NAME}
+cd tree-sitter-${LOWER_PARSER_NAME}
+```
+
+Note that the `LOWER_` prefix here means the "lowercase" name of the language.
+
+### Init
+
+Once you've installed the `tree-sitter` CLI tool, you can start setting up your project, which will allow your parser to
+be used from multiple languages.
+
+```sh
+# This will prompt you for input
+tree-sitter init
+```
+
+The `init` command will create a bunch of files in the project.
+There should be a file called `grammar.js` with the following contents: + +```js +/** + * @file PARSER_DESCRIPTION + * @author PARSER_AUTHOR_NAME PARSER_AUTHOR_EMAIL + * @license PARSER_LICENSE + */ + +/// +// @ts-check + +module.exports = grammar({ + name: 'LOWER_PARSER_NAME', + + rules: { + // TODO: add the actual grammar rules + source_file: $ => 'hello' + } +}); +``` + +Note that the placeholders shown above would be replaced with the corresponding data you provided in the `init` sub-command's +prompts. + +To learn more about this command, check the [reference page](../cli/init.md). + +### Generate + +Next, run the following command: + +```sh +tree-sitter generate +``` + +This will generate the C code required to parse this trivial language. + +You can test this parser by creating a source file with the contents "hello" and parsing it: + +```sh +echo 'hello' > example-file +tree-sitter parse example-file +``` + +Alternatively, in Windows PowerShell: + +```pwsh +"hello" | Out-File example-file -Encoding utf8 +tree-sitter parse example-file +``` + +This should print the following: + +```text +(source_file [0, 0] - [1, 0]) +``` + +You now have a working parser. + +Finally, look back at the [triple-slash][] and [`@ts-check`][ts-check] comments in `grammar.js`; these tell your editor +to provide documentation and type information as you edit your grammar. For these to work, you must download Tree-sitter's +TypeScript API from npm into a `node_modules` directory in your project: + +```sh +npm install # or your package manager of choice +``` + +To learn more about this command, check the [reference page](../cli/generate.md). 
+ +[cargo]: https://doc.rust-lang.org/cargo/getting-started/installation.html +[crate]: https://crates.io/crates/tree-sitter-cli +[crates.io]: https://crates.io/crates/tree-sitter-cli +[node-module]: https://www.npmjs.com/package/tree-sitter-cli +[node.js]: https://nodejs.org +[npm]: https://docs.npmjs.com +[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) +[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest +[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter/tree/master/cli +[triple-slash]: https://www.typescriptlang.org/docs/handbook/triple-slash-directives.html +[ts-check]: https://www.typescriptlang.org/docs/handbook/intro-to-js-ts.html diff --git a/docs/src/creating-parsers/2-the-grammar-dsl.md b/docs/src/creating-parsers/2-the-grammar-dsl.md new file mode 100644 index 00000000..095a425c --- /dev/null +++ b/docs/src/creating-parsers/2-the-grammar-dsl.md @@ -0,0 +1,132 @@ +# The Grammar DSL + +The following is a complete list of built-in functions you can use in your `grammar.js` to define rules. Use-cases for some +of these functions will be explained in more detail in later sections. + +- **Symbols (the `$` object)** — Every grammar rule is written as a JavaScript function that takes a parameter conventionally +called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule. Names starting with `$.MISSING` +or `$.UNEXPECTED` should be avoided as they have special meaning for the `tree-sitter test` command. +- **String and Regex literals** — The terminal symbols in a grammar are described using JavaScript strings and regular +expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; +it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing +regular expressions in your grammar. 
+- **Regex Limitations** — Only a subset of the Regex engine is actually
+supported. This is due to certain features like lookahead and lookaround assertions
+not being feasible to use in an LR(1) grammar, as well as certain flags being unnecessary
+for tree-sitter. However, plenty of features are supported by default:
+
+  - Character classes
+  - Character ranges
+  - Character sets
+  - Quantifiers
+  - Alternation
+  - Grouping
+  - Unicode character escapes
+  - Unicode property escapes
+
+- **Sequences : `seq(rule1, rule2, ...)`** — This function creates a rule that matches any number of other rules, one after
+another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf].
+
+- **Alternatives : `choice(rule1, rule2, ...)`** — This function creates a rule that matches *one* of a set of possible
+rules. The order of the arguments does not matter. This is analogous to the `|` (pipe) operator in EBNF notation.
+
+- **Repetitions : `repeat(rule)`** — This function creates a rule that matches *zero-or-more* occurrences of a given rule.
+It is analogous to the `{x}` (curly brace) syntax in EBNF notation.
+
+- **Repetitions : `repeat1(rule)`** — This function creates a rule that matches *one-or-more* occurrences of a given rule.
+The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used.
+
+- **Options : `optional(rule)`** — This function creates a rule that matches *zero or one* occurrence of a given rule.
+It is analogous to the `[x]` (square bracket) syntax in EBNF notation.
+
+- **Precedence : `prec(number, rule)`** — This function marks the given rule with a numerical precedence, which will be used
+to resolve [*LR(1) Conflicts*][lr-conflict] at parser-generation time. When two rules overlap in a way that represents either
+a true ambiguity or a *local* ambiguity given one token of lookahead, Tree-sitter will try to resolve the conflict by matching
+the rule with the higher precedence. 
The default precedence of all rules is zero. This works similarly to the +[precedence directives][yacc-prec] in Yacc grammars. + +- **Left Associativity : `prec.left([number], rule)`** — This function marks the given rule as left-associative (and optionally +applies a numerical precedence). When an LR(1) conflict arises in which all the rules have the same numerical precedence, +Tree-sitter will consult the rules' associativity. If there is a left-associative rule, Tree-sitter will prefer matching +a rule that ends *earlier*. This works similarly to [associativity directives][yacc-prec] in Yacc grammars. + +- **Right Associativity : `prec.right([number], rule)`** — This function is like `prec.left`, but it instructs Tree-sitter +to prefer matching a rule that ends *later*. + +- **Dynamic Precedence : `prec.dynamic(number, rule)`** — This function is similar to `prec`, but the given numerical precedence +is applied at *runtime* instead of at parser generation time. This is only necessary when handling a conflict dynamically +using the `conflicts` field in the grammar, and when there is a genuine *ambiguity*: multiple rules correctly match a given +piece of code. In that event, Tree-sitter compares the total dynamic precedence associated with each rule, and selects the +one with the highest total. This is similar to [dynamic precedence directives][bison-dprec] in Bison grammars. + +- **Tokens : `token(rule)`** — This function marks the given rule as producing only +a single token. Tree-sitter's default is to treat each String or RegExp literal +in the grammar as a separate token. Each token is matched separately by the lexer +and returned as its own leaf node in the tree. The `token` function allows you to +express a complex rule using the functions described above (rather than as a single +regular expression) but still have Tree-sitter treat it as a single token. +The token function will only accept terminal rules, so `token($.foo)` will not work. 
+You can think of it as a shortcut for squashing complex rules of strings or regexes +down to a single token. + +- **Immediate Tokens : `token.immediate(rule)`** — Usually, whitespace (and any other extras, such as comments) is optional +before each token. This function means that the token will only match if there is no whitespace. + +- **Aliases : `alias(rule, name)`** — This function causes the given rule to *appear* with an alternative name in the syntax +tree. If `name` is a *symbol*, as in `alias($.foo, $.bar)`, then the aliased rule will *appear* as a [named node][named-vs-anonymous-nodes] +called `bar`. And if `name` is a *string literal*, as in `alias($.foo, 'bar')`, then the aliased rule will appear as an +[anonymous node][named-vs-anonymous-nodes], as if the rule had been written as the simple string. + +- **Field Names : `field(name, rule)`** — This function assigns a *field name* to the child node(s) matched by the given +rule. In the resulting syntax tree, you can then use that field name to access specific children. + +- **Reserved Keywords : `reserved(wordset, rule)`** — This function will override the global reserved word set with the +one passed into the `wordset` parameter. This is useful for contextual keywords, such as `if` in JavaScript, which cannot +be used as a variable name in most contexts, but can be used as a property name. + +In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior +of the parser. + +- **`extras`** — an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and +comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify +`extras: $ => []` in your grammar. + +- **`inline`** — an array of rule names that should be automatically *removed* from the grammar by replacing all of their +usages with a copy of their definition. 
This is useful for rules that are used in multiple places but for which you *don't* +want to create syntax tree nodes at runtime. + +- **`conflicts`** — an array of arrays of rule names. Each inner array represents a set of rules that's involved in an +*LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use +the GLR algorithm to explore all the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick +the subtree whose corresponding rule has the highest total *dynamic precedence*. + +- **`externals`** — an array of token names which can be returned by an +[*external scanner*][external-scanners]. External scanners allow you to write custom C code which runs during the lexing +process to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions. + +- **`precedences`** — an array of arrays of strings, where each array of strings defines named precedence levels in descending +order. These names can be used in the `prec` functions to define precedence relative only to other names in the array, rather +than globally. Can only be used with parse precedence, not lexical precedence. + +- **`word`** — the name of a token that will match keywords to the +[keyword extraction][keyword-extraction] optimization. + +- **`supertypes`** — an array of hidden rule names which should be considered to be 'supertypes' in the generated +[*node types* file][static-node-types]. + +- **`reserved`** — similar in structure to the main `rules` property, an object of reserved word sets associated with an +array of reserved rules. The reserved rule in the array must be a terminal token meaning it must be a string, regex, or token, +or a terminal rule. The *first* reserved word set in the object is the global word set, meaning it applies to every rule +in every parse state. However, certain keywords are contextual, depending on the rule. 
For example, in JavaScript, keywords
+are typically not allowed as ordinary variables; however, they *can* be used as a property name. In this situation, the `reserved`
+function would be used, and the word set to pass in would be the name of the word set that is declared in the `reserved`
+object that corresponds to an empty array, signifying *no* keywords are reserved.
+
+[bison-dprec]: https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html
+[ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form
+[external-scanners]: ./4-external-scanners.md
+[keyword-extraction]: ./3-writing-the-grammar.md#keyword-extraction
+[lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables
+[named-vs-anonymous-nodes]: ../using-parsers/2-basic-parsing.md#named-vs-anonymous-nodes
+[static-node-types]: ../using-parsers/6-static-node-types.md
+[yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
diff --git a/docs/src/creating-parsers/3-writing-the-grammar.md b/docs/src/creating-parsers/3-writing-the-grammar.md
new file mode 100644
index 00000000..f86cacd6
--- /dev/null
+++ b/docs/src/creating-parsers/3-writing-the-grammar.md
@@ -0,0 +1,446 @@
+# Writing the Grammar
+
+Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe
+any given language. To produce a good Tree-sitter parser, you need to create a grammar with two important properties:
+
+1. **An intuitive structure** — Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds
+directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So to produce an easy-to-analyze tree, there
+should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. 
+This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts +like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. + +2. **A close adherence to LR(1)** — Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while +it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. +In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, +but _different_ from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] +commonly used in language specifications. + +It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly +into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. +The following sections will explain these adjustments in more depth. + +## The First Few Rules + +It's usually a good idea to find a formal specification for the language you're trying to parse. This specification will +most likely contain a context-free grammar. As you read through the rules of this CFG, you will probably discover a complex +and cyclic graph of relationships. It might be unclear how you should navigate this graph as you define your grammar. + +Although languages have very different constructs, their constructs can often be categorized in to similar groups like +_Declarations_, _Definitions_, _Statements_, _Expressions_, _Types_ and _Patterns_. In writing your grammar, a good first +step is to create just enough structure to include all of these basic _groups_ of symbols. For a language like Go, +you might start with something like this: + +```js +{ + // ... 
+ + rules: { + source_file: $ => repeat($._definition), + + _definition: $ => choice( + $.function_definition + // TODO: other kinds of definitions + ), + + function_definition: $ => seq( + 'func', + $.identifier, + $.parameter_list, + $._type, + $.block + ), + + parameter_list: $ => seq( + '(', + // TODO: parameters + ')' + ), + + _type: $ => choice( + 'bool' + // TODO: other kinds of types + ), + + block: $ => seq( + '{', + repeat($._statement), + '}' + ), + + _statement: $ => choice( + $.return_statement + // TODO: other kinds of statements + ), + + return_statement: $ => seq( + 'return', + $._expression, + ';' + ), + + _expression: $ => choice( + $.identifier, + $.number + // TODO: other kinds of expressions + ), + + identifier: $ => /[a-z]+/, + + number: $ => /\d+/ + } +} +``` + +One important fact to know up front is that the start rule for the grammar is the first property in the `rules` object. +In the example above, that would correspond to `source_file`, but it can be named anything. + +Some details of this grammar will be explained in more depth later on, but if you focus on the `TODO` comments, you can +see that the overall strategy is _breadth-first_. Notably, this initial skeleton does not need to directly match an exact +subset of the context-free grammar in the language specification. It just needs to touch on the major groupings of rules +in as simple and obvious a way as possible. + +With this structure in place, you can now freely decide what part of the grammar to flesh out next. For example, you might +decide to start with _types_. One-by-one, you could define the rules for writing basic types and composing them into more +complex types: + +```js +{ + // ... 
+ + _type: $ => choice( + $.primitive_type, + $.array_type, + $.pointer_type + ), + + primitive_type: $ => choice( + 'bool', + 'int' + ), + + array_type: $ => seq( + '[', + ']', + $._type + ), + + pointer_type: $ => seq( + '*', + $._type + ) +} +``` + +After developing the _type_ sublanguage a bit further, you might decide to switch to working on _statements_ or _expressions_ +instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`. + +**And remember to add tests for each rule in your `test/corpus` folder!** + +## Structuring Rules Well + +Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. Naively, you might +try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. To illustrate the problem with this +approach, consider the following line of code: + +```js +return x + y; +``` + +According to the specification, this line is a `ReturnStatement`, the fragment `x + y` is an `AdditiveExpression`, +and `x` and `y` are both `IdentifierReferences`. 
The relationship between these constructs is captured by a complex series +of production rules: + +```text +ReturnStatement -> 'return' Expression +Expression -> AssignmentExpression +AssignmentExpression -> ConditionalExpression +ConditionalExpression -> LogicalORExpression +LogicalORExpression -> LogicalANDExpression +LogicalANDExpression -> BitwiseORExpression +BitwiseORExpression -> BitwiseXORExpression +BitwiseXORExpression -> BitwiseANDExpression +BitwiseANDExpression -> EqualityExpression +EqualityExpression -> RelationalExpression +RelationalExpression -> ShiftExpression +ShiftExpression -> AdditiveExpression +AdditiveExpression -> MultiplicativeExpression +MultiplicativeExpression -> ExponentiationExpression +ExponentiationExpression -> UnaryExpression +UnaryExpression -> UpdateExpression +UpdateExpression -> LeftHandSideExpression +LeftHandSideExpression -> NewExpression +NewExpression -> MemberExpression +MemberExpression -> PrimaryExpression +PrimaryExpression -> IdentifierReference +``` + +The language spec encodes the twenty different precedence levels of JavaScript expressions using twenty levels of indirection +between `IdentifierReference` and `Expression`. If we were to create a concrete syntax tree representing this statement +according to the language spec, it would have twenty levels of nesting, and it would contain nodes with names like `BitwiseXORExpression`, +which are unrelated to the actual code. + +## Using Precedence + +To produce a readable syntax tree, we'd like to model JavaScript expressions using a much flatter structure like this: + +```js +{ + // ... + + _expression: $ => choice( + $.identifier, + $.unary_expression, + $.binary_expression, + // ... + ), + + unary_expression: $ => choice( + seq('-', $._expression), + seq('!', $._expression), + // ... + ), + + binary_expression: $ => choice( + seq($._expression, '*', $._expression), + seq($._expression, '+', $._expression), + // ... 
+ ), +} +``` + +Of course, this flat structure is highly ambiguous. If we try to generate a parser, Tree-sitter gives us an error message: + +```text +Error: Unresolved conflict for symbol sequence: + + '-' _expression • '*' … + +Possible interpretations: + + 1: '-' (binary_expression _expression • '*' _expression) + 2: (unary_expression '-' _expression) • '*' … + +Possible resolutions: + + 1: Specify a higher precedence in `binary_expression` than in the other rules. + 2: Specify a higher precedence in `unary_expression` than in the other rules. + 3: Specify a left or right associativity in `unary_expression` + 4: Add a conflict for these rules: `binary_expression` `unary_expression` +``` + +
+The • character in the error message indicates where exactly during +parsing the conflict occurs, or in other words, where the parser is encountering +ambiguity. +
+ +For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This +is where the `prec` function [described in the previous page][grammar dsl] comes into play. By wrapping a rule with `prec`, +we can indicate that certain sequence of symbols should _bind to each other more tightly_ than others. For example, the +`'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` +sequence in `binary_expression`: + +```js +{ + // ... + + unary_expression: $ => + prec( + 2, + choice( + seq("-", $._expression), + seq("!", $._expression), + // ... + ), + ); +} +``` + +## Using Associativity + +Applying a higher precedence in `unary_expression` fixes that conflict, but there is still another conflict: + +```text +Error: Unresolved conflict for symbol sequence: + + _expression '*' _expression • '*' … + +Possible interpretations: + + 1: _expression '*' (binary_expression _expression • '*' _expression) + 2: (binary_expression _expression '*' _expression) • '*' … + +Possible resolutions: + + 1: Specify a left or right associativity in `binary_expression` + 2: Add a conflict for these rules: `binary_expression` +``` + +For an expression like `a * b * c`, it's not clear whether we mean `a * (b * c)` or `(a * b) * c`. +This is where `prec.left` and `prec.right` come into use. We want to select the second interpretation, so we use `prec.left`. + +```js +{ + // ... + + binary_expression: $ => choice( + prec.left(2, seq($._expression, '*', $._expression)), + prec.left(1, seq($._expression, '+', $._expression)), + // ... + ), +} +``` + +## Hiding Rules + +You may have noticed in the above examples that some grammar rule name like `_expression` and `_type` began with an underscore. +Starting a rule's name with an underscore causes the rule to be _hidden_ in the syntax tree. 
This is useful for rules like +`_expression` in the grammars above, which always just wrap a single child node. If these nodes were not hidden, they would +add substantial depth and noise to the syntax tree without making it any easier to understand. + +## Using Fields + +Often, it's easier to analyze a syntax node if you can refer to its children by _name_ instead of by their position in an +ordered list. Tree-sitter grammars support this using the `field` function. This function allows you to assign unique names +to some or all of a node's children: + +```js +function_definition: $ => + seq( + "func", + field("name", $.identifier), + field("parameters", $.parameter_list), + field("return_type", $._type), + field("body", $.block), + ); +``` + +Adding fields like this allows you to retrieve nodes using the [field APIs][field-names-section]. + +# Lexical Analysis + +Tree-sitter's parsing process is divided into two phases: parsing (which is described above) and [lexing][lexing] — the +process of grouping individual characters into the language's fundamental _tokens_. There are a few important things to +know about how Tree-sitter's lexing works. + +## Conflicting Tokens + +Grammars often contain multiple tokens that can match the same characters. For example, a grammar might contain the tokens +(`"if"` and `/[a-z]+/`). Tree-sitter differentiates between these conflicting tokens in a few ways. + +1. **Context-aware Lexing** — Tree-sitter performs lexing on-demand, during the parsing process. At any given position +in a source document, the lexer only tries to recognize tokens that are _valid_ at that position in the document. + +2. **Lexical Precedence** — When the precedence functions described [in the previous page][grammar dsl] are used _within_ +the `token` function, the given explicit precedence values serve as instructions to the lexer. 
If there are two valid tokens +that match the characters at a given position in the document, Tree-sitter will select the one with the higher precedence. + +3. **Match Length** — If multiple valid tokens with the same precedence match the characters at a given position in a document, +Tree-sitter will select the token that matches the [longest sequence of characters][longest-match]. + +4. **Match Specificity** — If there are two valid tokens with the same precedence, and they both match the same number +of characters, Tree-sitter will prefer a token that is specified in the grammar as a `String` over a token specified as +a `RegExp`. + +5. **Rule Order** — If none of the above criteria can be used to select one token over another, Tree-sitter will prefer +the token that appears earlier in the grammar. + +If there is an external scanner it may have [an additional impact][external scanner] over regular tokens +defined in the grammar. + +## Lexical Precedence vs. Parse Precedence + +One common mistake involves not distinguishing _lexical precedence_ from _parse precedence_. Parse precedence determines +which rule is chosen to interpret a given sequence of tokens. _Lexical precedence_ determines which token is chosen to interpret +at a given position of text, and it is a lower-level operation that is done first. The above list fully captures Tree-sitter's +lexical precedence rules, and you will probably refer back to this section of the documentation more often than any other. +Most of the time when you really get stuck, you're dealing with a lexical precedence problem. Pay particular attention to +the difference in meaning between using `prec` inside the `token` function versus outside it. The _lexical precedence_ syntax +is `token(prec(N, ...))`. + +## Keywords + +Many languages have a set of _keyword_ tokens (e.g. `if`, `for`, `return`), as well as a more general token (e.g. `identifier`) +that matches any word, including many of the keyword strings. 
For example, JavaScript has a keyword `instanceof`, which is +used as a binary operator, like this: + +```js +if (a instanceof Something) b(); +``` + +The following, however, is not valid JavaScript: + +```js +if (a instanceofSomething) b(); +``` + +A keyword like `instanceof` cannot be followed immediately by another letter, because then it would be tokenized as an `identifier`, +**even though an identifier is not valid at that position**. Because Tree-sitter uses context-aware lexing, as described +[above](#conflicting-tokens), it would not normally impose this restriction. By default, Tree-sitter would recognize `instanceofSomething` +as two separate tokens: the `instanceof` keyword followed by an `identifier`. + +## Keyword Extraction + +Fortunately, Tree-sitter has a feature that allows you to fix this, so that you can match the behavior of other standard +parsers: the `word` token. If you specify a `word` token in your grammar, Tree-sitter will find the set of _keyword_ tokens +that match strings also matched by the `word` token. Then, during lexing, instead of matching each of these keywords individually, +Tree-sitter will match the keywords via a two-step process where it _first_ matches the `word` token. + +For example, suppose we added `identifier` as the `word` token in our JavaScript grammar: + +```js +grammar({ + name: "javascript", + + word: $ => $.identifier, + + rules: { + _expression: $ => + choice( + $.identifier, + $.unary_expression, + $.binary_expression, + // ... + ), + + binary_expression: $ => + choice( + prec.left(1, seq($._expression, "instanceof", $._expression)), + // ... + ), + + unary_expression: $ => + choice( + prec.left(2, seq("typeof", $._expression)), + // ... + ), + + identifier: $ => /[a-z_]+/, + }, +}); +``` + +Tree-sitter would identify `typeof` and `instanceof` as keywords. 
Then, when parsing the invalid code above, rather than +scanning for the `instanceof` token individually, it would scan for an `identifier` first, and find `instanceofSomething`. +It would then correctly recognize the code as invalid. + +Aside from improving error detection, keyword extraction also has performance benefits. It allows Tree-sitter to generate +a smaller, simpler lexing function, which means that **the parser will compile much more quickly**. + +[ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar +[antlr]: https://www.antlr.org +[bison]: https://en.wikipedia.org/wiki/GNU_bison +[cst]: https://en.wikipedia.org/wiki/Parse_tree +[ecmascript-spec]: https://262.ecma-international.org/6.0/ +[external scanner]: ./4-external-scanners.md#other-external-scanner-details +[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser +[grammar dsl]: ./2-the-grammar-dsl.md +[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification +[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis +[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch +[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser +[field-names-section]: ../using-parsers/2-basic-parsing.md#node-field-names +[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols +[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar +[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript +[yacc]: https://en.wikipedia.org/wiki/Yacc diff --git a/docs/src/creating-parsers/4-external-scanners.md b/docs/src/creating-parsers/4-external-scanners.md new file mode 100644 index 00000000..13e622e9 --- /dev/null +++ b/docs/src/creating-parsers/4-external-scanners.md @@ -0,0 +1,376 @@ +# External Scanners + +Many languages have some tokens whose structure is impossible or inconvenient to describe with a regular expression. 
+Some examples:
+
+- [Indent and dedent][indent-tokens] tokens in Python
+- [Heredocs][heredoc] in Bash and Ruby
+- [Percent strings][percent-string] in Ruby
+
+Tree-sitter allows you to handle these kinds of tokens using _external scanners_. An external scanner is a set of C functions
+that you, the grammar author, can write by hand to add custom logic for recognizing certain tokens.
+
+To use an external scanner, there are a few steps. First, add an `externals` section to your grammar. This section should
+list the names of all of your external tokens. These names can then be used elsewhere in your grammar.
+
+```js
+grammar({
+  name: "my_language",
+
+  externals: $ => [$.indent, $.dedent, $.newline],
+
+  // ...
+});
+```
+
+Then, add another C source file to your project. Its path must be `src/scanner.c` for the CLI to recognize it. Be sure to add
+this file to the sources section of your `binding.gyp` file so that it will be included when your project is compiled by
+Node.js and uncomment the appropriate block in your `bindings/rust/build.rs` file so that it will be included in your Rust
+crate.
+
+In this new source file, define an [`enum`][enum] type containing the names of all of your external tokens. The ordering
+of this enum must match the order in your grammar's `externals` array; the actual names do not matter.
+
+```c
+#include "tree_sitter/parser.h"
+#include "tree_sitter/alloc.h"
+#include "tree_sitter/array.h"
+
+enum TokenType {
+  INDENT,
+  DEDENT,
+  NEWLINE
+};
+```
+
+Finally, you must define five functions with specific names, based on your language's name and five actions:
+_create_, _destroy_, _serialize_, _deserialize_, and _scan_.
+
+## Create
+
+```c
+void * tree_sitter_my_language_external_scanner_create() {
+  // ...
+}
+```
+
+This function should create your scanner object. It will only be called once anytime your language is set on a parser.
+Often, you will want to allocate memory on the heap and return a pointer to it. 
If your external scanner doesn't need to +maintain any state, it's ok to return `NULL`. + +## Destroy + +```c +void tree_sitter_my_language_external_scanner_destroy(void *payload) { + // ... +} +``` + +This function should free any memory used by your scanner. It is called once when a parser is deleted or assigned a different +language. It receives as an argument the same pointer that was returned from the _create_ function. If your _create_ function +didn't allocate any memory, this function can be a noop. + +## Serialize + +```c +unsigned tree_sitter_my_language_external_scanner_serialize( + void *payload, + char *buffer +) { + // ... +} +``` + +This function should copy the complete state of your scanner into a given byte buffer, and return the number of bytes written. +The function is called every time the external scanner successfully recognizes a token. It receives a pointer to your scanner +and a pointer to a buffer. The maximum number of bytes that you can write is given by the `TREE_SITTER_SERIALIZATION_BUFFER_SIZE` +constant, defined in the `tree_sitter/parser.h` header file. + +The data that this function writes will ultimately be stored in the syntax tree so that the scanner can be restored to the +right state when handling edits or ambiguities. For your parser to work correctly, the `serialize` function must store its +entire state, and `deserialize` must restore the entire state. For good performance, you should design your scanner so that +its state can be serialized as quickly and compactly as possible. + +## Deserialize + +```c +void tree_sitter_my_language_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) { + // ... +} +``` + +This function should _restore_ the state of your scanner based the bytes that were previously written by the `serialize` +function. It is called with a pointer to your scanner, a pointer to the buffer of bytes, and the number of bytes that should +be read. 
It is good practice to explicitly erase your scanner state variables at the start of this function, before restoring +their values from the byte buffer. + +## Scan + +```c +bool tree_sitter_my_language_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols +) { + // ... +} +``` + +This function is responsible for recognizing external tokens. It should return `true` if a token was recognized, and `false` +otherwise. It is called with a "lexer" struct with the following fields: + +- **`int32_t lookahead`** — The current next character in the input stream, represented as a 32-bit unicode code point. + +- **`TSSymbol result_symbol`** — The symbol that was recognized. Your scan function should _assign_ to this field one of +the values from the `TokenType` enum, described above. + +- **`void (*advance)(TSLexer *, bool skip)`** — A function for advancing to the next character. If you pass `true` for +the second argument, the current character will be treated as whitespace; whitespace won't be included in the text range +associated with tokens emitted by the external scanner. + +- **`void (*mark_end)(TSLexer *)`** — A function for marking the end of the recognized token. This allows matching tokens +that require multiple characters of lookahead. By default, (if you don't call `mark_end`), any character that you moved past +using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls +to `advance` will _not_ increase the size of the returned token. You can call `mark_end` multiple times to increase the size +of the token. + +- **`uint32_t (*get_column)(TSLexer *)`** — A function for querying the current column position of the lexer. It returns +the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this +function by reading from the start of the line. 
+ +- **`bool (*is_at_included_range_start)(const TSLexer *)`** — A function for checking whether the parser has just skipped +some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function +(described in the [multi-language document section][multi-language-section]), the scanner may want to apply some special +behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses +this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. + +- **`bool (*eof)(const TSLexer *)`** — A function for determining whether the lexer is at the end of the file. The value +of `lookahead` will be `0` at the end of a file, but this function should be used instead of checking for that value because +the `0` or "NUL" value is also a valid character that could be present in the file being parsed. + +The third argument to the `scan` function is an array of booleans that indicates which of external tokens are expected by +the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot +backtrack, so you may need to combine certain pieces of logic. + +```c +if (valid_symbols[INDENT] || valid_symbols[DEDENT]) { + + // ... logic that is common to both `INDENT` and `DEDENT` + + if (valid_symbols[INDENT]) { + + // ... logic that is specific to `INDENT` + + lexer->result_symbol = INDENT; + return true; + } +} +``` + +## External Scanner Helpers + +### Allocator + +Instead of using libc's `malloc`, `calloc`, `realloc`, and `free`, you should use the versions prefixed with `ts_` from `tree_sitter/alloc.h`. +These macros can allow a potential consumer to override the default allocator with their own implementation, but by default +will use the libc functions. 
+
+As a consumer of the tree-sitter core library as well as any parser libraries that might use allocations, you can enable
+overriding the default allocator and have it use the same one as the library allocator, which you can set with `ts_set_allocator`.
+To enable this overriding in scanners, you must compile them with the `TREE_SITTER_REUSE_ALLOCATOR` macro defined, and the
+tree-sitter library must be linked into your final app dynamically, since it needs to resolve the internal functions at runtime.
+If you are compiling an executable binary that uses the core library, but want to load parsers dynamically at runtime, then
+you will have to use a special linker flag on Unix. For non-Darwin systems, that would be `--dynamic-list` and for Darwin
+systems, that would be `-exported_symbols_list`. The CLI does exactly this, so you can use it as a reference (check out `cli/build.rs`).
+
+For example, assuming you wanted to allocate 100 bytes for your scanner, you'd do so like the following example:
+
+```c
+#include "tree_sitter/parser.h"
+#include "tree_sitter/alloc.h"
+
+// ...
+
+void* tree_sitter_my_language_external_scanner_create() {
+  return ts_calloc(100, 1); // or ts_malloc(100)
+}
+
+// ...
+
+```
+
+### Arrays
+
+If you need to use array-like types in your scanner, such as tracking a stack of indentations or tags, you should use the
+array macros from `tree_sitter/array.h`.
+
+There are quite a few of them provided for you, but here's how you could get started tracking some integers. Check out the header
+itself for more detailed documentation.
+
+<div class="warning">
+Do not use any of the array functions or macros that are prefixed with an underscore and have comments saying +that it is not what you are looking for. These are internal functions used as helpers by other macros that are public. +They are not meant to be used directly, nor are they what you want. +
+
+```c
+#include "tree_sitter/parser.h"
+#include "tree_sitter/array.h"
+
+enum TokenType {
+  INDENT,
+  DEDENT,
+  NEWLINE,
+  STRING,
+};
+
+// Create the array in your create function
+
+void* tree_sitter_my_language_external_scanner_create() {
+  return ts_calloc(1, sizeof(Array(int)));
+
+  // or if you want to zero out the memory yourself
+
+  Array(int) *stack = ts_malloc(sizeof(Array(int)));
+  array_init(stack);
+  return stack;
+}
+
+bool tree_sitter_my_language_external_scanner_scan(
+  void *payload,
+  TSLexer *lexer,
+  const bool *valid_symbols
+) {
+  Array(int) *stack = payload;
+  if (valid_symbols[INDENT]) {
+    array_push(stack, lexer->get_column(lexer));
+    lexer->result_symbol = INDENT;
+    return true;
+  }
+  if (valid_symbols[DEDENT]) {
+    array_pop(stack); // this returns the popped element by value, but we don't need it
+    lexer->result_symbol = DEDENT;
+    return true;
+  }
+
+  // we can also use an array on the stack to keep track of a string
+
+  Array(char) next_string = array_new();
+
+  if (valid_symbols[STRING] && lexer->lookahead == '"') {
+    lexer->advance(lexer, false);
+    while (lexer->lookahead != '"' && lexer->lookahead != '\n' && !lexer->eof(lexer)) {
+      array_push(&next_string, lexer->lookahead);
+      lexer->advance(lexer, false);
+    }
+
+    // assume we have some arbitrary constraint of not having more than 100 characters in a string
+    if (lexer->lookahead == '"' && next_string.size <= 100) {
+      lexer->advance(lexer, false);
+      lexer->result_symbol = STRING;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+```
+
+## Other External Scanner Details
+
+External scanners have priority over Tree-sitter's normal lexing process. When a token listed in the externals array is valid
+at a given position, the external scanner is called first. This makes external scanners a powerful way to override Tree-sitter's
+default lexing behavior, especially for cases that can't be handled with regular lexical rules, parsing, or dynamic precedence. 
+
+During error recovery, Tree-sitter's first step is to call the external scanner's scan function with all tokens marked as
+valid. Your scanner should detect and handle this case appropriately. One simple approach is to add an unused "sentinel"
+token at the end of your externals array:
+
+```js
+{
+  name: "my_language",
+
+  externals: $ => [$.token1, $.token2, $.error_sentinel]
+
+  // ...
+}
+```
+
+You can then check if this sentinel token is marked valid to determine if Tree-sitter is in error recovery mode.
+
+If you would rather not handle the error recovery case explicitly, the easiest way to "opt-out" and let tree-sitter's internal
+lexer handle it is to return `false` from your scan function when `valid_symbols` contains the error sentinel.
+
+```c
+bool tree_sitter_my_language_external_scanner_scan(
+  void *payload,
+  TSLexer *lexer,
+  const bool *valid_symbols
+) {
+  if (valid_symbols[ERROR_SENTINEL]) {
+    return false;
+  }
+  // ...
+}
+```
+
+When you include literal keywords in the externals array, for example:
+
+```js
+externals: $ => ['if', 'then', 'else']
+```
+
+_those_ keywords will
+be tokenized by the external scanner whenever they appear in the grammar.
+
+This is equivalent to declaring named tokens and aliasing them:
+
+```js
+{
+  name: "my_language",
+
+  externals: $ => [$.if_keyword, $.then_keyword, $.else_keyword],
+
+  rules: {
+
+    // then using it in a rule like so:
+    if_statement: $ => seq(alias($.if_keyword, 'if'), ...),
+
+    // ...
+  }
+}
+```
+
+The tokenization process for external keywords works as follows:
+
+1. The external scanner attempts to recognize the token first
+2. If the scanner returns true and sets a token, that token is used
+3. If the scanner returns false, Tree-sitter falls back to its internal lexer
+
+However, when you use rule references (like `$.if_keyword`) in the externals array without defining the corresponding rules
+in the grammar, Tree-sitter cannot fall back to its internal lexer. 
In this case, the external scanner is solely responsible +for recognizing these tokens. + +
+ +**Important Warnings** + +⚠️ External scanners can easily create infinite loops + +⚠️ Be extremely careful when emitting zero-width tokens + +⚠️ Always use the `eof` function when looping through characters + +
+ +[ejs]: https://ejs.co +[enum]: https://en.wikipedia.org/wiki/Enumerated_type#C +[heredoc]: https://en.wikipedia.org/wiki/Here_document +[indent-tokens]: https://en.wikipedia.org/wiki/Off-side_rule +[multi-language-section]: ../using-parsers/3-advanced-parsing.md#multi-language-documents +[percent-string]: https://docs.ruby-lang.org/en/2.5.0/doc/syntax/literals_rdoc.html#label-Percent+Strings diff --git a/docs/src/creating-parsers/5-writing-tests.md b/docs/src/creating-parsers/5-writing-tests.md new file mode 100644 index 00000000..f8c1ac91 --- /dev/null +++ b/docs/src/creating-parsers/5-writing-tests.md @@ -0,0 +1,163 @@ +# Writing Tests + +For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look +when parsing that rule. These tests are written using specially-formatted text files in the `test/corpus/` directory within +your parser's root folder. + +For example, you might have a file called `test/corpus/statements.txt` that contains a series of entries like this: + +```text +================== +Return statements +================== + +func x() int { + return 1; +} + +--- + +(source_file + (function_definition + (identifier) + (parameter_list) + (primitive_type) + (block + (return_statement (number))))) +``` + +* The **name** of each test is written between two lines containing only `=` (equal sign) characters. + +* Then the **input source code** is written, followed by a line containing three or more `-` (dash) characters. + +* Then, the **expected output syntax tree** is written as an [S-expression][s-exp]. The exact placement of whitespace in +the S-expression doesn't matter, but ideally the syntax tree should be legible. Note that the S-expression does not show +syntax nodes like `func`, `(` and `;`, which are expressed as strings and regexes in the grammar. It only shows the *named* +nodes, as described in [this section][named-vs-anonymous-nodes] of the page on parser usage. 
+ + The expected output section can also *optionally* show the [*field names*][node-field-names] associated with each child + node. To include field names in your tests, you write a node's field name followed by a colon, before the node itself in + the S-expression: + +```query +(source_file + (function_definition + name: (identifier) + parameters: (parameter_list) + result: (primitive_type) + body: (block + (return_statement (number))))) +``` + +* If your language's syntax conflicts with the `===` and `---` test separators, you can optionally add an arbitrary identical +suffix (in the below example, `|||`) to disambiguate them: + +```text +==================||| +Basic module +==================||| + +---- MODULE Test ---- +increment(n) == n + 1 +==== + +---||| + +(source_file + (module (identifier) + (operator (identifier) + (parameter_list (identifier)) + (plus (identifier_ref) (number))))) +``` + +These tests are important. They serve as the parser's API documentation, and they can be run every time you change the grammar +to verify that everything still parses correctly. + +By default, the `tree-sitter test` command runs all the tests in your `test/corpus/` folder. To run a particular test, you +can use the `-f` flag: + +```sh +tree-sitter test -f 'Return statements' +``` + +The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `test/corpus` +directory. It's typically a good idea to test all the permutations of each language construct. This increases test coverage, +but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language. + +## Attributes + +Tests can be annotated with a few `attributes`. Attributes must be put in the header, below the test name, and start with +a `:`. A couple of attributes also take in a parameter, which require the use of parenthesis. + +**Note**: If you'd like to supply in multiple parameters, e.g. 
to run tests on multiple platforms or to test multiple languages,
+you can repeat the attribute on a new line.
+
+The following attributes are available:
+
+* `:skip` — This attribute will skip the test when running `tree-sitter test`.
+  This is useful when you want to temporarily disable running a test without deleting it.
+* `:error` — This attribute will assert that the parse tree contains an error. It's useful to just validate that a certain
+input is invalid without displaying the whole parse tree, as such you should omit the parse tree below the `---` line.
+* `:fail-fast` — This attribute will stop running additional tests if the test marked with this attribute fails.
+* `:language(LANG)` — This attribute will run the tests using the parser for the specified language. This is useful for
+multi-parser repos, such as XML and DTD, or Typescript and TSX. The default parser used will always be the first entry in
+the `grammars` field in the `tree-sitter.json` config file, so having a way to pick a second or even third parser is useful.
+* `:platform(PLATFORM)` — This attribute specifies the platform on which the test should run. It is useful to test platform-specific
+behavior (e.g. Windows newlines are different from Unix). This attribute must match up with Rust's [`std::env::consts::OS`][constants].
+ +Examples using attributes: + +```text +========================= +Test that will be skipped +:skip +========================= + +int main() {} + +------------------------- + +==================================== +Test that will run on Linux or macOS + +:platform(linux) +:platform(macos) +==================================== + +int main() {} + +------------------------------------ + +======================================================================== +Test that expects an error, and will fail fast if there's no parse error +:fail-fast +:error +======================================================================== + +int main ( {} + +------------------------------------------------------------------------ + +================================================= +Test that will parse with both Typescript and TSX +:language(typescript) +:language(tsx) +================================================= + +console.log('Hello, world!'); + +------------------------------------------------- +``` + +### Automatic Compilation + +You might notice that the first time you run `tree-sitter test` after regenerating your parser, it takes some extra time. +This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. It recompiles your parser +as-needed whenever you update it by re-running `tree-sitter generate`, or whenever the [external scanner][external-scanners] +file is changed. 
+ +[constants]: https://doc.rust-lang.org/std/env/consts/constant.OS.html +[external-scanners]: ./4-external-scanners.md +[named-vs-anonymous-nodes]: ../using-parsers/2-basic-parsing.md#named-vs-anonymous-nodes +[node-field-names]: ../using-parsers/2-basic-parsing.md#node-field-names +[s-exp]: https://en.wikipedia.org/wiki/S-expression diff --git a/docs/src/creating-parsers/index.md b/docs/src/creating-parsers/index.md new file mode 100644 index 00000000..478cbeeb --- /dev/null +++ b/docs/src/creating-parsers/index.md @@ -0,0 +1,4 @@ +# Creating parsers + +Developing Tree-sitter grammars can have a difficult learning curve, but once you get the hang of it, it can be fun and even +zen-like. This document will help you to get started and to develop a useful mental model. diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 00000000..9689d1df --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,91 @@ +# Introduction + +Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. 
Tree-sitter aims to be: + +- **General** enough to parse any programming language +- **Fast** enough to parse on every keystroke in a text editor +- **Robust** enough to provide useful results even in the presence of syntax errors +- **Dependency-free** so that the runtime library (which is written in pure [C11](https://github.com/tree-sitter/tree-sitter/tree/master/lib)) can be embedded in any application + +### Language Bindings + +There are currently bindings that allow Tree-sitter to be used from the following languages: + +#### Official + +- [C#](https://github.com/tree-sitter/csharp-tree-sitter) +- [Go](https://github.com/tree-sitter/go-tree-sitter) +- [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) +- [Java (JDK 22)](https://github.com/tree-sitter/java-tree-sitter) +- [JavaScript (Node.js)](https://github.com/tree-sitter/node-tree-sitter) +- [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) +- [Kotlin](https://github.com/tree-sitter/kotlin-tree-sitter) +- [Python](https://github.com/tree-sitter/py-tree-sitter) +- [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) +- [Zig](https://github.com/tree-sitter/zig-tree-sitter) + +#### Third-party + +- [Delphi](https://github.com/modersohn/delphi-tree-sitter) +- [ELisp](https://www.gnu.org/software/emacs/manual/html_node/elisp/Parsing-Program-Source.html) +- [Guile](https://github.com/Z572/guile-ts) +- [Java (JDK 8+)](https://github.com/bonede/tree-sitter-ng) +- [Java (JDK 11+)](https://github.com/seart-group/java-tree-sitter) +- [Julia](https://github.com/MichaelHatherly/TreeSitter.jl) +- [Lua](https://github.com/euclidianAce/ltreesitter) +- [Lua](https://github.com/xcb-xwii/lua-tree-sitter) +- [OCaml](https://github.com/returntocorp/ocaml-tree-sitter-core) +- [Odin](https://github.com/laytan/odin-tree-sitter) +- [Perl](https://metacpan.org/pod/Text::Treesitter) +- [R](https://github.com/DavisVaughan/r-tree-sitter) +- 
[Ruby](https://github.com/Faveod/ruby-tree-sitter) +- [Ruby](https://github.com/calicoday/ruby-tree-sitter-ffi) +- [Swift](https://github.com/ChimeHQ/SwiftTreeSitter) + +### Parsers + +The following parsers can be found in the upstream organization: + +- [Agda](https://github.com/tree-sitter/tree-sitter-agda) +- [Bash](https://github.com/tree-sitter/tree-sitter-bash) +- [C](https://github.com/tree-sitter/tree-sitter-c) +- [C++](https://github.com/tree-sitter/tree-sitter-cpp) +- [C#](https://github.com/tree-sitter/tree-sitter-c-sharp) +- [CSS](https://github.com/tree-sitter/tree-sitter-css) +- [ERB / EJS](https://github.com/tree-sitter/tree-sitter-embedded-template) +- [Go](https://github.com/tree-sitter/tree-sitter-go) +- [Haskell](https://github.com/tree-sitter/tree-sitter-haskell) +- [HTML](https://github.com/tree-sitter/tree-sitter-html) +- [Java](https://github.com/tree-sitter/tree-sitter-java) +- [JavaScript](https://github.com/tree-sitter/tree-sitter-javascript) +- [JSDoc](https://github.com/tree-sitter/tree-sitter-jsdoc) +- [JSON](https://github.com/tree-sitter/tree-sitter-json) +- [Julia](https://github.com/tree-sitter/tree-sitter-julia) +- [OCaml](https://github.com/tree-sitter/tree-sitter-ocaml) +- [PHP](https://github.com/tree-sitter/tree-sitter-php) +- [Python](https://github.com/tree-sitter/tree-sitter-python) +- [Regex](https://github.com/tree-sitter/tree-sitter-regex) +- [Ruby](https://github.com/tree-sitter/tree-sitter-ruby) +- [Rust](https://github.com/tree-sitter/tree-sitter-rust) +- [Scala](https://github.com/tree-sitter/tree-sitter-scala) +- [TypeScript](https://github.com/tree-sitter/tree-sitter-typescript) +- [Verilog](https://github.com/tree-sitter/tree-sitter-verilog) + +A list of known parsers can be found in the [wiki](https://github.com/tree-sitter/tree-sitter/wiki/List-of-parsers). 
+ +### Talks on Tree-sitter + +- [Strange Loop 2018](https://www.thestrangeloop.com/2018/tree-sitter---a-new-parsing-system-for-programming-tools.html) +- [FOSDEM 2018](https://www.youtube.com/watch?v=0CGzC_iss-8) +- [GitHub Universe 2017](https://www.youtube.com/watch?v=a1rC79DHpmY) + +### Underlying Research + +The design of Tree-sitter was greatly influenced by the following research papers: + +- [Practical Algorithms for Incremental Software Development Environments](https://www2.eecs.berkeley.edu/Pubs/TechRpts/1997/CSD-97-946.pdf) +- [Context Aware Scanning for Parsing Extensible Languages](https://www-users.cse.umn.edu/~evw/pubs/vanwyk07gpce/vanwyk07gpce.pdf) +- [Efficient and Flexible Incremental Parsing](https://harmonia.cs.berkeley.edu/papers/twagner-parsing.pdf) +- [Incremental Analysis of Real Programming Languages](https://harmonia.cs.berkeley.edu/papers/twagner-glr.pdf) +- [Error Detection and Recovery in LR Parsers](https://web.archive.org/web/20240302031213/https://what-when-how.com/compiler-writing/bottom-up-parsing-compiler-writing-part-13) +- [Error Recovery for LR Parsers](https://apps.dtic.mil/sti/pdfs/ADA043470.pdf) diff --git a/docs/src/using-parsers/1-getting-started.md b/docs/src/using-parsers/1-getting-started.md new file mode 100644 index 00000000..803cc3e8 --- /dev/null +++ b/docs/src/using-parsers/1-getting-started.md @@ -0,0 +1,134 @@ +# Getting Started + +## Building the Library + +To build the library on a POSIX system, just run `make` in the Tree-sitter directory. This will create a static library +called `libtree-sitter.a` as well as dynamic libraries. + +Alternatively, you can incorporate the library in a larger project's build system by adding one source file to the build. 
+This source file needs two directories to be in the include path when compiled:
+
+**source file:**
+
+- `tree-sitter/lib/src/lib.c`
+
+**include directories:**
+
+- `tree-sitter/lib/src`
+- `tree-sitter/lib/include`
+
+## The Basic Objects
+
+There are four main types of objects involved when using Tree-sitter: languages, parsers, syntax trees, and syntax nodes.
+In C, these are called `TSLanguage`, `TSParser`, `TSTree`, and `TSNode`.
+
+- A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage`
+is generated by Tree-sitter. Many languages are already available in separate git repositories within the
+[Tree-sitter GitHub organization][ts org] and the [Tree-sitter grammars GitHub organization][tsg org].
+See [the next section][creating parsers] for how to create new languages.
+
+- A `TSParser` is a stateful object that can be assigned a `TSLanguage` and used to produce a `TSTree` based on some
+source code.
+
+- A `TSTree` represents the syntax tree of an entire source code file. It contains `TSNode` instances that indicate the
+structure of the source code. It can also be edited and used to produce a new `TSTree` in the event that the
+source code changes.
+
+- A `TSNode` represents a single node in the syntax tree. It tracks its start and end positions in the source code, as
+well as its relation to other nodes like its parent, siblings and children.
+
+## An Example Program
+
+Here's an example of a simple C program that uses the Tree-sitter [JSON parser][json].
+
+```c
+// Filename - test-json-parser.c
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <tree_sitter/api.h>
+
+// Declare the `tree_sitter_json` function, which is
+// implemented by the `tree-sitter-json` library.
+const TSLanguage *tree_sitter_json(void);
+
+int main() {
+  // Create a parser.
+  TSParser *parser = ts_parser_new();
+
+  // Set the parser's language (JSON in this case).
+ ts_parser_set_language(parser, tree_sitter_json()); + + // Build a syntax tree based on source code stored in a string. + const char *source_code = "[1, null]"; + TSTree *tree = ts_parser_parse_string( + parser, + NULL, + source_code, + strlen(source_code) + ); + + // Get the root node of the syntax tree. + TSNode root_node = ts_tree_root_node(tree); + + // Get some child nodes. + TSNode array_node = ts_node_named_child(root_node, 0); + TSNode number_node = ts_node_named_child(array_node, 0); + + // Check that the nodes have the expected types. + assert(strcmp(ts_node_type(root_node), "document") == 0); + assert(strcmp(ts_node_type(array_node), "array") == 0); + assert(strcmp(ts_node_type(number_node), "number") == 0); + + // Check that the nodes have the expected child counts. + assert(ts_node_child_count(root_node) == 1); + assert(ts_node_child_count(array_node) == 5); + assert(ts_node_named_child_count(array_node) == 2); + assert(ts_node_child_count(number_node) == 0); + + // Print the syntax tree as an S-expression. + char *string = ts_node_string(root_node); + printf("Syntax tree: %s\n", string); + + // Free all of the heap-allocated memory. + free(string); + ts_tree_delete(tree); + ts_parser_delete(parser); + return 0; +} +``` + +This program requires three components to build: + +1. The Tree-sitter C API from `tree-sitter/api.h` (requiring `tree-sitter/lib/include` in our include path) +2. The Tree-sitter library (`libtree-sitter.a`) +3. The JSON grammar's source code, which we compile directly into the binary + +```sh +clang \ + -I tree-sitter/lib/include \ + test-json-parser.c \ + tree-sitter-json/src/parser.c \ + tree-sitter/libtree-sitter.a \ + -o test-json-parser +./test-json-parser +``` + +When using dynamic linking, you'll need to ensure the shared library is discoverable through `LD_LIBRARY_PATH` or your system's +equivalent environment variable. 
Here's how to compile with dynamic linking: + +```sh +clang \ + -I tree-sitter/lib/include \ + test-json-parser.c \ + tree-sitter-json/src/parser.c \ + -ltree-sitter \ + -o test-json-parser +./test-json-parser +``` + +[creating parsers]: ../creating-parsers/index.md +[json]: https://github.com/tree-sitter/tree-sitter-json +[ts org]: https://github.com/tree-sitter +[tsg org]: https://github.com/tree-sitter-grammars diff --git a/docs/src/using-parsers/2-basic-parsing.md b/docs/src/using-parsers/2-basic-parsing.md new file mode 100644 index 00000000..17e77324 --- /dev/null +++ b/docs/src/using-parsers/2-basic-parsing.md @@ -0,0 +1,187 @@ +# Basic Parsing + +## Providing the Code + +In the example on the previous page, we parsed source code stored in a simple string using the `ts_parser_parse_string` function: + +```c +TSTree *ts_parser_parse_string( + TSParser *self, + const TSTree *old_tree, + const char *string, + uint32_t length +); +``` + +You may want to parse source code that's stored in a custom data structure, like a [piece table][piece table] or a [rope][rope]. +In this case, you can use the more general `ts_parser_parse` function: + +```c +TSTree *ts_parser_parse( + TSParser *self, + const TSTree *old_tree, + TSInput input +); +``` + +The `TSInput` structure lets you provide your own function for reading a chunk of text at a given byte offset and row/column +position. The function can return text encoded in either UTF-8 or UTF-16. This interface allows you to efficiently parse +text that is stored in your own data structure. + +```c +typedef struct { + void *payload; + const char *(*read)( + void *payload, + uint32_t byte_offset, + TSPoint position, + uint32_t *bytes_read + ); + TSInputEncoding encoding; + DecodeFunction decode; +} TSInput; +``` + +If you want to decode text that is not encoded in UTF-8 or UTF-16, you can set the `decode` field of the input to your function +that will decode text. 
The signature of the `DecodeFunction` is as follows: + +```c +typedef uint32_t (*DecodeFunction)( + const uint8_t *string, + uint32_t length, + int32_t *code_point +); +``` + +> Note that the `TSInputEncoding` must be set to `TSInputEncodingCustom` for the `decode` function to be called. + +The `string` argument is a pointer to the text to decode, which comes from the `read` function, and the `length` argument +is the length of the `string`. The `code_point` argument is a pointer to an integer that represents the decoded code point, +and should be written to in your `decode` callback. The function should return the number of bytes decoded. + +## Syntax Nodes + +Tree-sitter provides a [DOM][dom]-style interface for inspecting syntax trees. +A syntax node's _type_ is a string that indicates which grammar rule the node represents. + +```c +const char *ts_node_type(TSNode); +``` + +Syntax nodes store their position in the source code both in raw bytes and row/column coordinates: + +```c +uint32_t ts_node_start_byte(TSNode); +uint32_t ts_node_end_byte(TSNode); +typedef struct { + uint32_t row; + uint32_t column; +} TSPoint; +TSPoint ts_node_start_point(TSNode); +TSPoint ts_node_end_point(TSNode); +``` + +## Retrieving Nodes + +Every tree has a _root node_: + +```c +TSNode ts_tree_root_node(const TSTree *); +``` + +Once you have a node, you can access the node's children: + +```c +uint32_t ts_node_child_count(TSNode); +TSNode ts_node_child(TSNode, uint32_t); +``` + +You can also access its siblings and parent: + +```c +TSNode ts_node_next_sibling(TSNode); +TSNode ts_node_prev_sibling(TSNode); +TSNode ts_node_parent(TSNode); +``` + +These methods may all return a _null node_ to indicate, for example, that a node does not _have_ a next sibling. 
+You can check if a node is null: + +```c +bool ts_node_is_null(TSNode); +``` + +## Named vs Anonymous Nodes + +Tree-sitter produces [_concrete_ syntax trees][cst] — trees that contain nodes for +every individual token in the source code, including things like commas and parentheses. This is important for use-cases +that deal with individual tokens, like [syntax highlighting][syntax highlighting]. But some +types of code analysis are easier to perform using an [_abstract_ syntax tree][ast] — a tree in which the less important +details have been removed. Tree-sitter's trees support these use cases by making a distinction between +_named_ and _anonymous_ nodes. + +Consider a grammar rule like this: + +```js +if_statement: $ => seq("if", "(", $._expression, ")", $._statement); +``` + +A syntax node representing an `if_statement` in this language would have 5 children: the condition expression, the body statement, +as well as the `if`, `(`, and `)` tokens. The expression and the statement would be marked as _named_ nodes, because they +have been given explicit names in the grammar. But the `if`, `(`, and `)` nodes would _not_ be named nodes, because they +are represented in the grammar as simple strings. + +You can check whether any given node is named: + +```c +bool ts_node_is_named(TSNode); +``` + +When traversing the tree, you can also choose to skip over anonymous nodes by using the `_named_` variants of all of the +methods described above: + +```c +TSNode ts_node_named_child(TSNode, uint32_t); +uint32_t ts_node_named_child_count(TSNode); +TSNode ts_node_next_named_sibling(TSNode); +TSNode ts_node_prev_named_sibling(TSNode); +``` + +If you use this group of methods, the syntax tree functions much like an abstract syntax tree. + +## Node Field Names + +To make syntax nodes easier to analyze, many grammars assign unique _field names_ to particular child nodes. +In the [creating parsers][using fields] section, it's explained how to do this in your own grammars. 
If a syntax node has
+fields, you can access its children using their field name:
+
+```c
+TSNode ts_node_child_by_field_name(
+  TSNode self,
+  const char *field_name,
+  uint32_t field_name_length
+);
+```
+
+Fields also have numeric ids that you can use, if you want to avoid repeated string comparisons. You can convert between
+strings and ids using the `TSLanguage`:
+
+```c
+uint32_t ts_language_field_count(const TSLanguage *);
+const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId);
+TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t);
+```
+
+The field ids can be used in place of the name:
+
+```c
+TSNode ts_node_child_by_field_id(TSNode, TSFieldId);
+```
+
+[ast]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
+[cst]: https://en.wikipedia.org/wiki/Parse_tree
+[dom]: https://en.wikipedia.org/wiki/Document_Object_Model
+[piece table]: https://en.wikipedia.org/wiki/Piece_table
+[rope]: https://en.wikipedia.org/wiki/Rope_(data_structure)
+[syntax highlighting]: https://en.wikipedia.org/wiki/Syntax_highlighting
+[using fields]: ../creating-parsers/3-writing-the-grammar.md#using-fields
diff --git a/docs/src/using-parsers/3-advanced-parsing.md b/docs/src/using-parsers/3-advanced-parsing.md
new file mode 100644
index 00000000..dbab046b
--- /dev/null
+++ b/docs/src/using-parsers/3-advanced-parsing.md
@@ -0,0 +1,161 @@
+# Advanced Parsing
+
+## Editing
+
+In applications like text editors, you often need to re-parse a file after its source code has changed. Tree-sitter is designed
+to support this use case efficiently. There are two steps required. First, you must _edit_ the syntax tree, which adjusts
+the ranges of its nodes so that they stay in sync with the code.
+
+```c
+typedef struct {
+  uint32_t start_byte;
+  uint32_t old_end_byte;
+  uint32_t new_end_byte;
+  TSPoint start_point;
+  TSPoint old_end_point;
+  TSPoint new_end_point;
+} TSInputEdit;
+
+void ts_tree_edit(TSTree *, const TSInputEdit *);
+```
+
+Then, you can call `ts_parser_parse` again, passing in the old tree.
This will create a new tree that internally shares structure +with the old tree. + +When you edit a syntax tree, the positions of its nodes will change. If you have stored any `TSNode` instances outside of +the `TSTree`, you must update their positions separately, using the same `TSInput` value, in order to update their +cached positions. + +```c +void ts_node_edit(TSNode *, const TSInputEdit *); +``` + +This `ts_node_edit` function is _only_ needed in the case where you have retrieved `TSNode` instances _before_ editing the +tree, and then _after_ editing the tree, you want to continue to use those specific node instances. Often, you'll just want +to re-fetch nodes from the edited tree, in which case `ts_node_edit` is not needed. + +## Multi-language Documents + +Sometimes, different parts of a file may be written in different languages. For example, templating languages like [EJS][ejs] +and [ERB][erb] allow you to generate HTML by writing a mixture of HTML and another language like JavaScript or Ruby. + +Tree-sitter handles these types of documents by allowing you to create a syntax tree based on the text in certain +_ranges_ of a file. + +```c +typedef struct { + TSPoint start_point; + TSPoint end_point; + uint32_t start_byte; + uint32_t end_byte; +} TSRange; + +void ts_parser_set_included_ranges( + TSParser *self, + const TSRange *ranges, + uint32_t range_count +); +``` + +For example, consider this ERB document: + +```erb +
+<ul>
+  <% people.each do |person| %>
+    <li><%= person.name %></li>
+  <% end %>
+</ul>
+``` + +Conceptually, it can be represented by three syntax trees with overlapping ranges: an ERB syntax tree, a Ruby syntax tree, +and an HTML syntax tree. You could generate these syntax trees with the following code: + +```c +#include +#include + +// These functions are each implemented in their own repo. +const TSLanguage *tree_sitter_embedded_template(void); +const TSLanguage *tree_sitter_html(void); +const TSLanguage *tree_sitter_ruby(void); + +int main(int argc, const char **argv) { + const char *text = argv[1]; + unsigned len = strlen(text); + + // Parse the entire text as ERB. + TSParser *parser = ts_parser_new(); + ts_parser_set_language(parser, tree_sitter_embedded_template()); + TSTree *erb_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode erb_root_node = ts_tree_root_node(erb_tree); + + // In the ERB syntax tree, find the ranges of the `content` nodes, + // which represent the underlying HTML, and the `code` nodes, which + // represent the interpolated Ruby. + TSRange html_ranges[10]; + TSRange ruby_ranges[10]; + unsigned html_range_count = 0; + unsigned ruby_range_count = 0; + unsigned child_count = ts_node_child_count(erb_root_node); + + for (unsigned i = 0; i < child_count; i++) { + TSNode node = ts_node_child(erb_root_node, i); + if (strcmp(ts_node_type(node), "content") == 0) { + html_ranges[html_range_count++] = (TSRange) { + ts_node_start_point(node), + ts_node_end_point(node), + ts_node_start_byte(node), + ts_node_end_byte(node), + }; + } else { + TSNode code_node = ts_node_named_child(node, 0); + ruby_ranges[ruby_range_count++] = (TSRange) { + ts_node_start_point(code_node), + ts_node_end_point(code_node), + ts_node_start_byte(code_node), + ts_node_end_byte(code_node), + }; + } + } + + // Use the HTML ranges to parse the HTML. 
+ ts_parser_set_language(parser, tree_sitter_html()); + ts_parser_set_included_ranges(parser, html_ranges, html_range_count); + TSTree *html_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode html_root_node = ts_tree_root_node(html_tree); + + // Use the Ruby ranges to parse the Ruby. + ts_parser_set_language(parser, tree_sitter_ruby()); + ts_parser_set_included_ranges(parser, ruby_ranges, ruby_range_count); + TSTree *ruby_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode ruby_root_node = ts_tree_root_node(ruby_tree); + + // Print all three trees. + char *erb_sexp = ts_node_string(erb_root_node); + char *html_sexp = ts_node_string(html_root_node); + char *ruby_sexp = ts_node_string(ruby_root_node); + printf("ERB: %s\n", erb_sexp); + printf("HTML: %s\n", html_sexp); + printf("Ruby: %s\n", ruby_sexp); + return 0; +} +``` + +This API allows for great flexibility in how languages can be composed. Tree-sitter is not responsible for mediating the +interactions between languages. Instead, you are free to do that using arbitrary application-specific logic. + +## Concurrency + +Tree-sitter supports multi-threaded use cases by making syntax trees very cheap to copy. + +```c +TSTree *ts_tree_copy(const TSTree *); +``` + +Internally, copying a syntax tree just entails incrementing an atomic reference count. Conceptually, it provides you a new +tree which you can freely query, edit, reparse, or delete on a new thread while continuing to use the original tree on a +different thread. Note that individual `TSTree` instances are _not_ thread safe; you must copy a tree if you want to use +it on multiple threads simultaneously. 
+ +[ejs]: https://ejs.co +[erb]: https://ruby-doc.org/stdlib-2.5.1/libdoc/erb/rdoc/ERB.html diff --git a/docs/src/using-parsers/4-walking-trees.md b/docs/src/using-parsers/4-walking-trees.md new file mode 100644 index 00000000..33da38e1 --- /dev/null +++ b/docs/src/using-parsers/4-walking-trees.md @@ -0,0 +1,42 @@ +# Walking Trees with Tree Cursors + +You can access every node in a syntax tree using the `TSNode` APIs [described earlier][retrieving nodes], but if you need +to access a large number of nodes, the fastest way to do so is with a _tree cursor_. A cursor is a stateful object that +allows you to walk a syntax tree with maximum efficiency. + +
+ +Note that the given input node is considered the root of the cursor, and the cursor cannot walk outside this node. +Going to the parent or any sibling of the root node will always return `false`. + +This has no unexpected effects if the given input node is the actual `root` node of the tree, but is something to keep in +mind when using cursors constructed with a node that is not the `root` node. +
+ +You can initialize a cursor from any node: + +```c +TSTreeCursor ts_tree_cursor_new(TSNode); +``` + +You can move the cursor around the tree: + +```c +bool ts_tree_cursor_goto_first_child(TSTreeCursor *); +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); +bool ts_tree_cursor_goto_parent(TSTreeCursor *); +``` + +These methods return `true` if the cursor successfully moved and `false` if there was no node to move to. + +You can always retrieve the cursor's current node, as well as the [field name][node-field-names] that is associated with +the current node. + +```c +TSNode ts_tree_cursor_current_node(const TSTreeCursor *); +const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); +TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *); +``` + +[retrieving nodes]: ./2-basic-parsing.md#retrieving-nodes +[node-field-names]: ./2-basic-parsing.md#node-field-names diff --git a/docs/src/using-parsers/6-static-node-types.md b/docs/src/using-parsers/6-static-node-types.md new file mode 100644 index 00000000..5976d0bc --- /dev/null +++ b/docs/src/using-parsers/6-static-node-types.md @@ -0,0 +1,162 @@ +# Static Node Types + +In languages with static typing, it can be helpful for syntax trees to provide specific type information about individual +syntax nodes. Tree-sitter makes this information available via a generated file called `node-types.json`. This _node types_ +file provides structured data about every possible syntax node in a grammar. + +You can use this data to generate type declarations in statically-typed programming languages. + +The node types file contains an array of objects, each of which describes a particular type of syntax node using the +following entries: + +## Basic Info + +Every object in this array has these two entries: + +- `"type"` — A string that indicates, which grammar rule the node represents. This corresponds to the `ts_node_type` function +described [here][syntax nodes]. 
+- `"named"` — A boolean that indicates whether this kind of node corresponds to a rule name in the grammar or just a string
+literal. See [here][named-vs-anonymous-nodes] for more info.
+
+Examples:
+
+```json
+{
+  "type": "string_literal",
+  "named": true
+}
+{
+  "type": "+",
+  "named": false
+}
+```
+
+Together, these two fields constitute a unique identifier for a node type; no two top-level objects in the `node-types.json`
+should have the same values for both `"type"` and `"named"`.
+
+## Internal Nodes
+
+Many syntax nodes can have _children_. The node type object describes the possible children that a node can have using the
+following entries:
+
+- `"fields"` — An object that describes the possible [fields][node-field-names] that the node can have. The keys of this
+object are field names, and the values are _child type_ objects, described below.
+- `"children"` — Another _child type_ object that describes all the node's possible _named_ children _without_ fields.
+
+A _child type_ object describes a set of child nodes using the following entries:
+
+- `"required"` — A boolean indicating whether there is always _at least one_ node in this set.
+- `"multiple"` — A boolean indicating whether there can be _multiple_ nodes in this set.
+- `"types"` — An array of objects that represent the possible types of nodes in this set. Each object has two keys: `"type"`
+and `"named"`, whose meanings are described above.
+ +Example with fields: + +```json +{ + "type": "method_definition", + "named": true, + "fields": { + "body": { + "multiple": false, + "required": true, + "types": [{ "type": "statement_block", "named": true }] + }, + "decorator": { + "multiple": true, + "required": false, + "types": [{ "type": "decorator", "named": true }] + }, + "name": { + "multiple": false, + "required": true, + "types": [ + { "type": "computed_property_name", "named": true }, + { "type": "property_identifier", "named": true } + ] + }, + "parameters": { + "multiple": false, + "required": true, + "types": [{ "type": "formal_parameters", "named": true }] + } + } +} +``` + +Example with children: + +```json +{ + "type": "array", + "named": true, + "fields": {}, + "children": { + "multiple": true, + "required": false, + "types": [ + { "type": "_expression", "named": true }, + { "type": "spread_element", "named": true } + ] + } +} +``` + +## Supertype Nodes + +In Tree-sitter grammars, there are usually certain rules that represent abstract _categories_ of syntax nodes (e.g. "expression", +"type", "declaration"). In the `grammar.js` file, these are often written as [hidden rules][hidden rules] +whose definition is a simple [`choice`][grammar dsl] where each member is just a single symbol. + +Normally, hidden rules are not mentioned in the node types file, since they don't appear in the syntax tree. But if you add +a hidden rule to the grammar's [`supertypes` list][grammar dsl], then it _will_ show up in the node +types file, with the following special entry: + +- `"subtypes"` — An array of objects that specify the _types_ of nodes that this 'supertype' node can wrap. 
+ +Example: + +```json +{ + "type": "_declaration", + "named": true, + "subtypes": [ + { "type": "class_declaration", "named": true }, + { "type": "function_declaration", "named": true }, + { "type": "generator_function_declaration", "named": true }, + { "type": "lexical_declaration", "named": true }, + { "type": "variable_declaration", "named": true } + ] +} +``` + +Supertype nodes will also appear elsewhere in the node types file, as children of other node types, in a way that corresponds +with how the supertype rule was used in the grammar. This can make the node types much shorter and easier to read, because +a single supertype will take the place of multiple subtypes. + +Example: + +```json +{ + "type": "export_statement", + "named": true, + "fields": { + "declaration": { + "multiple": false, + "required": false, + "types": [{ "type": "_declaration", "named": true }] + }, + "source": { + "multiple": false, + "required": false, + "types": [{ "type": "string", "named": true }] + } + } +} +``` + +[grammar dsl]: ../creating-parsers/2-the-grammar-dsl.md +[hidden rules]: ../creating-parsers/3-writing-the-grammar.md#hiding-rules +[named-vs-anonymous-nodes]: ./2-basic-parsing.md#named-vs-anonymous-nodes +[node-field-names]: ./2-basic-parsing.md#node-field-names +[syntax nodes]: ./2-basic-parsing.md#syntax-nodes diff --git a/docs/src/using-parsers/index.md b/docs/src/using-parsers/index.md new file mode 100644 index 00000000..48d61599 --- /dev/null +++ b/docs/src/using-parsers/index.md @@ -0,0 +1,27 @@ +# Using Parsers + +This guide covers the fundamental concepts of using Tree-sitter, which is applicable across all programming languages. +Although we'll explore some C-specific details that are valuable for direct C API usage or creating new language bindings, +the core concepts remain the same. 
+ +Tree-sitter's parsing functionality is implemented through its C API, with all functions documented in the [tree_sitter/api.h][api.h] +header file, but if you're working in another language, you can use one of the following bindings found [here](../index.md#language-bindings), +each providing idiomatic access to Tree-sitter's functionality. Of these bindings, the official ones have their own API docs +hosted online at the following pages: + +- [Go][go] +- [Java] +- [JavaScript (Node.js)][javascript] +- [Kotlin][kotlin] +- [Python][python] +- [Rust][rust] +- [Zig][zig] + +[api.h]: https://github.com/tree-sitter/tree-sitter/blob/master/lib/include/tree_sitter/api.h +[go]: https://pkg.go.dev/github.com/tree-sitter/go-tree-sitter +[java]: https://tree-sitter.github.io/java-tree-sitter +[javascript]: https://tree-sitter.github.io/node-tree-sitter +[kotlin]: https://tree-sitter.github.io/kotlin-tree-sitter +[python]: https://tree-sitter.github.io/py-tree-sitter +[rust]: https://docs.rs/tree-sitter +[zig]: https://tree-sitter.github.io/zig-tree-sitter diff --git a/docs/src/using-parsers/queries/1-syntax.md b/docs/src/using-parsers/queries/1-syntax.md new file mode 100644 index 00000000..5edd0047 --- /dev/null +++ b/docs/src/using-parsers/queries/1-syntax.md @@ -0,0 +1,101 @@ +# Query Syntax + +A _query_ consists of one or more _patterns_, where each pattern is an [S-expression][s-exp] that matches a certain set of +nodes in a syntax tree. The expression to match a given node consists of a pair of parentheses containing two things: the +node's type, and optionally, a series of other S-expressions that match the node's children. For example, this pattern would +match any `binary_expression` node whose children are both `number_literal` nodes: + +```query +(binary_expression (number_literal) (number_literal)) +``` + +Children can also be omitted. 
For example, this would match any `binary_expression` where at least _one_ child is a
+`string_literal` node:
+
+```query
+(binary_expression (string_literal))
+```
+
+## Fields
+
+In general, it's a good idea to make patterns more specific by specifying [field names][node-field-names] associated with
+child nodes. You do this by prefixing a child pattern with a field name followed by a colon. For example, this pattern would
+match an `assignment_expression` node where the `left` child is a `member_expression` whose `object` is a `call_expression`.
+
+```query
+(assignment_expression
+  left: (member_expression
+    object: (call_expression)))
+```
+
+## Negated Fields
+
+You can also constrain a pattern so that it only matches nodes that _lack_ a certain field. To do this, add a field name
+prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters:
+
+```query
+(class_declaration
+  name: (identifier) @class_name
+  !type_parameters)
+```
+
+## Anonymous Nodes
+
+The parenthesized syntax for writing nodes only applies to [named nodes][named-vs-anonymous-nodes]. To match specific anonymous
+nodes, you write their name between double quotes. For example, this pattern would match any `binary_expression` where the
+operator is `!=` and the right side is `null`:
+
+```query
+(binary_expression
+  operator: "!="
+  right: (null))
+```
+
+## Special Nodes
+
+### The Wildcard Node
+
+A wildcard node is represented with an underscore (`_`); it matches any node.
+This is similar to `.` in regular expressions.
+There are two types: `(_)` will match any named node,
+and `_` will match any named or anonymous node.
+
+For example, this pattern would match any node inside a call:
+
+```query
+(call (_) @call.inner)
+```
+
+### The `ERROR` Node
+
+When the parser encounters text it does not recognize, it represents this node
+as `(ERROR)` in the syntax tree.
These error nodes can be queried just like
+normal nodes:
+
+```query
+(ERROR) @error-node
+```
+
+### The `MISSING` Node
+
+If the parser is able to recover from erroneous text by inserting a missing token and then reducing, it will insert that
+missing node in the final tree so long as that tree has the lowest error cost. These missing nodes appear as seemingly normal
+nodes in the tree, but they are zero tokens wide, and are internally represented as a property of the actual terminal node
+that was inserted, instead of being its own kind of node, like the `ERROR` node. These special missing nodes can be queried
+using `(MISSING)`:
+
+```query
+(MISSING) @missing-node
+```
+
+This is useful when attempting to detect all syntax errors in a given parse tree, since these missing nodes are not captured
+by `(ERROR)` queries. Specific missing node types can also be queried:
+
+```query
+(MISSING identifier) @missing-identifier
+(MISSING ";") @missing-semicolon
+```
+
+[node-field-names]: ../2-basic-parsing.md#node-field-names
+[named-vs-anonymous-nodes]: ../2-basic-parsing.md#named-vs-anonymous-nodes
+[s-exp]: https://en.wikipedia.org/wiki/S-expression
diff --git a/docs/src/using-parsers/queries/2-operators.md b/docs/src/using-parsers/queries/2-operators.md
new file mode 100644
index 00000000..6f9a8ca4
--- /dev/null
+++ b/docs/src/using-parsers/queries/2-operators.md
@@ -0,0 +1,151 @@
+# Operators
+
+## Capturing Nodes
+
+When matching patterns, you may want to process specific nodes within the pattern. Captures allow you to associate names
+with specific nodes in a pattern, so that you can later refer to those nodes by those names. Capture names are written _after_
+the nodes that they refer to, and start with an `@` character.
+ +For example, this pattern would match any assignment of a `function` to an `identifier`, and it would associate the name +`the-function-name` with the identifier: + +```query +(assignment_expression + left: (identifier) @the-function-name + right: (function)) +``` + +And this pattern would match all method definitions, associating the name `the-method-name` with the method name, `the-class-name` +with the containing class name: + +```query +(class_declaration + name: (identifier) @the-class-name + body: (class_body + (method_definition + name: (property_identifier) @the-method-name))) +``` + +## Quantification Operators + +You can match a repeating sequence of sibling nodes using the postfix `+` and `*` _repetition_ operators, which work analogously +to the `+` and `*` operators [in regular expressions][regex]. The `+` operator matches _one or more_ repetitions of a pattern, +and the `*` operator matches _zero or more_. + +For example, this pattern would match a sequence of one or more comments: + +```query +(comment)+ +``` + +This pattern would match a class declaration, capturing all of the decorators if any were present: + +```query +(class_declaration + (decorator)* @the-decorator + name: (identifier) @the-name) +``` + +You can also mark a node as optional using the `?` operator. For example, this pattern would match all function calls, capturing +a string argument if one was present: + +```query +(call_expression + function: (identifier) @the-function + arguments: (arguments (string)? @the-string-arg)) +``` + +## Grouping Sibling Nodes + +You can also use parentheses for grouping a sequence of _sibling_ nodes. For example, this pattern would match a comment +followed by a function declaration: + +```query +( + (comment) + (function_declaration) +) +``` + +Any of the quantification operators mentioned above (`+`, `*`, and `?`) can also be applied to groups. 
For example, this +pattern would match a comma-separated series of numbers: + +```query +( + (number) + ("," (number))* +) +``` + +## Alternations + +An alternation is written as a pair of square brackets (`[]`) containing a list of alternative patterns. +This is similar to _character classes_ from regular expressions (`[abc]` matches either a, b, or c). + +For example, this pattern would match a call to either a variable or an object property. +In the case of a variable, capture it as `@function`, and in the case of a property, capture it as `@method`: + +```query +(call_expression + function: [ + (identifier) @function + (member_expression + property: (property_identifier) @method) + ]) +``` + +This pattern would match a set of possible keyword tokens, capturing them as `@keyword`: + +```query +[ + "break" + "delete" + "else" + "for" + "function" + "if" + "return" + "try" + "while" +] @keyword +``` + +## Anchors + +The anchor operator, `.`, is used to constrain the ways in which child patterns are matched. It has different behaviors +depending on where it's placed inside a query. + +When `.` is placed before the _first_ child within a parent pattern, the child will only match when it is the first named +node in the parent. For example, the below pattern matches a given `array` node at most once, assigning the `@the-element` +capture to the first `identifier` node in the parent `array`: + +```query +(array . (identifier) @the-element) +``` + +Without this anchor, the pattern would match once for every identifier in the array, with `@the-element` bound +to each matched identifier. + +Similarly, an anchor placed after a pattern's _last_ child will cause that child pattern to only match nodes that are the +last named child of their parent. The below pattern matches only nodes that are the last named child within a `block`. + +```query +(block (_) @last-expression .) 
+``` + +Finally, an anchor _between_ two child patterns will cause the patterns to only match nodes that are immediate siblings. +The pattern below, given a long dotted name like `a.b.c.d`, will only match pairs of consecutive identifiers: +`a, b`, `b, c`, and `c, d`. + +```query +(dotted_name + (identifier) @prev-id + . + (identifier) @next-id) +``` + +Without the anchor, non-consecutive pairs like `a, c` and `b, d` would also be matched. + +The restrictions placed on a pattern by an anchor operator ignore anonymous nodes. + +[regex]: https://en.wikipedia.org/wiki/Regular_expression#Basic_concepts diff --git a/docs/src/using-parsers/queries/3-predicates-and-directives.md b/docs/src/using-parsers/queries/3-predicates-and-directives.md new file mode 100644 index 00000000..23244969 --- /dev/null +++ b/docs/src/using-parsers/queries/3-predicates-and-directives.md @@ -0,0 +1,199 @@ +# Predicates + +You can also specify arbitrary metadata and conditions associated with a pattern +by adding _predicate_ S-expressions anywhere within your pattern. Predicate S-expressions +start with a _predicate name_ beginning with a `#` character, and ending with a `?` character. After that, they can +contain an arbitrary number of `@`-prefixed capture names or strings. + +Tree-sitter's CLI supports the following predicates by default: + +## The `eq?` predicate + +This family of predicates allows you to match against a single capture or string +value. + +The first argument to this predicate must be a capture, but the second can be either a capture to +compare the two captures' text, or a string to compare first capture's text +against. + +The base predicate is `#eq?`, but its complement, `#not-eq?`, can be used to _not_ +match a value. Additionally, you can prefix either of these with `any-` to match +if _any_ of the nodes match the predicate. 
This is only useful when dealing with +quantified captures, as by default a quantified capture will only match if _all_ the captured nodes match the predicate. + +Thus, there are four predicates in total: + +- `#eq?` +- `#not-eq?` +- `#any-eq?` +- `#any-not-eq?` + +Consider the following example targeting C: + +```query +((identifier) @variable.builtin + (#eq? @variable.builtin "self")) +``` + +This pattern would match any identifier that is `self`. + +Now consider the following example: + +```query +( + (pair + key: (property_identifier) @key-name + value: (identifier) @value-name) + (#eq? @key-name @value-name) +) +``` + +This pattern would match key-value pairs where the `value` is an identifier +with the same text as the key (meaning they are the same): + +As mentioned earlier, the `any-` prefix is meant for use with quantified captures. Here's +an example finding an empty comment within a group of comments: + +```query +((comment)+ @comment.empty + (#any-eq? @comment.empty "//")) +``` + +## The `match?` predicate + +These predicates are similar to the `eq?` predicates, but they use regular expressions +to match against the capture's text instead of string comparisons. + +The first argument must be a capture, and the second must be a string containing +a regular expression. + +Like the `eq?` predicate family, we can tack on `not-` to the beginning of the predicate +to negate the match, and `any-` to match if _any_ of the nodes in a quantified capture match the predicate. + +This pattern matches identifiers written in `SCREAMING_SNAKE_CASE`. + +```query +((identifier) @constant + (#match? @constant "^[A-Z][A-Z_]+")) +``` + +This query identifies documentation comments in C that begin with three forward slashes (`///`). + +```query +((comment)+ @comment.documentation + (#match? @comment.documentation "^///\\s+.*")) +``` + +This query finds C code embedded in Go comments that appear just before a "C" import statement. 
+These are known as [`Cgo`][cgo] comments and are used to inject C code into Go programs. + +```query +((comment)+ @injection.content + . + (import_declaration + (import_spec path: (interpreted_string_literal) @_import_c)) + (#eq? @_import_c "\"C\"") + (#match? @injection.content "^//")) +``` + +## The `any-of?` predicate + +The `any-of?` predicate allows you to match a capture against multiple strings, +and will match if the capture's text is equal to any of the strings. + +The query below will match any of the builtin variables in JavaScript. + +```query +((identifier) @variable.builtin + (#any-of? @variable.builtin + "arguments" + "module" + "console" + "window" + "document")) +``` + +## The `is?` predicate + +The `is?` predicate allows you to assert that a capture has a given property. This isn't widely used, but the CLI uses it +to determine whether a given node is a local variable or not, for example: + +```query +((identifier) @variable.builtin + (#match? @variable.builtin "^(arguments|module|console|window|document)$") + (#is-not? local)) +``` + +This pattern would match any builtin variable that is not a local variable, because the `#is-not? local` predicate is used. + +# Directives + +Similar to predicates, directives are a way to associate arbitrary metadata with a pattern. The only difference between predicates +and directives is that directives end in a `!` character instead of `?` character. + +Tree-sitter's CLI supports the following directives by default: + +## The `set!` directive + +This directive allows you to associate key-value pairs with a pattern. The key and value can be any arbitrary text that you +see fit. + +```query +((comment) @injection.content + (#lua-match? @injection.content "/[*\/][!*\/]