diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c deleted file mode 100644 index 5c409639..00000000 --- a/spec/fixtures/external_scanners/extra_external_tokens.c +++ /dev/null @@ -1,42 +0,0 @@ -#include - -enum { - COMMENT, -}; - -void *tree_sitter_extra_external_tokens_external_scanner_create() { - return NULL; -} - -void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) { -} - -bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} - -void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} - -bool tree_sitter_extra_external_tokens_external_scanner_scan( - void *payload, TSLexer *lexer, const bool *whitelist) { - - while (lexer->lookahead == ' ') { - lexer->advance(lexer, true); - } - - if (lexer->lookahead == '#') { - lexer->advance(lexer, false); - while (lexer->lookahead != '\n') { - lexer->advance(lexer, false); - } - - lexer->result_symbol = COMMENT; - return true; - } - - return false; -} - -void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) { -} diff --git a/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt new file mode 100644 index 00000000..06a7bf0b --- /dev/null +++ b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt @@ -0,0 +1,32 @@ +================================================ +anonymous tokens defined with character classes +================================================ +1234 +--- + +(first_rule) + +================================================= +anonymous tokens defined with LF escape sequence +================================================= + + +--- + +(first_rule) + +================================================= +anonymous tokens defined with CR 
escape sequence +================================================= + +--- + +(first_rule) + +================================================ +anonymous tokens with quotes +================================================ +'hello' +--- + +(first_rule) diff --git a/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json new file mode 100644 index 00000000..d2613776 --- /dev/null +++ b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json @@ -0,0 +1,14 @@ +{ + "name": "anonymous_tokens_with_escaped_chars", + "rules": { + "first_rule": { + "type": "CHOICE", + "members": [ + {"type": "STRING", "value": "\n"}, + {"type": "STRING", "value": "\r"}, + {"type": "STRING", "value": "'hello'"}, + {"type": "PATTERN", "value": "\\d+"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_left/corpus.txt b/spec/fixtures/test_grammars/associativity_left/corpus.txt new file mode 100644 index 00000000..4ab8e0db --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_left/corpus.txt @@ -0,0 +1,8 @@ +=================== +chained operations +=================== +x+y+z +--- +(expression (math_operation + (expression (math_operation (expression (identifier)) (expression (identifier)))) + (expression (identifier)))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_left/grammar.json b/spec/fixtures/test_grammars/associativity_left/grammar.json new file mode 100644 index 00000000..b1a25914 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_left/grammar.json @@ -0,0 +1,31 @@ +{ + "name": "associativity_left", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": 
[ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_missing/expected_error.txt b/spec/fixtures/test_grammars/associativity_missing/expected_error.txt new file mode 100644 index 00000000..f9cc955d --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_missing/expected_error.txt @@ -0,0 +1,13 @@ +Unresolved conflict for symbol sequence: + + expression '+' expression • '+' … + +Possible interpretations: + + 1: (math_operation expression '+' expression) • '+' … + 2: expression '+' (math_operation expression • '+' expression) + +Possible resolutions: + + 1: Specify a left or right associativity in `math_operation` + 2: Add a conflict for these rules: `math_operation` diff --git a/spec/fixtures/test_grammars/associativity_missing/grammar.json b/spec/fixtures/test_grammars/associativity_missing/grammar.json new file mode 100644 index 00000000..e5bd9d83 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_missing/grammar.json @@ -0,0 +1,27 @@ +{ + "name": "associativity_missing", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_right/corpus.txt b/spec/fixtures/test_grammars/associativity_right/corpus.txt new file mode 100644 index 00000000..280bbc31 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_right/corpus.txt @@ -0,0 +1,8 @@ 
+=================== +chained operations +=================== +x+y+z +--- +(expression (math_operation + (expression (identifier)) + (expression (math_operation (expression (identifier)) (expression (identifier)))))) diff --git a/spec/fixtures/test_grammars/associativity_right/grammar.json b/spec/fixtures/test_grammars/associativity_right/grammar.json new file mode 100644 index 00000000..80ce1ebb --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_right/grammar.json @@ -0,0 +1,31 @@ +{ + "name": "associativity_right", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "PREC_RIGHT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt b/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt new file mode 100644 index 00000000..a38dd8b5 --- /dev/null +++ b/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt @@ -0,0 +1,15 @@ +Unresolved conflict for symbol sequence: + + expression '+' expression • '*' … + +Possible interpretations: + + 1: (sum expression '+' expression) • '*' … + 2: expression '+' (product expression • '*' expression) + 3: expression '+' (other_thing expression • '*' '*') + +Possible resolutions: + + 1: Specify a higher precedence in `product` and `other_thing` than in the other rules. + 2: Specify a higher precedence in `sum` than in the other rules. 
+ 3: Add a conflict for these rules: `sum` `product` `other_thing` diff --git a/spec/fixtures/test_grammars/conflicting_precedence/grammar.json b/spec/fixtures/test_grammars/conflicting_precedence/grammar.json new file mode 100644 index 00000000..4e82de64 --- /dev/null +++ b/spec/fixtures/test_grammars/conflicting_precedence/grammar.json @@ -0,0 +1,58 @@ +{ + "name": "conflicting_precedence", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "other_thing"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "product": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "other_thing": { + "type": "PREC_LEFT", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "STRING", "value": "*"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt b/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt new file mode 100644 index 00000000..39b3d5fa --- /dev/null +++ b/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt @@ -0,0 +1,2 @@ +The rule `rule_2` matches the empty string. +Tree-sitter currently does not support syntactic rules that match the empty string. 
diff --git a/spec/fixtures/test_grammars/epsilon_rules/grammar.json b/spec/fixtures/test_grammars/epsilon_rules/grammar.json new file mode 100644 index 00000000..5be5b983 --- /dev/null +++ b/spec/fixtures/test_grammars/epsilon_rules/grammar.json @@ -0,0 +1,15 @@ +{ + "name": "epsilon_rules", + + "rules": { + "rule_1": {"type": "SYMBOL", "name": "rule_2"}, + + "rule_2": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "rule_1"}, + {"type": "BLANK"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt b/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt new file mode 100644 index 00000000..4d691420 --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt @@ -0,0 +1,41 @@ +========================================= +single-line statements - internal tokens +========================================= + +a b + +--- + +(statement (variable) (variable) (line_break)) + +========================================= +multi-line statements - internal tokens +========================================= + +a +b + +--- + +(statement (variable) (variable) (line_break)) + +========================================= +single-line statements - external tokens +========================================= + +'hello' 'world' + +--- + +(statement (string) (string) (line_break)) + +========================================= +multi-line statements - external tokens +========================================= + +'hello' +'world' + +--- + +(statement (string) (string) (line_break)) diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json b/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json new file mode 100644 index 00000000..f24e1c1c --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json @@ -0,0 +1,36 @@ +{ + "name": "external_and_internal_tokens", + + "externals": [ + "string", + 
"line_break" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"}, + "line_break": {"type": "STRING", "value": "\n"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md b/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md new file mode 100644 index 00000000..14ae934f --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md @@ -0,0 +1 @@ +This grammar has an external scanner whose `scan` method needs to be able to check for the validity of an *internal* token. This is done by including the name of that internal token (`line_break`) in the grammar's `externals` field. 
\ No newline at end of file diff --git a/spec/fixtures/external_scanners/shared_external_tokens.c b/spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c similarity index 62% rename from spec/fixtures/external_scanners/shared_external_tokens.c rename to spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c index 0bee00d8..4d0acd0a 100644 --- a/spec/fixtures/external_scanners/shared_external_tokens.c +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c @@ -1,4 +1,3 @@ -#include #include enum { @@ -6,21 +5,17 @@ enum { LINE_BREAK }; -void *tree_sitter_shared_external_tokens_external_scanner_create() { - return NULL; -} +void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; } -void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) { -} +void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {} -bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} +void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {} -void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} +bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } -bool tree_sitter_shared_external_tokens_external_scanner_scan( +void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_and_internal_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { // If a line-break is a valid lookahead token, only skip spaces. 
@@ -58,6 +53,3 @@ bool tree_sitter_shared_external_tokens_external_scanner_scan( return false; } - -void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) { -} diff --git a/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt b/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt new file mode 100644 index 00000000..ceac4b8a --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt @@ -0,0 +1,10 @@ +======================== +extra external tokens +======================== + +x = # a comment +y + +--- + +(assignment (variable) (comment) (variable)) diff --git a/spec/fixtures/test_grammars/external_extra_tokens/grammar.json b/spec/fixtures/test_grammars/external_extra_tokens/grammar.json new file mode 100644 index 00000000..ed13b34a --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/grammar.json @@ -0,0 +1,25 @@ +{ + "name": "external_extra_tokens", + + "externals": [ + "comment" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + "assignment": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "variable"}, + {"type": "STRING", "value": "="}, + {"type": "SYMBOL", "name": "variable"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_extra_tokens/scanner.c b/spec/fixtures/test_grammars/external_extra_tokens/scanner.c new file mode 100644 index 00000000..4bd3e22e --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/scanner.c @@ -0,0 +1,36 @@ +#include + +enum { + COMMENT, +}; + +void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; } + +void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {} + +void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {} + +bool 
tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } + +void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_extra_tokens_external_scanner_scan( + void *payload, TSLexer *lexer, const bool *whitelist) { + + while (lexer->lookahead == ' ') { + lexer->advance(lexer, true); + } + + if (lexer->lookahead == '#') { + lexer->advance(lexer, false); + while (lexer->lookahead != '\n') { + lexer->advance(lexer, false); + } + + lexer->result_symbol = COMMENT; + return true; + } + + return false; +} + diff --git a/spec/fixtures/test_grammars/external_tokens/corpus.txt b/spec/fixtures/test_grammars/external_tokens/corpus.txt new file mode 100644 index 00000000..94153c16 --- /dev/null +++ b/spec/fixtures/test_grammars/external_tokens/corpus.txt @@ -0,0 +1,22 @@ +======================== +simple external tokens +========================= + +x + %(sup (external) scanner?) 
+ +--- + +(expression (sum (expression (identifier)) (expression (string)))) + +================================== +external tokens that require state +================================== + +%{sup {} #{x + y} {} scanner?} + +--- + +(expression (string + (expression (sum + (expression (identifier)) + (expression (identifier)))))) diff --git a/spec/fixtures/test_grammars/external_tokens/grammar.json b/spec/fixtures/test_grammars/external_tokens/grammar.json new file mode 100644 index 00000000..8a175404 --- /dev/null +++ b/spec/fixtures/test_grammars/external_tokens/grammar.json @@ -0,0 +1,57 @@ +{ + "name": "external_tokens", + + "externals": [ + "_percent_string", + "_percent_string_start", + "_percent_string_end" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "string": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "_percent_string"}, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_percent_string_start"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "_percent_string_end"} + ] + }, + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "\\a+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/external_scanners/percent_strings.c b/spec/fixtures/test_grammars/external_tokens/scanner.c similarity index 80% rename from spec/fixtures/external_scanners/percent_strings.c rename to spec/fixtures/test_grammars/external_tokens/scanner.c index 9f68696e..7622e74d 100644 --- a/spec/fixtures/external_scanners/percent_strings.c +++ 
b/spec/fixtures/test_grammars/external_tokens/scanner.c @@ -1,4 +1,3 @@ -#include #include enum { @@ -13,7 +12,7 @@ typedef struct { uint32_t depth; } Scanner; -void *tree_sitter_external_scanner_example_external_scanner_create() { +void *tree_sitter_external_tokens_external_scanner_create() { Scanner *scanner = malloc(sizeof(Scanner)); *scanner = (Scanner){ .open_delimiter = 0, @@ -23,7 +22,17 @@ void *tree_sitter_external_scanner_example_external_scanner_create() { return scanner; } -bool tree_sitter_external_scanner_example_external_scanner_scan( +void tree_sitter_external_tokens_external_scanner_destroy(void *payload) { + free(payload); +} + +void tree_sitter_external_tokens_external_scanner_reset(void *payload) {} + +bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } + +void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { Scanner *scanner = payload; @@ -103,16 +112,3 @@ bool tree_sitter_external_scanner_example_external_scanner_scan( return false; } -void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) { -} - -bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} - -void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} - -void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) { - free(payload); -} diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt b/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt new file mode 100644 index 00000000..b1be0828 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt @@ -0,0 +1,15 
@@ +Unresolved conflict for symbol sequence: + + identifier • '{' … + +Possible interpretations: + + 1: (expression identifier) • '{' … + 2: (function_call identifier • block) + +Possible resolutions: + + 1: Specify a higher precedence in `function_call` than in the other rules. + 2: Specify a higher precedence in `expression` than in the other rules. + 3: Specify a left or right associativity in `expression` + 4: Add a conflict for these rules: `expression` `function_call` diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json new file mode 100644 index 00000000..19852708 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_missing", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md 
b/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md new file mode 100644 index 00000000..9db7345f --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md @@ -0,0 +1,14 @@ +This language has function calls similar to Ruby's, with no parentheses required, and optional blocks. + +There is a shift/reduce conflict here: + +``` +foo bar { baz } + ^ +``` + +The possible actions are: +1. `reduce(expression, 1)` - `bar` is an expression being passed to the `foo` function. +2. `shift` - `bar` is a function being called with the block `{ baz }` + +The grammars `precedence_on_single_child_negative` and `precedence_on_single_child_positive` show possible resolutions to this conflict. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt b/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt new file mode 100644 index 00000000..69678dae --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt @@ -0,0 +1,12 @@ +=========================== +function calls with blocks +=========================== + +foo bar { baz } + +--- + +(expression (function_call + (identifier) + (expression (identifier)) + (block (expression (identifier))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json new file mode 100644 index 00000000..fc237f54 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_negative", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + 
"value": -1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md b/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md new file mode 100644 index 00000000..5b2cd804 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md @@ -0,0 +1 @@ +This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a negative precedence. This causes reducing the `bar` variable to an expression to be preferred over shifting the `{` token as part of `function_call`. 
\ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt b/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt new file mode 100644 index 00000000..ee01d488 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt @@ -0,0 +1,13 @@ +=========================== +function calls with blocks +=========================== + +foo bar { baz } + +--- + +(expression (function_call + (identifier) + (expression (function_call + (identifier) + (block (expression (identifier))))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json new file mode 100644 index 00000000..7ffa73ed --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_positive", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + "value": 1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + 
"value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md b/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md new file mode 100644 index 00000000..3bb78e41 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md @@ -0,0 +1 @@ +This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a positive precedence. This causes shifting the `{` token as part of `function_call` to be preferred over reducing the `bar` variable to an expression. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt b/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt new file mode 100644 index 00000000..1b3666f6 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt @@ -0,0 +1,24 @@ +========================================== +curly brace blocks with high precedence +========================================== + +a b {} + +--- + +(expression (function_call + (identifier) + (expression (function_call (identifier) (block))))) + +========================================== +do blocks with low precedence +========================================== + +a b do end + +--- + +(expression (function_call + (identifier) + (expression (identifier)) + (do_block))) diff --git a/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json b/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json new file mode 100644 index 00000000..d05db765 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json @@ -0,0 +1,135 @@ +{ + "name": "precedence_on_subsequence", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": 
"function_call"}, + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "scope_resolution"} + ] + } + }, + + "function_call": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + + { + "type": "PREC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + } + }, + + { + "type": "PREC", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "do_block"} + ] + } + }, + + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + { + "type": "PREC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + } + ] + }, + + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + { + "type": "PREC", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "do_block"} + ] + } + } + ] + } + ] + }, + + "scope_resolution": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "::"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "::"}, + {"type": "SYMBOL", "name": "expression"}, + ] + } + ] + } + }, + + "block": { + "type": "STRING", + "value": "{}" + }, + + "do_block": { + "type": "STRING", + "value": "do end" + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme.md b/spec/fixtures/test_grammars/readme.md new file mode 100644 index 
00000000..a8f0449d --- /dev/null +++ b/spec/fixtures/test_grammars/readme.md @@ -0,0 +1,3 @@ +These small grammars demonstrate specific features or test for certain specific regressions. + +For some of them, compilation is expected to fail with a given error message. For others, the resulting parser is expected to produce certain trees. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme_grammar/corpus.txt b/spec/fixtures/test_grammars/readme_grammar/corpus.txt new file mode 100644 index 00000000..df339f20 --- /dev/null +++ b/spec/fixtures/test_grammars/readme_grammar/corpus.txt @@ -0,0 +1,13 @@ +================================== +the readme example +================================== + +a + b * c + +--- + +(expression (sum + (expression (variable)) + (expression (product + (expression (variable)) + (expression (variable)))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme_grammar/grammar.json b/spec/fixtures/test_grammars/readme_grammar/grammar.json new file mode 100644 index 00000000..fd496068 --- /dev/null +++ b/spec/fixtures/test_grammars/readme_grammar/grammar.json @@ -0,0 +1,67 @@ +{ + "name": "readme_grammar", + + // Things that can appear anywhere in the language, like comments + // and whitespace, are expressed as 'extras'. + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + + // The first rule listed in the grammar becomes the 'start rule'. 
+ "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "variable"}, + { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ")"} + ] + } + ] + }, + + // Tokens like '+' and '*' are described directly within the + // grammar's rules, as opposed to in a separate lexer description. + "sum": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + // Ambiguities can be resolved at compile time by assigning precedence + // values to rule subtrees. + "product": { + "type": "PREC_LEFT", + "value": 2, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + // Tokens can be specified using ECMAScript regexps. 
+ "number": {"type": "PATTERN", "value": "\\d+"}, + "comment": {"type": "PATTERN", "value": "#.*"}, + "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt b/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt new file mode 100644 index 00000000..2b028562 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt @@ -0,0 +1,7 @@ +======================== +the empty string +======================= + +--- + +(first_rule) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json b/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json new file mode 100644 index 00000000..94b6c6c4 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json @@ -0,0 +1,6 @@ +{ + "name": "start_rule_is_blank", + "rules": { + "first_rule": {"type": "BLANK"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt b/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt new file mode 100644 index 00000000..aaa4e20b --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt @@ -0,0 +1,6 @@ +=========================== +the single token +========================== +the-value +--- +(first_rule) diff --git a/spec/fixtures/test_grammars/start_rule_is_token/grammar.json b/spec/fixtures/test_grammars/start_rule_is_token/grammar.json new file mode 100644 index 00000000..9b60c0d4 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_token/grammar.json @@ -0,0 +1,6 @@ +{ + "name": "start_rule_is_token", + "rules": { + "first_rule": {"type": "STRING", "value": "the-value"} + } +} \ No newline at end of file diff --git a/spec/helpers/file_helpers.cc b/spec/helpers/file_helpers.cc new file mode 100644 index 00000000..3c08bec2 --- /dev/null +++ b/spec/helpers/file_helpers.cc @@ -0,0 +1,61 @@ +#include 
"helpers/file_helpers.h" +#include +#include +#include +#include + +using std::string; +using std::ifstream; +using std::istreambuf_iterator; +using std::ofstream; +using std::vector; + +bool file_exists(const string &path) { + struct stat file_stat; + return stat(path.c_str(), &file_stat) == 0; +} + +int get_modified_time(const string &path) { + struct stat file_stat; + if (stat(path.c_str(), &file_stat) != 0) { + if (errno != ENOENT) + fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str()); + return 0; + } + return file_stat.st_mtime; +} + +string read_file(const string &path) { + ifstream file(path); + istreambuf_iterator file_iterator(file), end_iterator; + string content(file_iterator, end_iterator); + file.close(); + return content; +} + +void write_file(const string &path, const string &content) { + ofstream file(path); + file << content; + file.close(); +} + +vector list_directory(const string &path) { + vector result; + + DIR *dir = opendir(path.c_str()); + if (!dir) { + printf("\nTest error - no such directory '%s'", path.c_str()); + return result; + } + + struct dirent *dir_entry; + while ((dir_entry = readdir(dir))) { + string name(dir_entry->d_name); + if (name != "." 
&& name != "..") { + result.push_back(name); + } + } + + closedir(dir); + return result; +} \ No newline at end of file diff --git a/spec/helpers/file_helpers.h b/spec/helpers/file_helpers.h new file mode 100644 index 00000000..c3d798ea --- /dev/null +++ b/spec/helpers/file_helpers.h @@ -0,0 +1,14 @@ +#ifndef HELPERS_FILE_HELPERS_H_ +#define HELPERS_FILE_HELPERS_H_ + +#include +#include +#include + +bool file_exists(const std::string &path); +int get_modified_time(const std::string &path); +std::string read_file(const std::string &path); +void write_file(const std::string &path, const std::string &content); +std::vector list_directory(const std::string &path); + +#endif // HELPERS_FILE_HELPERS_H_ diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index c59eca95..71829c5d 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -1,12 +1,12 @@ #include "spec_helper.h" #include "helpers/load_language.h" +#include "helpers/file_helpers.h" #include #include #include #include #include #include -#include #include #include #include "tree_sitter/compiler.h" @@ -54,25 +54,10 @@ static std::string run_command(const char *cmd, const char *args[]) { } } -static bool file_exists(const string &path) { - struct stat file_stat; - return stat(path.c_str(), &file_stat) == 0; -} - -static int get_modified_time(const string &path) { - struct stat file_stat; - if (stat(path.c_str(), &file_stat) != 0) { - if (errno != ENOENT) - fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str()); - return 0; - } - return file_stat.st_mtime; -} - -const TSLanguage *load_language(const string &source_filename, - const string &lib_filename, - const string &language_name, - string external_scanner_filename = "") { +static const TSLanguage *load_language(const string &source_filename, + const string &lib_filename, + const string &language_name, + string external_scanner_filename = "") { string language_function_name = "tree_sitter_" + 
language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); @@ -132,9 +117,9 @@ const TSLanguage *load_language(const string &source_filename, return reinterpret_cast(language_function)(); } -const TSLanguage *load_compile_result(const string &name, - const TSCompileResult &compile_result, - string external_scanner_path) { +const TSLanguage *load_test_language(const string &name, + const TSCompileResult &compile_result, + string external_scanner_path) { if (compile_result.error_type != TSCompileErrorTypeNone) { Assert::Failure(string("Compilation failed ") + compile_result.error_message); return nullptr; @@ -155,7 +140,7 @@ const TSLanguage *load_compile_result(const string &name, return language; } -const TSLanguage *get_test_language(const string &language_name) { +const TSLanguage *load_real_language(const string &language_name) { if (loaded_languages[language_name]) return loaded_languages[language_name]; @@ -182,20 +167,14 @@ const TSLanguage *get_test_language(const string &language_name) { if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) { printf("\n" "Regenerating the %s parser...\n", language_name.c_str()); - ifstream grammar_file(grammar_filename); - istreambuf_iterator grammar_file_iterator(grammar_file), end_iterator; - string grammar_json(grammar_file_iterator, end_iterator); - grammar_file.close(); - + string grammar_json = read_file(grammar_filename); TSCompileResult result = ts_compile_grammar(grammar_json.c_str()); if (result.error_type != TSCompileErrorTypeNone) { fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message); return nullptr; } - ofstream parser_file(parser_filename); - parser_file << result.code; - parser_file.close(); + write_file(parser_filename, result.code); } mkdir("out/tmp", 0777); diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h index 41d8b739..c34a33ca 100644 --- 
a/spec/helpers/load_language.h +++ b/spec/helpers/load_language.h @@ -5,8 +5,10 @@ #include "tree_sitter/runtime.h" #include -const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &, - std::string external_scanner_path = ""); -const TSLanguage *get_test_language(const std::string &language_name); +const TSLanguage *load_real_language(const std::string &name); + +const TSLanguage *load_test_language(const std::string &name, + const TSCompileResult &compile_result, + std::string external_scanner_path = ""); #endif // HELPERS_LOAD_LANGUAGE_H_ diff --git a/spec/helpers/read_test_entries.cc b/spec/helpers/read_test_entries.cc index 970b7c57..e743253f 100644 --- a/spec/helpers/read_test_entries.cc +++ b/spec/helpers/read_test_entries.cc @@ -1,20 +1,18 @@ #include "helpers/read_test_entries.h" +#include #include -#include -#include -#include - #include +#include "helpers/file_helpers.h" + using std::regex; using std::regex_search; using std::regex_replace; -using std::smatch; using std::regex_constants::extended; - +using std::smatch; using std::string; using std::vector; -using std::ifstream; -using std::istreambuf_iterator; + +string fixtures_dir = "spec/fixtures/"; static string trim_output(const string &input) { string result(input); @@ -27,7 +25,7 @@ static string trim_output(const string &input) { static vector parse_test_entries(string content) { regex header_pattern("===+\n" "([^=]+)\n" "===+\n", extended); - regex separator_pattern("---+\n", extended); + regex separator_pattern("---+\r?\n", extended); vector descriptions; vector bodies; @@ -55,51 +53,42 @@ static vector parse_test_entries(string content) { body.substr(0, matches.position() - 1), trim_output(body.substr(matches.position() + matches[0].length())) }); + } else { + puts(("Invalid corpus entry with description: " + descriptions[i]).c_str()); + abort(); } } return result; } -static vector list_directory(string dir_name) { - vector result; - - DIR *dir = 
opendir(dir_name.c_str()); - if (!dir) { - printf("\nTest error - no such directory '%s'", dir_name.c_str()); - return result; - } - - struct dirent *dir_entry; - while ((dir_entry = readdir(dir))) { - string name(dir_entry->d_name); - if (name != "." && name != "..") - result.push_back(dir_name + "/" + name); - } - - closedir(dir); - return result; -} - -static string read_file(string filename) { - ifstream file(filename); - string result((istreambuf_iterator(file)), istreambuf_iterator()); - return result; -} - -vector read_corpus_entries(string language_name) { +vector read_real_language_corpus(string language_name) { vector result; - string fixtures_dir = "spec/fixtures/"; - string test_directory = fixtures_dir + "grammars/" + language_name + "/grammar_test"; - for (string &test_filename : list_directory(test_directory)) - for (TestEntry &entry : parse_test_entries(read_file(test_filename))) + for (string &test_filename : list_directory(test_directory)) { + for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) { result.push_back(entry); + } + } string error_test_filename = fixtures_dir + "/error_corpus/" + language_name + "_errors.txt"; - for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) + for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) { result.push_back(entry); + } return result; } + +vector read_test_language_corpus(string language_name) { + vector result; + + string test_directory = fixtures_dir + "test_grammars/" + language_name; + for (string &test_filename : list_directory(test_directory)) { + for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) { + result.push_back(entry); + } + } + + return result; +} \ No newline at end of file diff --git a/spec/helpers/read_test_entries.h b/spec/helpers/read_test_entries.h index 69f949fc..3de397f1 100644 --- a/spec/helpers/read_test_entries.h +++ b/spec/helpers/read_test_entries.h @@ 
-10,6 +10,7 @@ struct TestEntry { std::string tree_string; }; -std::vector read_corpus_entries(std::string directory); +std::vector read_real_language_corpus(std::string name); +std::vector read_test_language_corpus(std::string name); #endif diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc deleted file mode 100644 index ed2109c2..00000000 --- a/spec/integration/compile_grammar_spec.cc +++ /dev/null @@ -1,847 +0,0 @@ -#include "spec_helper.h" -#include "runtime/alloc.h" -#include "helpers/load_language.h" -#include "helpers/stderr_logger.h" -#include "helpers/dedent.h" -#include "compiler/util/string_helpers.h" -#include - -static string fill_template(string input, map parameters) { - string result = input; - for (const auto &pair : parameters) { - util::str_replace(&result, "{{" + pair.first + "}}", pair.second); - } - return result; -} - -START_TEST - -describe("compile_grammar", []() { - TSDocument *document; - - before_each([&]() { - document = ts_document_new(); - }); - - after_each([&]() { - ts_document_free(document); - }); - - auto assert_root_node = [&](const string &expected_string) { - TSNode root_node = ts_document_root_node(document); - char *node_string = ts_node_string(root_node, document); - AssertThat(node_string, Equals(expected_string)); - ts_free(node_string); - }; - - describe("conflicts", [&]() { - it("can resolve shift/reduce conflicts using associativities", [&]() { - string grammar_template = R"JSON({ - "name": "associativity_example", - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "math_operation"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - "math_operation": { - "type": "{{math_operation_prec_type}}", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "identifier": { - "type": 
"PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - // Ambiguity, which '+' applies first? - ts_document_set_input_string(document, "x+y+z"); - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC"} - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - expression '+' expression • '+' … - - Possible interpretations: - - 1: (math_operation expression '+' expression) • '+' … - 2: expression '+' (math_operation expression • '+' expression) - - Possible resolutions: - - 1: Specify a left or right associativity in `math_operation` - 2: Add a conflict for these rules: `math_operation` - )MESSAGE"))); - - result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC_LEFT"} - }).c_str()); - - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (math_operation " - "(expression (math_operation (expression (identifier)) (expression (identifier)))) " - "(expression (identifier))))"); - - result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC_RIGHT"} - }).c_str()); - - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (math_operation " - "(expression (identifier)) " - "(expression (math_operation (expression (identifier)) (expression (identifier))))))"); - }); - - it("can resolve shift/reduce conflicts involving single-child rules using precedence", [&]() { - string grammar_template = R"JSON({ - "name": "associativity_example", - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "function_call"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - 
"function_call": { - "type": "PREC_RIGHT", - "value": {{function_call_precedence}}, - "content": { - "type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "block"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "block"} - ] - } - ] - } - }, - - "block": { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "{"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "}"} - ] - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - // Ambiguity: is the trailing block associated with `bar` or `foo`? - ts_document_set_input_string(document, "foo bar { baz }"); - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "0"} - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - identifier • '{' … - - Possible interpretations: - - 1: (expression identifier) • '{' … - 2: (function_call identifier • block) - - Possible resolutions: - - 1: Specify a higher precedence in `function_call` than in the other rules. - 2: Specify a higher precedence in `expression` than in the other rules. - 3: Specify a left or right associativity in `expression` - 4: Add a conflict for these rules: `expression` `function_call` - )MESSAGE"))); - - // Giving function calls lower precedence than expressions causes `bar` - // to be treated as an expression passed to `foo`, not as a function - // that's being called with a block. 
- result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "-1"} - }).c_str()); - - AssertThat(result.error_message, IsNull()); - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (identifier)) " - "(block (expression (identifier)))))"); - - // Giving function calls higher precedence than expressions causes `bar` - // to be treated as a function that's being called with a block, not as - // an expression passed to `foo`. - result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "1"} - }).c_str()); - - AssertThat(result.error_message, IsNull()); - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_set_input_string(document, "foo bar { baz }"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (function_call " - "(identifier) " - "(block (expression (identifier)))))))"); - }); - - it("handles precedence applied to specific rule subsequences (regression)", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON({ - "name": "precedence_on_subsequence", - - "extras": [ - {"type": "STRING", "value": " "} - ], - - "rules": { - "expression": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "function_call"}, - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "scope_resolution"} - ] - } - }, - - "function_call": { - "type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - - { - "type": "PREC", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "block"} - ] 
- } - }, - - { - "type": "PREC", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "do_block"} - ] - } - }, - - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - { - "type": "PREC", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "block"} - ] - } - } - ] - }, - - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - { - "type": "PREC", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "do_block"} - ] - } - } - ] - } - ] - }, - - "scope_resolution": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "::"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "::"}, - {"type": "SYMBOL", "name": "expression"}, - ] - } - ] - } - }, - - "block": { - "type": "STRING", - "value": "{}" - }, - - "do_block": { - "type": "STRING", - "value": "do end" - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"); - - auto language = load_compile_result("precedence_on_subsequence", result); - ts_document_set_language(document, language); - - ts_document_set_input_string(document, "a b {}"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (function_call (identifier) (block)))))"); - - ts_document_set_input_string(document, "a b do end"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (identifier)) " - "(do_block)))"); - }); - - it("does not allow conflicting precedences", [&]() { - string grammar_template = 
R"JSON({ - "name": "conflicting_precedence_example", - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "product"}, - {"type": "SYMBOL", "name": "other_thing"} - ] - }, - - "sum": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "product": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "other_thing": { - "type": "PREC_LEFT", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "STRING", "value": "*"} - ] - } - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - expression '+' expression • '*' … - - Possible interpretations: - - 1: (sum expression '+' expression) • '*' … - 2: expression '+' (product expression • '*' expression) - 3: expression '+' (other_thing expression • '*' '*') - - Possible resolutions: - - 1: Specify a higher precedence in `product` and `other_thing` than in the other rules. - 2: Specify a higher precedence in `sum` than in the other rules. 
- 3: Add a conflict for these rules: `sum` `product` `other_thing` - )MESSAGE"))); - }); - }); - - describe("when the grammar contains rules that match the empty string", [&]() { - it("reports an error", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "empty_rules", - - "rules": { - "rule_1": {"type": "SYMBOL", "name": "rule_2"}, - - "rule_2": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "rule_1"}, - {"type": "BLANK"} - ] - } - } - } - )JSON"); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - The rule `rule_2` matches the empty string. - Tree-sitter currently does not support syntactic rules that match the empty string. - )MESSAGE"))); - }); - }); - - describe("external scanners", [&]() { - it("can tokenize using arbitrary user-defined scanner functions", [&]() { - string grammar = R"JSON({ - "name": "external_scanner_example", - - "externals": [ - "_percent_string", - "_percent_string_start", - "_percent_string_end" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "string"}, - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - "sum": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "string": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "_percent_string"}, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "_percent_string_start"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "_percent_string_end"} - ] - }, - ] - }, - - "identifier": { - "type": "PATTERN", - "value": "\\a+" - } - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - 
ts_document_set_language(document, load_compile_result( - "external_scanner_example", - result, - "spec/fixtures/external_scanners/percent_strings.c" - )); - - ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); - ts_document_parse(document); - assert_root_node("(expression (sum (expression (identifier)) (expression (string))))"); - - ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}"); - ts_document_parse(document); - assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); - }); - - it("allows external scanners to refer to tokens that are defined internally", [&]() { - string grammar = R"JSON({ - "name": "shared_external_tokens", - - "externals": [ - "string", - "line_break" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "statement": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "_expression"}, - {"type": "SYMBOL", "name": "_expression"}, - {"type": "SYMBOL", "name": "line_break"} - ] - }, - - "_expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "string"}, - {"type": "SYMBOL", "name": "variable"}, - {"type": "SYMBOL", "name": "number"} - ] - }, - - "variable": {"type": "PATTERN", "value": "\\a+"}, - "number": {"type": "PATTERN", "value": "\\d+"}, - "line_break": {"type": "STRING", "value": "\n"} - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - ts_document_set_language(document, load_compile_result( - "shared_external_tokens", - result, - "spec/fixtures/external_scanners/shared_external_tokens.c" - )); - - ts_document_set_input_string(document, "a b\n"); - ts_document_parse(document); - assert_root_node("(statement (variable) (variable) (line_break))"); - - ts_document_set_input_string(document, "a \nb\n"); - ts_document_parse(document); - assert_root_node("(statement (variable) (variable) (line_break))"); - 
- ts_document_set_input_string(document, "'hello' 'world'\n"); - ts_document_parse(document); - assert_root_node("(statement (string) (string) (line_break))"); - - ts_document_set_input_string(document, "'hello' \n'world'\n"); - ts_document_parse(document); - assert_root_node("(statement (string) (string) (line_break))"); - }); - - it("allows external tokens to be used as extras", [&]() { - string grammar = R"JSON({ - "name": "extra_external_tokens", - - "externals": [ - "comment" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"}, - {"type": "SYMBOL", "name": "comment"} - ], - - "rules": { - "assignment": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "variable"}, - {"type": "STRING", "value": "="}, - {"type": "SYMBOL", "name": "variable"} - ] - }, - - "variable": {"type": "PATTERN", "value": "\\a+"} - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - ts_document_set_language(document, load_compile_result( - "extra_external_tokens", - result, - "spec/fixtures/external_scanners/extra_external_tokens.c" - )); - - ts_document_set_input_string(document, "x = # a comment\n y"); - ts_document_parse(document); - assert_root_node("(assignment (variable) (comment) (variable))"); - }); - }); - - describe("when the grammar's start symbol is a token", [&]() { - it("parses the token", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "one_token_language", - "rules": { - "first_rule": {"type": "STRING", "value": "the-value"} - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("one_token_language", result)); - - ts_document_set_input_string(document, "the-value"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("when the grammar's start symbol is blank", [&]() { - it("parses the empty string", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": 
"blank_language", - "rules": { - "first_rule": {"type": "BLANK"} - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("blank_language", result)); - - ts_document_set_input_string(document, ""); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("when the grammar contains anonymous tokens with escaped characters", [&]() { - it("escapes the escaped characters properly in the generated parser", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "escaped_char_language", - "rules": { - "first_rule": { - "type": "CHOICE", - "members": [ - {"type": "STRING", "value": "\n"}, - {"type": "STRING", "value": "\r"}, - {"type": "STRING", "value": "'hello'"}, - {"type": "PATTERN", "value": "\\d+"} - ] - } - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("escaped_char_language", result)); - - ts_document_set_input_string(document, "1234"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - - ts_document_set_input_string(document, "\n"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - - ts_document_set_input_string(document, "'hello'"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("the grammar in the README", [&]() { - it("parses the input in the README", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "arithmetic", - - // Things that can appear anywhere in the language, like comments - // and whitespace, are expressed as 'extras'. - "extras": [ - {"type": "PATTERN", "value": "\\s"}, - {"type": "SYMBOL", "name": "comment"} - ], - - "rules": { - - // The first rule listed in the grammar becomes the 'start rule'. 
- "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "product"}, - {"type": "SYMBOL", "name": "number"}, - {"type": "SYMBOL", "name": "variable"}, - { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "("}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": ")"} - ] - } - ] - }, - - // Tokens like '+' and '*' are described directly within the - // grammar's rules, as opposed to in a seperate lexer description. - "sum": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - // Ambiguities can be resolved at compile time by assigning precedence - // values to rule subtrees. - "product": { - "type": "PREC_LEFT", - "value": 2, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - // Tokens can be specified using ECMAScript regexps. 
- "number": {"type": "PATTERN", "value": "\\d+"}, - "comment": {"type": "PATTERN", "value": "#.*"}, - "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"} - } - } - )JSON"); - - const TSLanguage *language = load_compile_result("arithmetic", result); - - ts_document_set_language(document, language); - ts_document_set_input_string(document, "a + b * c"); - ts_document_parse(document); - - assert_root_node( - "(expression (sum " - "(expression (variable)) " - "(expression (product " - "(expression (variable)) " - "(expression (variable))))))"); - }); - }); -}); - -END_TEST diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc deleted file mode 100644 index c399e8f9..00000000 --- a/spec/integration/corpus_specs.cc +++ /dev/null @@ -1,185 +0,0 @@ -#include "spec_helper.h" -#include "runtime/alloc.h" -#include "helpers/load_language.h" -#include "helpers/read_test_entries.h" -#include "helpers/spy_input.h" -#include "helpers/stderr_logger.h" -#include "helpers/point_helpers.h" -#include "helpers/encoding_helpers.h" -#include "helpers/record_alloc.h" -#include "helpers/random_helpers.h" -#include "helpers/scope_sequence.h" -#include - -static void assert_correct_tree_shape(const TSDocument *document, string tree_string) { - TSNode root_node = ts_document_root_node(document); - const char *node_string = ts_node_string(root_node, document); - string result(node_string); - ts_free((void *)node_string); - AssertThat(result, Equals(tree_string)); -} - -static void assert_consistent_sizes(TSNode node) { - size_t child_count = ts_node_child_count(node); - size_t start_byte = ts_node_start_byte(node); - size_t end_byte = ts_node_end_byte(node); - TSPoint start_point = ts_node_start_point(node); - TSPoint end_point = ts_node_end_point(node); - bool some_child_has_changes = false; - - AssertThat(start_byte, !IsGreaterThan(end_byte)); - AssertThat(start_point, !IsGreaterThan(end_point)); - - size_t last_child_end_byte = start_byte; - TSPoint 
last_child_end_point = start_point; - - for (size_t i = 0; i < child_count; i++) { - TSNode child = ts_node_child(node, i); - size_t child_start_byte = ts_node_start_byte(child); - TSPoint child_start_point = ts_node_start_point(child); - - AssertThat(child_start_byte, !IsLessThan(last_child_end_byte)); - AssertThat(child_start_point, !IsLessThan(last_child_end_point)); - assert_consistent_sizes(child); - if (ts_node_has_changes(child)) - some_child_has_changes = true; - - last_child_end_byte = ts_node_end_byte(child); - last_child_end_point = ts_node_end_point(child); - } - - if (child_count > 0) { - AssertThat(end_byte, !IsLessThan(last_child_end_byte)); - AssertThat(end_point, !IsLessThan(last_child_end_point)); - } - - if (some_child_has_changes) { - AssertThat(ts_node_has_changes(node), IsTrue()); - } -} - -static void assert_correct_tree_size(TSDocument *document, string content) { - TSNode root_node = ts_document_root_node(document); - size_t expected_size = content.size(); - - // In the JSON grammar, the start rule (`_value`) is hidden, so the node - // returned from `ts_document_root_node` (e.g. an `object` node), does not - // actually point to the root of the tree. In this weird case, trailing - // whitespace is not included in the root node's size. - // - // TODO: Fix this inconsistency. Maybe disallow the start rule being hidden? 
- if (ts_document_language(document) == get_test_language("json") && - string(ts_node_type(root_node, document)) != "ERROR") - expected_size = content.find_last_not_of("\n ") + 1; - - AssertThat(ts_node_end_byte(root_node), Equals(expected_size)); - assert_consistent_sizes(root_node); -} - -START_TEST - -describe("The Corpus", []() { - vector test_languages({ - "javascript", - "json", - "c", - "cpp", - "python", - }); - - for (auto &language_name : test_languages) { - describe(("the " + language_name + " language").c_str(), [&]() { - TSDocument *document; - - before_each([&]() { - record_alloc::start(); - document = ts_document_new(); - ts_document_set_language(document, get_test_language(language_name)); - - // ts_document_set_logger(document, stderr_logger_new(true)); - // ts_document_print_debugging_graphs(document, true); - }); - - after_each([&]() { - ts_document_free(document); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - for (auto &entry : read_corpus_entries(language_name)) { - SpyInput *input; - - auto it_handles_edit_sequence = [&](string name, std::function edit_sequence){ - it(("parses " + entry.description + ": " + name).c_str(), [&]() { - input = new SpyInput(entry.input, 3); - ts_document_set_input(document, input->input()); - edit_sequence(); - assert_correct_tree_shape(document, entry.tree_string); - assert_correct_tree_size(document, input->content); - delete input; - }); - }; - - it_handles_edit_sequence("initial parse", [&]() { - ts_document_parse(document); - }); - - std::set> deletions; - std::set> insertions; - - for (size_t i = 0; i < 60; i++) { - size_t edit_position = random() % utf8_char_count(entry.input); - size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); - string inserted_text = random_words(random() % 4 + 1); - - if (insertions.insert({edit_position, inserted_text}).second) { - string description = "\"" + inserted_text + "\" at " + to_string(edit_position); - - 
it_handles_edit_sequence("repairing an insertion of " + description, [&]() { - ts_document_edit(document, input->replace(edit_position, 0, inserted_text)); - ts_document_parse(document); - assert_correct_tree_size(document, input->content); - - ts_document_edit(document, input->undo()); - assert_correct_tree_size(document, input->content); - - TSRange *ranges; - uint32_t range_count; - ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); - ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); - - ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); - verify_changed_ranges(old_scope_sequence, new_scope_sequence, - input->content, ranges, range_count); - ts_free(ranges); - }); - } - - if (deletions.insert({edit_position, deletion_size}).second) { - string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size); - - it_handles_edit_sequence("repairing a deletion of " + desription, [&]() { - ts_document_edit(document, input->replace(edit_position, deletion_size, "")); - ts_document_parse(document); - assert_correct_tree_size(document, input->content); - - ts_document_edit(document, input->undo()); - assert_correct_tree_size(document, input->content); - - TSRange *ranges; - uint32_t range_count; - ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); - ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); - - ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); - verify_changed_ranges(old_scope_sequence, new_scope_sequence, - input->content, ranges, range_count); - ts_free(ranges); - }); - } - } - } - }); - } -}); - -END_TEST diff --git a/spec/integration/real_grammars.cc b/spec/integration/real_grammars.cc new file mode 100644 index 00000000..a7c2137d --- /dev/null +++ b/spec/integration/real_grammars.cc @@ -0,0 +1,181 @@ +#include "spec_helper.h" +#include "runtime/alloc.h" 
+#include "helpers/load_language.h" +#include "helpers/read_test_entries.h" +#include "helpers/spy_input.h" +#include "helpers/stderr_logger.h" +#include "helpers/point_helpers.h" +#include "helpers/encoding_helpers.h" +#include "helpers/record_alloc.h" +#include "helpers/random_helpers.h" +#include "helpers/scope_sequence.h" +#include + +static void assert_consistent_sizes(TSNode node) { + size_t child_count = ts_node_child_count(node); + size_t start_byte = ts_node_start_byte(node); + size_t end_byte = ts_node_end_byte(node); + TSPoint start_point = ts_node_start_point(node); + TSPoint end_point = ts_node_end_point(node); + bool some_child_has_changes = false; + + AssertThat(start_byte, !IsGreaterThan(end_byte)); + AssertThat(start_point, !IsGreaterThan(end_point)); + + size_t last_child_end_byte = start_byte; + TSPoint last_child_end_point = start_point; + + for (size_t i = 0; i < child_count; i++) { + TSNode child = ts_node_child(node, i); + size_t child_start_byte = ts_node_start_byte(child); + TSPoint child_start_point = ts_node_start_point(child); + + AssertThat(child_start_byte, !IsLessThan(last_child_end_byte)); + AssertThat(child_start_point, !IsLessThan(last_child_end_point)); + assert_consistent_sizes(child); + if (ts_node_has_changes(child)) + some_child_has_changes = true; + + last_child_end_byte = ts_node_end_byte(child); + last_child_end_point = ts_node_end_point(child); + } + + if (child_count > 0) { + AssertThat(end_byte, !IsLessThan(last_child_end_byte)); + AssertThat(end_point, !IsLessThan(last_child_end_point)); + } + + if (some_child_has_changes) { + AssertThat(ts_node_has_changes(node), IsTrue()); + } +} + +static void assert_correct_tree_size(TSDocument *document, string content) { + TSNode root_node = ts_document_root_node(document); + size_t expected_size = content.size(); + + // In the JSON grammar, the start rule (`_value`) is hidden, so the node + // returned from `ts_document_root_node` (e.g. 
an `object` node), does not + // actually point to the root of the tree. In this weird case, trailing + // whitespace is not included in the root node's size. + // + // TODO: Fix this inconsistency. Maybe disallow the start rule being hidden? + if (ts_document_language(document) == load_real_language("json") && + string(ts_node_type(root_node, document)) != "ERROR") + expected_size = content.find_last_not_of("\n ") + 1; + + AssertThat(ts_node_end_byte(root_node), Equals(expected_size)); + assert_consistent_sizes(root_node); +} + +START_TEST + +vector test_languages({ + "javascript", + "json", + "c", + "cpp", + "python", +}); + +for (auto &language_name : test_languages) { + describe(("the " + language_name + " language").c_str(), [&]() { + TSDocument *document; + + before_each([&]() { + record_alloc::start(); + document = ts_document_new(); + ts_document_set_language(document, load_real_language(language_name)); + + // ts_document_set_logger(document, stderr_logger_new(true)); + // ts_document_print_debugging_graphs(document, true); + }); + + after_each([&]() { + ts_document_free(document); + AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); + }); + + for (auto &entry : read_real_language_corpus(language_name)) { + SpyInput *input; + + auto it_handles_edit_sequence = [&](string name, std::function edit_sequence){ + it(("parses " + entry.description + ": " + name).c_str(), [&]() { + input = new SpyInput(entry.input, 3); + ts_document_set_input(document, input->input()); + edit_sequence(); + + TSNode root_node = ts_document_root_node(document); + const char *node_string = ts_node_string(root_node, document); + string result(node_string); + ts_free((void *)node_string); + AssertThat(result, Equals(entry.tree_string)); + + assert_correct_tree_size(document, input->content); + delete input; + }); + }; + + it_handles_edit_sequence("initial parse", [&]() { + ts_document_parse(document); + }); + + std::set> deletions; + std::set> insertions; + + for 
(size_t i = 0; i < 60; i++) { + size_t edit_position = random() % utf8_char_count(entry.input); + size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); + string inserted_text = random_words(random() % 4 + 1); + + if (insertions.insert({edit_position, inserted_text}).second) { + string description = "\"" + inserted_text + "\" at " + to_string(edit_position); + + it_handles_edit_sequence("repairing an insertion of " + description, [&]() { + ts_document_edit(document, input->replace(edit_position, 0, inserted_text)); + ts_document_parse(document); + assert_correct_tree_size(document, input->content); + + ts_document_edit(document, input->undo()); + assert_correct_tree_size(document, input->content); + + TSRange *ranges; + uint32_t range_count; + ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); + ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); + + ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); + verify_changed_ranges(old_scope_sequence, new_scope_sequence, + input->content, ranges, range_count); + ts_free(ranges); + }); + } + + if (deletions.insert({edit_position, deletion_size}).second) { + string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size); + + it_handles_edit_sequence("repairing a deletion of " + desription, [&]() { + ts_document_edit(document, input->replace(edit_position, deletion_size, "")); + ts_document_parse(document); + assert_correct_tree_size(document, input->content); + + ts_document_edit(document, input->undo()); + assert_correct_tree_size(document, input->content); + + TSRange *ranges; + uint32_t range_count; + ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); + ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); + + ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); + 
verify_changed_ranges(old_scope_sequence, new_scope_sequence, + input->content, ranges, range_count); + ts_free(ranges); + }); + } + } + } + }); +} + +END_TEST diff --git a/spec/integration/test_grammars.cc b/spec/integration/test_grammars.cc new file mode 100644 index 00000000..128dd6cc --- /dev/null +++ b/spec/integration/test_grammars.cc @@ -0,0 +1,78 @@ +#include "spec_helper.h" +#include "helpers/read_test_entries.h" +#include "helpers/load_language.h" +#include "helpers/stderr_logger.h" +#include "helpers/file_helpers.h" +#include "runtime/alloc.h" + +START_TEST + +string grammars_dir_path = "spec/fixtures/test_grammars"; +vector test_languages = list_directory(grammars_dir_path); + +for (auto &language_name : test_languages) { + if (language_name == "readme.md") continue; + + describe(("test language: " + language_name).c_str(), [&]() { + string directory_path = grammars_dir_path + "/" + language_name; + string grammar_path = directory_path + "/grammar.json"; + string external_scanner_path = directory_path + "/scanner.c"; + string expected_error_path = directory_path + "/expected_error.txt"; + string corpus_path = directory_path + "/corpus.txt"; + + if (!file_exists(external_scanner_path)) { + external_scanner_path = ""; + } + + string grammar_json = read_file(grammar_path); + TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str()); + + if (file_exists(expected_error_path)) { + it("fails with the correct error message", [&]() { + string expected_error = read_file(expected_error_path); + AssertThat((void *)compile_result.error_message, !IsNull()); + AssertThat(compile_result.error_message, Equals(expected_error)); + }); + + return; + } else { + TSDocument *document = nullptr; + const TSLanguage *language = nullptr; + + before_each([&]() { + if (!language) { + language = load_test_language( + language_name, + compile_result, + external_scanner_path + ); + } + + document = ts_document_new(); + ts_document_set_language(document, language); + + 
// ts_document_set_logger(document, stderr_logger_new(true)); + // ts_document_print_debugging_graphs(document, true); + }); + + after_each([&]() { + if (document) ts_document_free(document); + }); + + for (auto &entry : read_test_language_corpus(language_name)) { + it(("parses " + entry.description).c_str(), [&]() { + ts_document_set_input_string_with_length(document, entry.input.c_str(), entry.input.size()); + ts_document_parse(document); + + TSNode root_node = ts_document_root_node(document); + const char *node_string = ts_node_string(root_node, document); + string result(node_string); + ts_free((void *)node_string); + AssertThat(result, Equals(entry.tree_string)); + }); + } + } + }); +} + +END_TEST \ No newline at end of file diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index f80419dc..1863e210 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -43,7 +43,7 @@ describe("Document", [&]() { before_each([&]() { spy_input = new SpyInput("{\"key\": [null, 2]}", 3); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, "{\"key\": [1, 2]}"); ts_document_parse(document); @@ -152,7 +152,7 @@ describe("Document", [&]() { }); it("uses the given language for future parses", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_parse(document); root = ts_document_root_node(document); @@ -162,10 +162,10 @@ describe("Document", [&]() { }); it("clears out any previous tree", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_parse(document); - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); 
AssertThat(ts_document_root_node(document).data, Equals(nullptr)); ts_document_parse(document); @@ -177,7 +177,7 @@ describe("Document", [&]() { }); it("does not allow setting a language with a different version number", [&]() { - TSLanguage language = *get_test_language("json"); + TSLanguage language = *load_real_language("json"); AssertThat(ts_language_version(&language), Equals(TREE_SITTER_LANGUAGE_VERSION)); language.version++; @@ -193,7 +193,7 @@ describe("Document", [&]() { before_each([&]() { logger = new SpyLogger(); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, "[1, 2]"); }); @@ -235,7 +235,7 @@ describe("Document", [&]() { SpyInput *input; before_each([&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); input = new SpyInput("{a: null};", 3); ts_document_set_input(document, input->input()); ts_document_parse(document); diff --git a/spec/runtime/node_spec.cc b/spec/runtime/node_spec.cc index 085e4d31..f01a862f 100644 --- a/spec/runtime/node_spec.cc +++ b/spec/runtime/node_spec.cc @@ -40,7 +40,7 @@ describe("Node", []() { record_alloc::start(); document = ts_document_new(); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, input_string.c_str()); ts_document_parse(document); diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 0b4c0a3a..c1c3a547 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -83,7 +83,7 @@ describe("Parser", [&]() { describe("handling errors", [&]() { describe("when there is an invalid substring right before a valid token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, 
get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, @@@@@, true]"); assert_root_node( @@ -108,7 +108,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string in the middle of a token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, faaaaalse, true]"); assert_root_node( @@ -134,7 +134,7 @@ describe("Parser", [&]() { describe("when there is one unexpected token between two valid tokens", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, true false, true]"); assert_root_node( @@ -153,7 +153,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string at the end of a token", [&]() { it("computes the error's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, \"hi\n, true]"); assert_root_node( @@ -163,7 +163,7 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("a; /* b"); assert_root_node( "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); @@ -172,7 +172,7 @@ describe("Parser", [&]() { describe("when there are extra tokens at the end of the viable prefix", [&]() { it("does not include them in the error node", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, 
load_real_language("javascript")); set_text( "var x;\n" "\n" @@ -192,7 +192,7 @@ describe("Parser", [&]() { describe("handling extra tokens", [&]() { describe("when the token appears as part of a grammar rule", [&]() { it("incorporates it into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("fn()\n"); assert_root_node( @@ -202,7 +202,7 @@ describe("Parser", [&]() { describe("when the token appears somewhere else", [&]() { it("incorporates it into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text( "fn()\n" " .otherFn();"); @@ -218,7 +218,7 @@ describe("Parser", [&]() { describe("when several extra tokens appear in a row", [&]() { it("incorporates them into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text( "fn()\n\n" "// This is a comment" @@ -239,7 +239,7 @@ describe("Parser", [&]() { describe("editing", [&]() { describe("creating new tokens near the end of the input", [&]() { it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("x * (100 + abc);"); assert_root_node( @@ -262,7 +262,7 @@ describe("Parser", [&]() { it("updates the parse tree and re-reads only the changed portion of the input", [&]() { chunk_size = 2; - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 + 456 * (10 + x);"); assert_root_node( @@ -285,7 +285,7 @@ describe("Parser", [&]() { describe("introducing an error", [&]() { it("gives the error the right 
size", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("var x = y;"); assert_root_node( @@ -308,7 +308,7 @@ describe("Parser", [&]() { describe("into the middle of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -327,7 +327,7 @@ describe("Parser", [&]() { describe("at the end of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -346,7 +346,7 @@ describe("Parser", [&]() { describe("inserting text into a node containing a extra token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 *\n" "// a-comment\n" "abc;"); @@ -373,7 +373,7 @@ describe("Parser", [&]() { describe("when a critical token is removed", [&]() { it("updates the parse tree, creating an error", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 * 456; 789 * 123;"); assert_root_node( @@ -392,7 +392,7 @@ describe("Parser", [&]() { describe("with external tokens", [&]() { it("maintains the external scanner's state during incremental parsing", [&]() { - ts_document_set_language(document, get_test_language("python")); + ts_document_set_language(document, load_real_language("python")); string text = dedent(R"PYTHON( if a: print b @@ -420,7 +420,7 @@ describe("Parser", [&]() { }); it("does not try to re-use nodes that are within the 
edited region", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("{ x: (b.c) };"); assert_root_node( @@ -435,7 +435,7 @@ describe("Parser", [&]() { }); it("updates the document's parse count", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); AssertThat(ts_document_parse_count(document), Equals(0)); set_text("{ x: (b.c) };"); @@ -449,7 +449,7 @@ describe("Parser", [&]() { describe("lexing", [&]() { describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() { it("terminates them at the end of the document", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("x; // this is a comment"); assert_root_node( @@ -464,7 +464,7 @@ describe("Parser", [&]() { it("recognizes UTF8 characters as single characters", [&]() { // 'ΩΩΩ — ΔΔ'; - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); assert_root_node(