Change repeat to mean zero-or-more

This commit is contained in:
Max Brunsfeld 2014-02-15 15:43:32 -08:00
parent bf07522026
commit 5c1a0982df
12 changed files with 219 additions and 155 deletions

View file

@ -9,8 +9,8 @@ extern "C" {
#include <stdio.h>
#include <string.h>
// #define TS_DEBUG_PARSE
// #define TS_DEBUG_LEX
//#define TS_DEBUG_PARSE
//#define TS_DEBUG_LEX
#ifdef TS_DEBUG_LEX
#define DEBUG_LEX(...) fprintf(stderr, __VA_ARGS__)

View file

@ -105,12 +105,12 @@ describe("preparing a grammar", []() {
sym("y")
}) },
}, {
{ "repeat_helper1", seq({
seq({ sym("a"), sym("b") }),
choice({
{ "repeat_helper1", choice({
seq({
seq({ sym("a"), sym("b") }),
aux_sym("repeat_helper1"),
blank(),
}),
blank(),
}) }
})));
});

View file

@ -5,12 +5,12 @@ using namespace tree_sitter;
using namespace rules;
static rule_ptr comma_sep(const rule_ptr &rule) {
return seq({
rule,
choice({
return choice({
seq({
rule,
repeat(seq({ aux_sym("comma"), rule })),
blank(),
}),
blank(),
});
}

View file

@ -147,6 +147,24 @@ static void ts_lex(TSParser *parser) {
case 20:
ACCEPT_TOKEN(ts_aux_colon);
case 21:
if (LOOKAHEAD_CHAR() == '\"')
ADVANCE(10);
if (LOOKAHEAD_CHAR() == '}')
ADVANCE(4);
LEX_ERROR(2, EXPECT({"\"", "}"}));
case 22:
if (LOOKAHEAD_CHAR() == '\"')
ADVANCE(10);
if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9')
ADVANCE(16);
if (LOOKAHEAD_CHAR() == '[')
ADVANCE(17);
if (LOOKAHEAD_CHAR() == ']')
ADVANCE(7);
if (LOOKAHEAD_CHAR() == '{')
ADVANCE(18);
LEX_ERROR(5, EXPECT({"\"", "0-9", "[", "]", "{"}));
case 23:
if (LOOKAHEAD_CHAR() == '\"')
ADVANCE(10);
LEX_ERROR(1, EXPECT({"\""}));
@ -200,6 +218,8 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_string:
SHIFT(4);
case ts_aux_right_brace:
SHIFT(43);
default:
PARSE_PANIC();
}
@ -227,7 +247,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_left_brace:
SHIFT(13);
case ts_aux_left_bracket:
SHIFT(20);
SHIFT(19);
default:
PARSE_PANIC();
}
@ -249,12 +269,12 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_repeat_helper2:
SHIFT(41);
case ts_aux_right_brace:
SHIFT(43);
REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
case 8:
SET_LEX_STATE(21);
SET_LEX_STATE(23);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_string:
SHIFT(9);
@ -285,7 +305,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_left_brace:
SHIFT(13);
case ts_aux_left_bracket:
SHIFT(20);
SHIFT(19);
default:
PARSE_PANIC();
}
@ -297,7 +317,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_repeat_helper2:
SHIFT(12);
case ts_aux_right_brace:
REDUCE(ts_aux_repeat_helper2, 4, COLLAPSE({1, 0, 1, 0}));
REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
@ -314,6 +334,8 @@ static TSParseResult ts_parse(const char *input) {
switch (LOOKAHEAD_SYM()) {
case ts_symbol_string:
SHIFT(14);
case ts_aux_right_brace:
SHIFT(40);
default:
PARSE_PANIC();
}
@ -341,7 +363,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_left_brace:
SHIFT(13);
case ts_aux_left_bracket:
SHIFT(20);
SHIFT(19);
default:
PARSE_PANIC();
}
@ -353,7 +375,7 @@ static TSParseResult ts_parse(const char *input) {
case ts_aux_repeat_helper2:
SHIFT(17);
case ts_aux_right_brace:
SHIFT(19);
REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
@ -376,32 +398,34 @@ static TSParseResult ts_parse(const char *input) {
PARSE_PANIC();
}
case 19:
SET_LEX_STATE(2);
SET_LEX_STATE(22);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1}));
case ts_aux_right_brace:
REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1}));
case ts_symbol_array:
SHIFT(20);
case ts_symbol_number:
SHIFT(20);
case ts_symbol_object:
SHIFT(20);
case ts_symbol_string:
SHIFT(20);
case ts_symbol_value:
SHIFT(21);
case ts_aux_left_brace:
SHIFT(25);
case ts_aux_left_bracket:
SHIFT(32);
case ts_aux_right_bracket:
SHIFT(39);
default:
PARSE_PANIC();
}
case 20:
SET_LEX_STATE(9);
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_array:
SHIFT(21);
case ts_symbol_number:
SHIFT(21);
case ts_symbol_object:
SHIFT(21);
case ts_symbol_string:
SHIFT(21);
case ts_symbol_value:
SHIFT(22);
case ts_aux_left_brace:
SHIFT(26);
case ts_aux_left_bracket:
SHIFT(33);
case ts_aux_comma:
REDUCE(ts_symbol_value, 1, COLLAPSE({0}));
case ts_aux_right_bracket:
REDUCE(ts_symbol_value, 1, COLLAPSE({0}));
default:
PARSE_PANIC();
}
@ -409,57 +433,47 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_value, 1, COLLAPSE({0}));
SHIFT(22);
case ts_aux_repeat_helper1:
SHIFT(37);
case ts_aux_right_bracket:
REDUCE(ts_symbol_value, 1, COLLAPSE({0}));
REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
case 22:
SET_LEX_STATE(6);
SET_LEX_STATE(9);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
case ts_symbol_array:
SHIFT(20);
case ts_symbol_number:
SHIFT(20);
case ts_symbol_object:
SHIFT(20);
case ts_symbol_string:
SHIFT(20);
case ts_symbol_value:
SHIFT(23);
case ts_aux_repeat_helper1:
SHIFT(38);
case ts_aux_right_bracket:
SHIFT(40);
case ts_aux_left_brace:
SHIFT(25);
case ts_aux_left_bracket:
SHIFT(32);
default:
PARSE_PANIC();
}
case 23:
SET_LEX_STATE(9);
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_array:
SHIFT(21);
case ts_symbol_number:
SHIFT(21);
case ts_symbol_object:
SHIFT(21);
case ts_symbol_string:
SHIFT(21);
case ts_symbol_value:
case ts_aux_comma:
SHIFT(22);
case ts_aux_repeat_helper1:
SHIFT(24);
case ts_aux_left_brace:
SHIFT(26);
case ts_aux_left_bracket:
SHIFT(33);
case ts_aux_right_bracket:
REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
case 24:
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
SHIFT(23);
case ts_aux_repeat_helper1:
SHIFT(25);
case ts_aux_right_bracket:
REDUCE(ts_aux_repeat_helper1, 2, COLLAPSE({1, 0}));
default:
PARSE_PANIC();
}
case 25:
SET_LEX_STATE(8);
switch (LOOKAHEAD_SYM()) {
case ts_aux_right_bracket:
@ -467,23 +481,25 @@ static TSParseResult ts_parse(const char *input) {
default:
PARSE_PANIC();
}
case 26:
case 25:
SET_LEX_STATE(21);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_string:
SHIFT(26);
case ts_aux_right_brace:
SHIFT(31);
default:
PARSE_PANIC();
}
case 26:
SET_LEX_STATE(19);
switch (LOOKAHEAD_SYM()) {
case ts_aux_colon:
SHIFT(27);
default:
PARSE_PANIC();
}
case 27:
SET_LEX_STATE(19);
switch (LOOKAHEAD_SYM()) {
case ts_aux_colon:
SHIFT(28);
default:
PARSE_PANIC();
}
case 28:
SET_LEX_STATE(9);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_array:
@ -495,31 +511,41 @@ static TSParseResult ts_parse(const char *input) {
case ts_symbol_string:
SHIFT(6);
case ts_symbol_value:
SHIFT(29);
SHIFT(28);
case ts_aux_left_brace:
SHIFT(13);
case ts_aux_left_bracket:
SHIFT(20);
SHIFT(19);
default:
PARSE_PANIC();
}
case 29:
case 28:
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
SHIFT(8);
case ts_aux_repeat_helper2:
SHIFT(30);
SHIFT(29);
case ts_aux_right_brace:
SHIFT(32);
REDUCE(ts_aux_repeat_helper2, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
case 29:
SET_LEX_STATE(5);
switch (LOOKAHEAD_SYM()) {
case ts_aux_right_brace:
SHIFT(30);
default:
PARSE_PANIC();
}
case 30:
SET_LEX_STATE(5);
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_right_brace:
SHIFT(31);
case ts_aux_comma:
REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1}));
case ts_aux_right_bracket:
REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1}));
default:
PARSE_PANIC();
}
@ -527,59 +553,61 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1}));
REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1}));
case ts_aux_right_bracket:
REDUCE(ts_symbol_object, 6, COLLAPSE({1, 0, 1, 0, 1, 1}));
REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}
case 32:
SET_LEX_STATE(6);
SET_LEX_STATE(22);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1}));
case ts_symbol_array:
SHIFT(20);
case ts_symbol_number:
SHIFT(20);
case ts_symbol_object:
SHIFT(20);
case ts_symbol_string:
SHIFT(20);
case ts_symbol_value:
SHIFT(33);
case ts_aux_left_brace:
SHIFT(25);
case ts_aux_left_bracket:
SHIFT(32);
case ts_aux_right_bracket:
REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1}));
SHIFT(36);
default:
PARSE_PANIC();
}
case 33:
SET_LEX_STATE(9);
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_array:
SHIFT(21);
case ts_symbol_number:
SHIFT(21);
case ts_symbol_object:
SHIFT(21);
case ts_symbol_string:
SHIFT(21);
case ts_symbol_value:
case ts_aux_comma:
SHIFT(22);
case ts_aux_repeat_helper1:
SHIFT(34);
case ts_aux_left_brace:
SHIFT(26);
case ts_aux_left_bracket:
SHIFT(33);
case ts_aux_right_bracket:
REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
case 34:
SET_LEX_STATE(6);
SET_LEX_STATE(8);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
SHIFT(23);
case ts_aux_repeat_helper1:
SHIFT(35);
case ts_aux_right_bracket:
SHIFT(37);
SHIFT(35);
default:
PARSE_PANIC();
}
case 35:
SET_LEX_STATE(8);
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
case ts_aux_right_bracket:
SHIFT(36);
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
default:
PARSE_PANIC();
}
@ -587,27 +615,27 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1}));
case ts_aux_right_bracket:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}
case 37:
SET_LEX_STATE(6);
SET_LEX_STATE(8);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1}));
case ts_aux_right_bracket:
REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1}));
SHIFT(38);
default:
PARSE_PANIC();
}
case 38:
SET_LEX_STATE(8);
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_right_bracket:
SHIFT(39);
case ts_aux_comma:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
case ts_aux_right_brace:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
default:
PARSE_PANIC();
}
@ -615,9 +643,9 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1}));
case ts_aux_right_brace:
REDUCE(ts_symbol_array, 4, COLLAPSE({1, 0, 1, 1}));
REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}
@ -625,9 +653,9 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(2);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1}));
REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1}));
case ts_aux_right_brace:
REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1}));
REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}
@ -651,27 +679,29 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(0);
switch (LOOKAHEAD_SYM()) {
case ts_aux_end:
REDUCE(ts_symbol_object, 5, COLLAPSE({1, 0, 1, 0, 1}));
REDUCE(ts_symbol_object, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}
case 44:
SET_LEX_STATE(9);
SET_LEX_STATE(22);
switch (LOOKAHEAD_SYM()) {
case ts_symbol_array:
SHIFT(21);
SHIFT(20);
case ts_symbol_number:
SHIFT(21);
SHIFT(20);
case ts_symbol_object:
SHIFT(21);
SHIFT(20);
case ts_symbol_string:
SHIFT(21);
SHIFT(20);
case ts_symbol_value:
SHIFT(45);
case ts_aux_left_brace:
SHIFT(26);
SHIFT(25);
case ts_aux_left_bracket:
SHIFT(33);
SHIFT(32);
case ts_aux_right_bracket:
SHIFT(48);
default:
PARSE_PANIC();
}
@ -679,11 +709,11 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(6);
switch (LOOKAHEAD_SYM()) {
case ts_aux_comma:
SHIFT(23);
SHIFT(22);
case ts_aux_repeat_helper1:
SHIFT(46);
case ts_aux_right_bracket:
SHIFT(48);
REDUCE(ts_aux_repeat_helper1, 0, COLLAPSE({}));
default:
PARSE_PANIC();
}
@ -707,7 +737,7 @@ static TSParseResult ts_parse(const char *input) {
SET_LEX_STATE(0);
switch (LOOKAHEAD_SYM()) {
case ts_aux_end:
REDUCE(ts_symbol_array, 3, COLLAPSE({1, 0, 1}));
REDUCE(ts_symbol_array, 2, COLLAPSE({1, 1}));
default:
PARSE_PANIC();
}

View file

@ -25,14 +25,23 @@ describe("json", []() {
});
it("parses objects", [&]() {
TSDocumentSetText(document, "{\"key1\":1,\"key2\":2}");
AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number)) (string) (value (number))))"));
TSDocumentSetText(document, "{}");
AssertThat(string(TSDocumentToString(document)), Equals("(value (object))"));
TSDocumentSetText(document, "{\"key1\":1}");
AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number))))"));
TSDocumentSetText(document, "{\"key1\":1,\"key2\":2}");
AssertThat(string(TSDocumentToString(document)), Equals("(value (object (string) (value (number)) (string) (value (number))))"));
});
it("parses arrays", [&]() {
TSDocumentSetText(document, "[]");
AssertThat(string(TSDocumentToString(document)), Equals("(value (array))"));
TSDocumentSetText(document, "[5]");
AssertThat(string(TSDocumentToString(document)), Equals("(value (array (value (number))))"));
TSDocumentSetText(document, "[1,2,3]");
AssertThat(string(TSDocumentToString(document)), Equals("(value (array (value (number)) (value (number)) (value (number))))"));
});

View file

@ -32,15 +32,9 @@ namespace tree_sitter {
value = set_union(apply(rule->left, grammar), apply(rule->right, grammar));
}
bool can_be_blank(const rule_ptr &rule) {
if (rule_can_be_blank(rule)) return true;
auto symbol = std::dynamic_pointer_cast<const Symbol>(rule);
return (symbol.get() && grammar.has_definition(*symbol) && rule_can_be_blank(grammar.rule(*symbol)));
}
void visit(const Seq *rule) {
value = apply(rule->left, grammar);
if (can_be_blank(rule->left)) {
if (rule_can_be_blank(rule->left, grammar)) {
value = set_union(value, apply(rule->right, grammar));
}
}

View file

@ -21,7 +21,7 @@ namespace tree_sitter {
rule_ptr next_rule = pair.second;
if (grammar.has_definition(symbol)) {
set<Symbol> following_non_terminals = first_set(next_rule, grammar);
if (rule_can_be_blank(next_rule))
if (rule_can_be_blank(next_rule, grammar))
following_non_terminals.insert(item.lookahead_sym);
result.insert({ symbol, following_non_terminals });
}

View file

@ -5,6 +5,8 @@
#include "rules.h"
#include "grammar.h"
#include "stream_methods.h"
namespace tree_sitter {
using std::pair;
using std::string;
@ -109,6 +111,22 @@ namespace tree_sitter {
return state_index;
}
// TODO - remove
// Debugging aid: writes every parse item set to std::cout, ordered by the
// state index stored for it in `parse_state_indices`. For each item it
// prints the left-hand side, the rule, and the lookahead symbol's name.
void dump_item_sets() {
    // Invert the (item set -> state index) entries into an index-addressed table.
    // NOTE(review): assumes the stored indices are exactly 0..size()-1 — confirm.
    std::vector<const ParseItemSet *> item_sets(parse_state_indices.size());
    for (const auto &entry : parse_state_indices)
        item_sets[entry.second] = &entry.first;

    // The index uses the container's own size type to avoid a signed/unsigned comparison.
    for (decltype(item_sets.size()) i = 0; i < item_sets.size(); ++i) {
        std::cout << "\n\n" << i;
        for (const auto &item : *item_sets[i]) {
            std::cout << "\n" << item.lhs;
            std::cout << "\n " << item.rule;
            std::cout << "\n " << item.lookahead_sym.name;
        }
    }
}
public:
TableBuilder(const Grammar &grammar, const Grammar &lex_grammar) :

View file

@ -1,4 +1,5 @@
#include "rule_can_be_blank.h"
#include "grammar.h"
#include "rules.h"
namespace tree_sitter {
@ -35,5 +36,11 @@ namespace tree_sitter {
rule->accept(visitor);
return visitor.value;
}
// Grammar-aware overload. A rule can be blank if the grammar-independent
// check says so, or if the rule is a Symbol that the grammar defines and
// whose definition can itself be blank (checked recursively).
// NOTE(review): a chain of symbols that alias each other in a cycle would
// presumably recurse without terminating — confirm grammars cannot contain one.
bool rule_can_be_blank(const rule_ptr &rule, const Grammar &grammar) {
    if (rule_can_be_blank(rule))
        return true;

    // Only symbols can be looked up in the grammar; any other rule is final here.
    const auto symbol = std::dynamic_pointer_cast<const Symbol>(rule);
    if (!symbol)
        return false;
    if (!grammar.has_definition(*symbol))
        return false;

    return rule_can_be_blank(grammar.rule(*symbol), grammar);
}
}
}

View file

@ -4,8 +4,11 @@
#include "rule.h"
namespace tree_sitter {
class Grammar;
namespace build_tables {
bool rule_can_be_blank(const rules::rule_ptr &rule);
bool rule_can_be_blank(const rules::rule_ptr &rule, const Grammar &grammar);
}
}

View file

@ -19,9 +19,12 @@ namespace tree_sitter {
}
rule_ptr make_repeat_helper(string name, const rule_ptr &rule) {
return seq({
rule,
choice({ aux_sym(name), blank() })
return choice({
seq({
rule,
aux_sym(name),
}),
blank(),
});
}

View file

@ -2,8 +2,8 @@ TODO
====
## correct batch parsing
- allow spaces between symbols by default
- add comments to generated C code giving an example string for each token
- change the meaning of 'repeat' from 1-or-more to 0-or-more
- fix any memory leaks
- add special lexical behavior for indentation-aware languages