Start building AST in parser

This commit is contained in:
Max Brunsfeld 2014-01-07 21:50:32 -08:00
parent 5813816179
commit 614e497ac4
12 changed files with 262 additions and 50 deletions

View file

@ -1,21 +1,22 @@
#ifndef __tree_sitter_document_h__
#define __tree_sitter_document_h__
#include "./tree.h"
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
typedef struct TSDocument TSDocument;
typedef TSTree * TSDocumentParseFn(const char *);
typedef size_t TSSymbol;
typedef struct {
} TSTree;
TSTree TSTreeMake();
TSDocument * TSDocumentMake();
void TSDocumentSetUp(TSDocument *document, TSDocumentParseFn fn, const char **symbol_names);
void TSDocumentSetText(TSDocument *document, const char *text);
TSTree * TSDocumentTree(const TSDocument *document);
char * TSDocumentToString(const TSDocument *document);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,7 +1,7 @@
#ifndef __tree_sitter_parser_h__
#define __tree_sitter_parser_h__
#include "document.h"
#include "tree.h"
#include <stdio.h>
#ifdef __cplusplus
@ -9,16 +9,12 @@ extern "C" {
#endif
typedef int TSState;
typedef struct {
TSState state;
TSSymbol symbol;
} TSStackEntry;
typedef struct {
TSTree tree;
typedef struct TSStackEntry TSStackEntry;
typedef struct TSParser {
TSTree *tree;
const char *input;
size_t position;
TSSymbol lookahead_sym;
TSTree *lookahead_node;
TSState lex_state;
TSStackEntry *stack;
size_t stack_size;
@ -28,31 +24,28 @@ TSParser TSParserMake(const char *input);
void TSParserShift(TSParser *parser, TSState state);
void TSParserReduce(TSParser *parser, TSSymbol symbol, int child_count);
void TSParserError(TSParser *parser);
void TSParserAcceptInput(TSParser *parser);
void TSParserAdvance(TSParser *parser, TSState lex_state);
TSState TSParserParseState(const TSParser *parser);
TSState TSParserLexState(const TSParser *parser);
void TSParserSetLexState(TSParser *parser, TSState state);
char TSParserLookaheadChar(const TSParser *parser);
TSSymbol TSParserLookaheadSym(const TSParser *parser);
long TSParserLookaheadSym(const TSParser *parser);
void TSParserSetLookaheadSym(TSParser *parser, TSSymbol symbol);
#pragma mark - DSL
#define START_PARSER() \
TSParser p = TSParserMake(input), *parser = &p; \
next_state: \
printf("parse state: %d, lookahead: %s\n", PARSE_STATE(), LOOKAHEAD_SYM_NAME());
next_state:
#define LOOKAHEAD_SYM_NAME() \
ts_symbol_names[LOOKAHEAD_SYM()]
#define START_LEXER() \
if (parser->lookahead_sym > 0) return; \
next_state: \
printf("lex state: %d, lookahead: %c\n", LEX_STATE(), LOOKAHEAD_CHAR()); \
if (LOOKAHEAD_CHAR() == '\0') { \
ACCEPT_TOKEN(ts_symbol___END__); \
} \
if (LOOKAHEAD_SYM() > 0) return; \
if (LOOKAHEAD_CHAR() == '\0') { ACCEPT_TOKEN(ts_symbol___END__); } \
next_state:
#define LOOKAHEAD_SYM() \
TSParserLookaheadSym(parser)
@ -76,7 +69,7 @@ TSParserLexState(parser)
{ TSParserAdvance(parser, state_index); goto next_state; }
#define ACCEPT_INPUT() \
{ goto done; }
{ TSParserAcceptInput(parser); goto done; }
#define ACCEPT_TOKEN(symbol) \
{ TSParserSetLookaheadSym(parser, symbol); goto done; }

28
include/tree.h Normal file
View file

@ -0,0 +1,28 @@
#ifndef __tree_sitter_tree_h__
#define __tree_sitter_tree_h__
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef size_t TSSymbol;
/*
 * A node of the syntax tree.
 * Nodes are shared by reference counting: TSTreeRetain increments
 * `ref_count` and TSTreeRelease decrements it, freeing the node once the
 * count reaches zero.
 */
typedef struct TSTree {
/* Symbol stored in this node; used as an index into the symbol-name table
 * when the tree is serialized. */
TSSymbol value;
/* Array of `child_count` pointers to child nodes. */
struct TSTree **children;
/* Number of entries in `children`. */
size_t child_count;
/* Reference count adjusted by TSTreeRetain / TSTreeRelease. */
size_t ref_count;
} TSTree;
/*
 * Allocates a node holding `value` with the given children.
 * The `children` pointer is stored as given (the array is not copied) and
 * every child is retained.
 */
TSTree * TSTreeMake(TSSymbol value, size_t child_count, TSTree **children);
/* Increments the reference count of `tree`. */
void TSTreeRetain(TSTree *tree);
/*
 * Decrements the reference count of `tree`. When it reaches zero, every child
 * is released and the node itself is freed.
 */
void TSTreeRelease(TSTree *tree);
/*
 * Returns 1 when both trees have the same value, the same number of children
 * and pairwise-equal children (compared recursively); returns 0 otherwise.
 */
int TSTreeEquals(const TSTree *tree1, const TSTree *tree2);
/*
 * Serializes `tree` as an S-expression such as "(dog (cat))", looking up each
 * node's name in `names` by its symbol value.
 * The returned string is heap-allocated.
 */
char * TSTreeToString(const TSTree *tree, const char **names);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -10,22 +10,22 @@ namespace test_grammars {
{ "expression", choice({
seq({
sym("term"),
character('+'),
str("+"),
sym("term") }),
sym("term") }) },
{ "term", choice({
seq({
sym("factor"),
character('*'),
str("*"),
sym("factor") }),
sym("factor") }) },
{ "factor", choice({
sym("variable"),
sym("number"),
seq({
character('('),
str("("),
sym("expression"),
character(')') }) }) },
str(")") }) }) },
{ "number", pattern("\\d+") },
{ "variable", pattern("\\w+") },
});

View file

@ -1,4 +1,5 @@
#include "parser.h"
#include "document.h"
#include <ctype.h>
enum ts_symbol {
@ -74,7 +75,7 @@ static void ts_lex(TSParser *parser) {
FINISH_LEXER();
}
TSTree ts_parse_arithmetic(const char *input) {
static TSTree * ts_parse(const char *input) {
START_PARSER();
switch (PARSE_STATE()) {
case 0:
@ -204,3 +205,7 @@ TSTree ts_parse_arithmetic(const char *input) {
}
FINISH_PARSER();
}
/* Configures `document` with this file's parse function (ts_parse) and its
 * symbol-name table (ts_symbol_names). */
void TSDocumentSetUp_arithmetic(TSDocument *document) {
TSDocumentSetUp(document, ts_parse, ts_symbol_names);
}

View file

@ -5,7 +5,10 @@ START_TEST
describe("arithmetic", []() {
it("parses_numbers", [&]() {
ts_parse_arithmetic("5");
TSDocument *document = TSDocumentMake();
TSDocumentSetUp_arithmetic(document);
TSDocumentSetText(document, "5");
printf("%s", TSDocumentToString(document));
});
});

View file

@ -0,0 +1,59 @@
#include "spec_helper.h"
#include "tree.h"
START_TEST
enum { cat, dog, pig };
static const char *names[] = { "cat", "dog", "pig" };
describe("trees", []() {
    // Fixture shared by every example: a leaf node and a parent wrapping it.
    TSTree *tree1, *parent1;

    before_each([&]() {
        tree1 = TSTreeMake(cat, 0, NULL);
        parent1 = TSTreeMake(dog, 1, &tree1);
    });

    after_each([&]() {
        TSTreeRelease(tree1);
        TSTreeRelease(parent1);
    });

    describe("equality", [&]() {
        it("returns true for identical trees", [&]() {
            TSTree *tree2 = TSTreeMake(cat, 0, NULL);
            AssertThat(TSTreeEquals(tree1, tree2), Equals(1));

            TSTree *parent2 = TSTreeMake(dog, 1, &tree2);
            AssertThat(TSTreeEquals(parent1, parent2), Equals(1));

            TSTreeRelease(tree2);
            TSTreeRelease(parent2);
        });

        it("returns false for different trees", [&]() {
            // Same shape as tree1 but a different symbol.
            TSTree *different_tree = TSTreeMake(pig, 0, NULL);
            AssertThat(TSTreeEquals(tree1, different_tree), Equals(0));

            // Same symbol as parent1 but a different child.
            TSTree *different_parent = TSTreeMake(dog, 1, &different_tree);
            AssertThat(TSTreeEquals(parent1, different_parent), Equals(0));

            // Same symbol as tree1 but a different number of children;
            // checked in both argument orders.
            TSTree *parent_with_same_type = TSTreeMake(cat, 1, &different_parent);
            AssertThat(TSTreeEquals(parent_with_same_type, tree1), Equals(0));
            AssertThat(TSTreeEquals(tree1, parent_with_same_type), Equals(0));

            TSTreeRelease(different_tree);
            TSTreeRelease(different_parent);
            TSTreeRelease(parent_with_same_type);
        });
    });

    describe("serialization", [&]() {
        it("returns a readable string", [&]() {
            // TSTreeToString hands back a heap-allocated buffer. Copy it into
            // a std::string and free it before asserting, so the buffer is
            // not leaked even when the assertion fails.
            char *leaf_buffer = TSTreeToString(tree1, names);
            string leaf_text(leaf_buffer);
            free(leaf_buffer);
            AssertThat(leaf_text, Equals("(cat)"));

            char *parent_buffer = TSTreeToString(parent1, names);
            string parent_text(parent_buffer);
            free(parent_buffer);
            AssertThat(parent_text, Equals("(dog (cat))"));
        });
    });
});
END_TEST

View file

@ -192,13 +192,14 @@ namespace tree_sitter {
// Builds the #include lines emitted at the top of the generated parser source.
string includes() {
    const string parser_header = "#include \"parser.h\"";
    const string document_header = "#include \"document.h\"";
    const string ctype_header = "#include <ctype.h>";
    return join({ parser_header, document_header, ctype_header });
}
string parse_function() {
return join({
"TSTree ts_parse_arithmetic(const char *input) {",
"static TSTree * ts_parse(const char *input) {",
indent("START_PARSER();"),
indent(switch_on_parse_state()),
indent("FINISH_PARSER();"),
@ -216,13 +217,22 @@ namespace tree_sitter {
});
}
// Builds the generated C function that wires a TSDocument up to the
// generated parse function and symbol-name table.
string setup_function() {
    const string signature = "void TSDocumentSetUp_arithmetic(TSDocument *document) {";
    const string body = indent("TSDocumentSetUp(document, ts_parse, ts_symbol_names);");
    const string closing = "}";
    return join({ signature, body, closing });
}
string code() {
return join({
includes(),
symbol_enum(),
rule_names_list(),
lex_function(),
parse_function()
parse_function(),
setup_function(),
}, "\n\n") + "\n";
}
};

34
src/runtime/document.c Normal file
View file

@ -0,0 +1,34 @@
#include "document.h"
#include <string.h>
/*
 * A document: a piece of text together with the parse function used on it
 * and the tree that the parse function returned.
 */
struct TSDocument {
/* Parse function installed by TSDocumentSetUp; called by TSDocumentSetText. */
TSDocumentParseFn *parse_fn;
/* Symbol-name table installed by TSDocumentSetUp; used when serializing the tree. */
const char **symbol_names;
/* Text most recently passed to TSDocumentSetText (the pointer is stored, not copied). */
const char *text;
/* Result of the last parse_fn call. */
TSTree *tree;
};
TSDocument * TSDocumentMake() {
TSDocument *result = malloc(sizeof(TSDocument));
return result;
}
/*
 * Installs the parse function and the symbol-name table on `document`.
 * Both pointers are stored as given.
 */
void TSDocumentSetUp(TSDocument *document, TSDocumentParseFn fn, const char **symbol_names) {
    document->symbol_names = symbol_names;
    document->parse_fn = fn;
}
/*
 * Stores `text` on the document (the pointer, not a copy) and parses it with
 * the installed parse function, keeping the returned tree on the document.
 */
void TSDocumentSetText(TSDocument *document, const char *text) {
    TSDocumentParseFn *parse = document->parse_fn;
    document->text = text;
    document->tree = parse(text);
}
/* Returns the tree stored on `document` by the last TSDocumentSetText call. */
TSTree * TSDocumentTree(const TSDocument *document) {
    TSTree *current = document->tree;
    return current;
}
/*
 * Serializes the document's tree using its symbol-name table.
 *
 * The result is always a heap-allocated string that the caller owns and must
 * free. When the document has no tree the string is "#<null tree>".
 * (Previously that case returned a string literal through a mutable `char *`,
 * so callers could not tell whether freeing the result was safe.)
 *
 * Returns NULL if the placeholder cannot be allocated.
 */
char * TSDocumentToString(const TSDocument *document) {
    static const char null_tree_text[] = "#<null tree>";

    if (document->tree)
        return TSTreeToString(document->tree, document->symbol_names);

    /* sizeof includes the terminating NUL. */
    char *copy = malloc(sizeof null_tree_text);
    if (copy)
        memcpy(copy, null_tree_text, sizeof null_tree_text);
    return copy;
}

View file

@ -3,12 +3,17 @@
static int INITIAL_STACK_SIZE = 100;
/* One entry of the parse stack. */
struct TSStackEntry {
/* Parse state recorded when the entry was pushed by TSParserShift. */
TSState state;
/* Tree node that was the parser's lookahead node when the entry was pushed. */
TSTree *node;
};
TSParser TSParserMake(const char *input) {
TSParser result = {
.tree = TSTreeMake(),
.tree = NULL,
.input = input,
.position = 0,
.lookahead_sym = 0,
.lookahead_node = NULL,
.lex_state = 0,
.stack = calloc(INITIAL_STACK_SIZE, sizeof(TSStackEntry)),
.stack_size = 0,
@ -19,14 +24,21 @@ TSParser TSParserMake(const char *input) {
void TSParserShift(TSParser *parser, TSState parse_state) {
TSStackEntry *entry = (parser->stack + parser->stack_size);
entry->state = parse_state;
entry->symbol = parser->lookahead_sym;
parser->lookahead_sym = -1;
entry->node = parser->lookahead_node;
parser->lookahead_node = NULL;
parser->stack_size++;
}
void TSParserReduce(TSParser *parser, TSSymbol symbol, int child_count) {
parser->lookahead_sym = symbol;
parser->stack_size -= child_count;
TSTree **children = malloc(child_count * sizeof(TSTree *));
for (int i = 0; i < child_count; i++) {
size_t j = parser->stack_size + i;
children[i] = parser->stack[j].node;
}
parser->lookahead_node = TSTreeMake(symbol, child_count, children);
}
void TSParserError(TSParser *parser) {
@ -42,12 +54,13 @@ char TSParserLookaheadChar(const TSParser *parser) {
return parser->input[parser->position];
}
TSSymbol TSParserLookaheadSym(const TSParser *parser) {
return parser->lookahead_sym;
long TSParserLookaheadSym(const TSParser *parser) {
TSTree *node = parser->lookahead_node;
return node ? node->value : -1;
}
void TSParserSetLookaheadSym(TSParser *parser, TSSymbol symbol) {
parser->lookahead_sym = symbol;
parser->lookahead_node = TSTreeMake(symbol, 0, NULL);
}
TSState TSParserParseState(const TSParser *parser) {
@ -61,3 +74,8 @@ TSState TSParserLexState(const TSParser *parser) {
/* Stores `lex_state` as the parser's current lexer state. */
void TSParserSetLexState(TSParser *parser, TSState lex_state) {
parser->lex_state = lex_state;
}
/*
 * Marks the parse as finished by making the node on top of the parse stack
 * the parser's result tree.
 *
 * If the stack is empty there is no node to accept, so the result tree is set
 * to NULL. Without this guard `stack_size - 1` would wrap around (it is a
 * size_t) and read far outside the stack.
 */
void TSParserAcceptInput(TSParser *parser) {
    if (parser->stack_size == 0) {
        parser->tree = NULL;
        return;
    }
    parser->tree = parser->stack[parser->stack_size - 1].node;
}

View file

@ -1,7 +1,58 @@
#include "document.h"
#include "tree.h"
#include <stdio.h>
#include <string.h>
TSTree TSTreeMake() {
TSTree result = {
};
/*
 * Allocates a tree node.
 *
 * value       - symbol stored in the node.
 * child_count - number of entries in `children`.
 * children    - array of child nodes; the pointer is stored as given (the
 *               array is not copied) and every child is retained.
 *
 * The new node starts with a reference count of 1, owned by the caller, so a
 * single TSTreeRelease balances the creation. (Previously the count was left
 * uninitialized, which made every later retain/release undefined.)
 *
 * Returns NULL if the allocation fails.
 */
TSTree * TSTreeMake(TSSymbol value, size_t child_count, TSTree **children) {
    TSTree *result = malloc(sizeof *result);
    if (!result)
        return NULL;
    result->value = value;
    result->child_count = child_count;
    result->children = children;
    result->ref_count = 1;
    /* size_t index: child_count is a size_t, so avoid a signed/unsigned compare. */
    for (size_t i = 0; i < child_count; i++)
        TSTreeRetain(children[i]);
    return result;
}
}
/* Takes an additional reference on `tree` by bumping its reference count. */
void TSTreeRetain(TSTree *tree) {
    tree->ref_count += 1;
}
/*
 * Drops one reference on `tree`. Once the count reaches zero, each child is
 * released in order and the node itself is freed.
 */
void TSTreeRelease(TSTree *tree) {
    tree->ref_count -= 1;
    if (tree->ref_count != 0)
        return;

    /* Last reference gone: let go of the children, then the node. */
    int index = 0;
    while (index < tree->child_count) {
        TSTreeRelease(tree->children[index]);
        index++;
    }
    free(tree);
}
/*
 * Structural equality: returns 1 when both nodes carry the same value, have
 * the same number of children, and every pair of corresponding children is
 * itself equal; returns 0 otherwise.
 */
int TSTreeEquals(const TSTree *node1, const TSTree *node2) {
    if (node1->value != node2->value || node1->child_count != node2->child_count)
        return 0;

    int index = 0;
    while (index < node1->child_count) {
        if (!TSTreeEquals(node1->children[index], node2->children[index]))
            return 0;
        index++;
    }
    return 1;
}
/*
 * Writes the S-expression for `tree` into `string` and returns a pointer just
 * past the last character written.
 *
 * The output is "(" + name, then " " + child for each child, then ")". No
 * terminating NUL is written after the closing parenthesis, and the buffer is
 * not bounds-checked: the caller must supply enough room.
 */
char * TSTreeWriteToString(const TSTree *tree, const char **symbol_names, char *string) {
    char *cursor = string;

    /* sprintf returns the number of characters written: '(' plus the name. */
    cursor += sprintf(cursor, "(%s", symbol_names[tree->value]);

    for (int i = 0; i < tree->child_count; i++) {
        *cursor++ = ' ';
        cursor = TSTreeWriteToString(tree->children[i], symbol_names, cursor);
    }

    *cursor++ = ')';
    return cursor;
}
char * TSTreeToString(const TSTree *tree, const char **symbol_names) {
char *string = calloc(100, sizeof(char));
TSTreeWriteToString(tree, symbol_names, string);
return string;
}

View file

@ -26,6 +26,8 @@
129D242C183EB1EB00FE9F71 /* table_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 129D242A183EB1EB00FE9F71 /* table_builder.cpp */; };
12D136A4183678A2005F3369 /* repeat.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D136A2183678A2005F3369 /* repeat.cpp */; };
12ED72A7186FC8220089229B /* transitions_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12ED72A6186FC8220089229B /* transitions_spec.cpp */; };
12EDCF8A187B498C005A7A07 /* tree_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF89187B498C005A7A07 /* tree_spec.cpp */; };
12EDCF8D187C6282005A7A07 /* document.c in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8C187C6282005A7A07 /* document.c */; };
12F9A64E182DD5FD00FAF50C /* spec_helper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */; };
12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64F182DD6BC00FAF50C /* grammar.cpp */; };
12FD4061185E68470041A84E /* c_code.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12FD405F185E68470041A84E /* c_code.cpp */; };
@ -134,6 +136,9 @@
12E71794181D02A80051A649 /* compiler_specs */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = compiler_specs; sourceTree = BUILT_PRODUCTS_DIR; };
12E71852181D081C0051A649 /* rules.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = rules.h; path = src/compiler/rules/rules.h; sourceTree = SOURCE_ROOT; };
12ED72A6186FC8220089229B /* transitions_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = transitions_spec.cpp; path = spec/compiler/lr/transitions_spec.cpp; sourceTree = SOURCE_ROOT; };
12EDCF89187B498C005A7A07 /* tree_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tree_spec.cpp; sourceTree = "<group>"; };
12EDCF8B187C6251005A7A07 /* document.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = document.h; sourceTree = "<group>"; };
12EDCF8C187C6282005A7A07 /* document.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = document.c; sourceTree = "<group>"; };
12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = spec_helper.cpp; path = spec/compiler/spec_helper.cpp; sourceTree = SOURCE_ROOT; };
12F9A64D182DD5FD00FAF50C /* spec_helper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = spec_helper.h; path = spec/compiler/spec_helper.h; sourceTree = SOURCE_ROOT; };
12F9A64F182DD6BC00FAF50C /* grammar.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = grammar.cpp; path = src/compiler/grammar/grammar.cpp; sourceTree = SOURCE_ROOT; };
@ -143,7 +148,7 @@
12FD4063185E75290041A84E /* generate_parsers.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = generate_parsers.cpp; path = spec/compiler/generate_parsers.cpp; sourceTree = SOURCE_ROOT; };
12FD4065185E7C2F0041A84E /* arithmetic.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = arithmetic.c; path = spec/fixtures/parsers/arithmetic.c; sourceTree = SOURCE_ROOT; };
12FD40D1185EEB5E0041A84E /* runtime_specs */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = runtime_specs; sourceTree = BUILT_PRODUCTS_DIR; };
12FD40D4185FED9A0041A84E /* document.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = document.h; sourceTree = "<group>"; };
12FD40D4185FED9A0041A84E /* tree.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = "<group>"; };
12FD40DA185FEF0D0041A84E /* arithmetic_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = arithmetic_spec.cpp; sourceTree = "<group>"; };
12FD40DC185FF12C0041A84E /* parser.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = parser.c; sourceTree = "<group>"; };
12FD40DE1860064C0041A84E /* tree.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = tree.c; sourceTree = "<group>"; };
@ -354,6 +359,7 @@
children = (
12FD40DC185FF12C0041A84E /* parser.c */,
12FD40DE1860064C0041A84E /* tree.c */,
12EDCF8C187C6282005A7A07 /* document.c */,
);
path = runtime;
sourceTree = "<group>";
@ -384,6 +390,7 @@
isa = PBXGroup;
children = (
12FD40DA185FEF0D0041A84E /* arithmetic_spec.cpp */,
12EDCF89187B498C005A7A07 /* tree_spec.cpp */,
);
path = runtime;
sourceTree = "<group>";
@ -391,8 +398,9 @@
12FD40D3185FED630041A84E /* include */ = {
isa = PBXGroup;
children = (
12FD40D4185FED9A0041A84E /* document.h */,
12FD40D4185FED9A0041A84E /* tree.h */,
121D8B3018795CC0003CF44B /* parser.h */,
12EDCF8B187C6251005A7A07 /* document.h */,
);
path = include;
sourceTree = "<group>";
@ -520,6 +528,8 @@
12FD40B6185EEB5E0041A84E /* arithmetic.cpp in Sources */,
12FD40DD185FF12C0041A84E /* parser.c in Sources */,
12FD40B8185EEB5E0041A84E /* item.cpp in Sources */,
12EDCF8A187B498C005A7A07 /* tree_spec.cpp in Sources */,
12EDCF8D187C6282005A7A07 /* document.c in Sources */,
12FD40B9185EEB5E0041A84E /* string.cpp in Sources */,
12FD40EF186641510041A84E /* transitions.cpp in Sources */,
12FD40BB185EEB5E0041A84E /* blank.cpp in Sources */,