In parser, read chunked input correctly

2014-03-10 13:25:31 -07:00 · 2014-03-10 13:25:31 -07:00 · 42e9a264f3
commit 42e9a264f3
parent 3aaa08b948
6 changed files with 134 additions and 18 deletions
--- a/include/tree_sitter/parser.h
+++ b/include/tree_sitter/parser.h
@ -38,11 +38,15 @@ typedef struct {

 typedef struct {
    ts_input input;
-    const char *current_chunk;
-    size_t current_chunk_end;
-    size_t position;
+
+    const char *chunk;
+    size_t chunk_start;
+    size_t chunk_size;
+    size_t position_in_chunk;
+    
    size_t token_end_position;
    size_t token_start_position;
+
    ts_tree *lookahead_node;
    ts_tree *prev_lookahead_node;
    ts_state lex_state;
@ -60,9 +64,12 @@ static ts_parser ts_parser_make(ts_input input) {
        .input = input,
        .token_start_position = 0,
        .token_end_position = 0,
-        .position = 0,
-        .current_chunk = chunk,
-        .current_chunk_end = bytes_read,
+
+        .chunk = chunk,
+        .chunk_size = bytes_read,
+        .chunk_start = 0,
+        .position_in_chunk = 0,
+
        .lookahead_node = NULL,
        .prev_lookahead_node = NULL,
        .lex_state = 0,
@ -71,9 +78,13 @@ static ts_parser ts_parser_make(ts_input input) {
    };
    return result;
 }
+    
+// Absolute cursor offset: the start offset of the current chunk plus the
+// cursor's offset inside that chunk.
+static size_t ts_parser_position(const ts_parser *parser) {
+    const size_t offset_in_chunk = parser->position_in_chunk;
+    return parser->chunk_start + offset_in_chunk;
+}

 static char ts_parser_lookahead_char(const ts_parser *parser) {
-    return parser->current_chunk[parser->position];
+    return parser->chunk[parser->position_in_chunk];  // index is relative to the current chunk, not to the whole input
 }

 static ts_symbol ts_parser_lookahead_sym(const ts_parser *parser) {
@ -143,13 +154,19 @@ static void ts_parser_reduce(ts_parser *parser, ts_symbol symbol, int immediate_
    DEBUG_PARSE("reduce: %s, state: %u \n", ts_symbol_names[symbol], ts_parser_parse_state(parser));
 }

+// Stand-in chunk installed when the reader reports zero bytes, so that the
+// lookahead character is read as '\0' from valid storage.
+static const char empty_chunk[1] = { '\0' };
+
 static void ts_parser_advance(ts_parser *parser) {
-    if (parser->position < parser->current_chunk_end) {
-        parser->position++;
+    // Moves the cursor forward one character, pulling the next chunk from the
+    // input's read_fn once the current chunk has been used up.
+    //
+    // The bound is written `pos + 1 < size` rather than `pos < size - 1`:
+    // chunk_size is unsigned, so a zero-length chunk would wrap `size - 1`
+    // around to SIZE_MAX and let the cursor walk past the end of the buffer.
+    if (parser->position_in_chunk + 1 < parser->chunk_size) {
+        parser->position_in_chunk++;
     } else {
-        size_t bytes_read = 0;
-        parser->current_chunk = parser->input.read_fn(parser->input.data, &bytes_read);
-        parser->current_chunk_end += bytes_read;
+        // Account for the chunk being left behind before replacing it.
+        parser->chunk_start += parser->chunk_size;
+        parser->chunk = parser->input.read_fn(parser->input.data, &parser->chunk_size);
+        if (parser->chunk_size == 0) {
+            // Nothing was read: point at the one-byte sentinel so the
+            // lookahead character is '\0'.
+            parser->chunk = empty_chunk;
+            parser->chunk_size = 1;
+        }
+        parser->position_in_chunk = 0;
     }
 }

@ -161,10 +178,11 @@ static void ts_parser_advance_to_state(ts_parser *parser, ts_state lex_state) {

 static void ts_parser_set_lookahead_sym(ts_parser *parser, ts_symbol symbol) {
    DEBUG_LEX("token: %s \n", ts_symbol_names[symbol]);
-    size_t size = parser->position - parser->token_start_position;
+    size_t position = ts_parser_position(parser);  // absolute offset (chunk_start + position_in_chunk), comparable with the token_*_position fields
+    size_t size = position - parser->token_start_position;  // length of the token that ends at the cursor
    size_t offset = parser->token_start_position - parser->token_end_position;
    parser->lookahead_node = ts_tree_make_leaf(symbol, size, offset);
-    parser->token_end_position = parser->position;
+    parser->token_end_position = position;  // the next token's offset is measured from here
 }

 static ts_tree * ts_parser_tree(ts_parser *parser) {
@ -175,7 +193,7 @@ static ts_tree * ts_parser_tree(ts_parser *parser) {
 static void ts_parser_skip_whitespace(ts_parser *parser) {
-    while (isspace(ts_parser_lookahead_char(parser)))
+    // Consumes whitespace at the cursor, then records the absolute offset at
+    // which the next token starts.
+    //
+    // isspace() requires a value representable as unsigned char (or EOF); a
+    // plain char holding a byte >= 0x80 may be negative, which is undefined
+    // behavior, so the lookahead is converted first.
+    while (isspace((unsigned char)ts_parser_lookahead_char(parser)))
         ts_parser_advance(parser);
-    parser->token_start_position = parser->position;
+    parser->token_start_position = ts_parser_position(parser);
 }

 static int ts_parser_handle_error(ts_parser *parser, size_t count, const ts_symbol *expected_symbols) {
--- a/spec/runtime/helpers/spy_reader.cc
+++ b/spec/runtime/helpers/spy_reader.cc
@ -0,0 +1,37 @@
+#include "helpers/spy_reader.h"
+#include <algorithm>
+
+using std::string;
+
+// read_fn for SpyReader: hands back the next slice of `content`, at most
+// `chunk_size` bytes long, and appends a copy of it to `chunks_read`.
+// The slice length is written to *bytes_read; 0 means nothing is left.
+static const char * spy_read(void *data, size_t *bytes_read) {
+    SpyReader *reader = static_cast<SpyReader *>(data);
+    // spy_seek stores any position unchecked, so clamp before subtracting:
+    // `length - position` is unsigned and would wrap if position > length,
+    // producing a slice that reaches outside the string.
+    const size_t start = std::min(reader->position, reader->content.length());
+    const size_t size = std::min(reader->chunk_size,
+                                 reader->content.length() - start);
+    const char *result = reader->content.data() + start;
+    reader->chunks_read.push_back(string(result, size));
+    reader->position += size;
+    *bytes_read = size;
+    return result;
+}
+
+// seek_fn for SpyReader: makes the next read start at `position`.
+// The offset is stored as given and 0 is always returned.
+static int spy_seek(void *data, size_t position) {
+    static_cast<SpyReader *>(data)->position = position;
+    return 0;
+}
+
+// release_fn for SpyReader: destroys the reader behind `data` with delete.
+// NOTE(review): delete is only valid for a reader allocated with new —
+// confirm whether anything invokes release_fn on a stack-allocated reader.
+static void spy_release(void *data) {
+    delete static_cast<SpyReader *>(data);
+}
+
+SpyReader::SpyReader(string content, size_t chunk_size) :  // chunk_size caps the length of each slice spy_read returns
+    content(content),
+    position(0),  // reading starts at the beginning of content
+    chunk_size(chunk_size),
+    input({
+        .read_fn = spy_read,
+        .seek_fn = spy_seek,
+        .release_fn = spy_release,
+        .data = this  // the spy_* callbacks cast this pointer back to SpyReader *
+    }) {}
--- a/spec/runtime/helpers/spy_reader.h
+++ b/spec/runtime/helpers/spy_reader.h
@ -0,0 +1,19 @@
+#ifndef HELPERS_SPY_READER_H_
+#define HELPERS_SPY_READER_H_
+
+#include <string>
+#include <vector>
+#include "tree_sitter/runtime.h"
+
+class SpyReader {  // spec helper: serves a string in bounded-size chunks and logs every chunk it hands out
+public:
+    SpyReader(std::string content, size_t chunk_size);
+
+    std::string content;  // full text to serve
+    size_t position;  // offset in content where the next read starts
+    size_t chunk_size;  // upper bound on the size of each read
+    ts_input input;  // callback table whose data pointer refers back to this object
+    std::vector<std::string> chunks_read;  // one entry per read call, in call order
+};
+
+#endif  // HELPERS_SPY_READER_H_
--- a/spec/runtime/parser_spec.cc
+++ b/spec/runtime/parser_spec.cc
@ -0,0 +1,36 @@
+#include "runtime_spec_helper.h"
+#include "helpers/spy_reader.h"
+
+extern ts_parse_config ts_parse_config_json;
+
+START_TEST
+
+describe("reading from an input", [&]() {
+    ts_document *doc;
+    
+    before_each([&]() {
+        doc = ts_document_make();
+        ts_document_set_parser(doc, ts_parse_config_json);  // every example parses with the JSON parse config
+    });
+    
+    after_each([&]() {
+        ts_document_free(doc);
+    });
+
+    it("reads the entire input", [&]() {
+        SpyReader reader("\"ok go do it!\"", 3);  // 14-byte JSON string literal, served at most 3 bytes per read
+        ts_document_set_input(doc, reader.input);
+        
+        AssertThat(string(ts_document_string(doc)), Equals("(value (string))"));
+        AssertThat(reader.chunks_read, Equals(vector<string>({  // the exact sequence of chunks the parser pulled
+            "\"ok",
+            " go",
+            " do",
+            " it",
+            "!\"",
+            ""  // one zero-length read after the content is exhausted
+        })));
+    });
+});
+
+END_TEST
--- a/src/compiler/prepared_grammar.h
+++ b/src/compiler/prepared_grammar.h
@ -9,7 +9,7 @@

 namespace tree_sitter {
    class PreparedGrammar : public Grammar {
-    public:
+    public: 
        PreparedGrammar(std::string start_rule_name,
                        const std::map<const std::string, const rules::rule_ptr> &rules,
                        const std::map<const std::string, const rules::rule_ptr> &aux_rules);
--- a/src/runtime/document.c
+++ b/src/runtime/document.c
@ -39,8 +39,14 @@ typedef struct {

 const char * ts_string_input_read(void *d, size_t *bytes_read) {
    ts_string_input_data *data = (ts_string_input_data *)d;
-    *bytes_read = data->length;
-    return data->string + data->position;
+    if (data->position >= data->length) {  // nothing left at or past the current position
+        *bytes_read = 0;
+        return "";
+    }
+    size_t previous_position = data->position;
+    data->position = data->length;  // the whole remainder is handed out as a single chunk
+    *bytes_read = data->position - previous_position;
+    return data->string + previous_position;
 }

 int ts_string_input_seek(void *d, size_t position) {