diff --git a/include/runtime.h b/include/runtime.h index eccf8b17..5ef700fe 100644 --- a/include/runtime.h +++ b/include/runtime.h @@ -36,6 +36,7 @@ void TSParserError(TSParser *parser); TSState TSParserParseState(const TSParser *parser); TSState TSParserLexState(const TSParser *parser); TSSymbol TSParserLookahead(const TSParser *parser); +void TSParserSetLexState(const TSParser *parser, TSState state); #pragma mark - DSL @@ -62,6 +63,9 @@ TSSymbol TSParserLookahead(const TSParser *parser); #define SHIFT(number) \ { TSParserShift(&parser, number); break; } +#define SET_LEX_STATE(state_index) \ + { TSParserSetLexState(&parser, state_index); } + #define ADVANCE(state_index) \ { break; } diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index 28cc68ad..7316c18e 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -1,5 +1,4 @@ #include "runtime.h" -#include #include typedef enum { @@ -15,7 +14,6 @@ typedef enum { ts_symbol___END__ } ts_symbol; - static void ts_lex(TSParser *parser) { START_LEXER(); switch (LEX_STATE()) { @@ -135,6 +133,7 @@ TSTree ts_parse_arithmetic(const char *input) { START_PARSER(); switch (PARSE_STATE()) { case 0: + SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { case ts_symbol_left_paren: SHIFT(9); @@ -152,6 +151,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 1: + SET_LEX_STATE(12); switch (LOOKAHEAD_SYM()) { case ts_symbol___END__: ACCEPT_INPUT(); @@ -159,6 +159,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 2: + SET_LEX_STATE(13); switch (LOOKAHEAD_SYM()) { case ts_symbol_plus: SHIFT(3); @@ -166,6 +167,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 3: + SET_LEX_STATE(15); switch (LOOKAHEAD_SYM()) { case ts_symbol_variable: SHIFT(8); @@ -181,11 +183,13 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 4: + SET_LEX_STATE(17); switch (LOOKAHEAD_SYM()) { default: REDUCE(ts_symbol_expression, 3); } case 5: + SET_LEX_STATE(18); switch (LOOKAHEAD_SYM()) { case ts_symbol_times: SHIFT(6); @@ -193,6 +197,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 6: + SET_LEX_STATE(20); switch (LOOKAHEAD_SYM()) { case ts_symbol_left_paren: SHIFT(9); @@ -206,16 +211,19 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 7: + SET_LEX_STATE(21); switch (LOOKAHEAD_SYM()) { default: REDUCE(ts_symbol_term, 3); } case 8: + SET_LEX_STATE(22); switch (LOOKAHEAD_SYM()) { default: REDUCE(ts_symbol_factor, 1); } case 9: + SET_LEX_STATE(23); switch (LOOKAHEAD_SYM()) { case ts_symbol_left_paren: SHIFT(9); @@ -233,6 +241,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 10: + SET_LEX_STATE(24); switch (LOOKAHEAD_SYM()) { case ts_symbol_right_paren: SHIFT(11); @@ -240,6 +249,7 @@ TSTree ts_parse_arithmetic(const char *input) { PARSE_ERROR(); } case 11: + SET_LEX_STATE(26); switch (LOOKAHEAD_SYM()) { default: REDUCE(ts_symbol_factor, 3); diff --git a/src/compiler/code_gen/c_code.cpp b/src/compiler/code_gen/c_code.cpp index 6664eacc..29dd9034 100644 --- a/src/compiler/code_gen/c_code.cpp +++ b/src/compiler/code_gen/c_code.cpp @@ -116,12 +116,14 @@ namespace tree_sitter { } } - string switch_on_lookahead_sym(const ParseState &parse_state) { + string code_for_parse_state(const ParseState &parse_state) { string body = ""; for (auto pair : parse_state.actions) body += _case(symbol_id(pair.first), code_for_parse_actions(pair.second)); body += _default(code_for_parse_actions(parse_state.default_actions)); - return _switch("LOOKAHEAD_SYM()", body); + return + string("SET_LEX_STATE(") + to_string(parse_state.lex_state_index) + ");\n" + + _switch("LOOKAHEAD_SYM()", body); } string switch_on_lookahead_char(const LexState &parse_state) { @@ -135,7 +137,7 @@ namespace tree_sitter { string switch_on_parse_state() { string body = ""; for (int i = 0; i < parse_table.states.size(); i++) - body += _case(std::to_string(i), switch_on_lookahead_sym(parse_table.states[i])); + body += _case(std::to_string(i), code_for_parse_state(parse_table.states[i])); body += _default("PARSE_ERROR();"); return _switch("PARSE_STATE()", body); } @@ -154,13 +156,12 @@ namespace tree_sitter { result += indent(symbol_id(rule_name)) + ",\n"; result += indent(symbol_id(ParseTable::END_OF_INPUT)); return result + "\n" - "} ts_symbol;\n"; + "} ts_symbol;"; } string includes() { return string( "#include \"runtime.h\"\n" - "#include \n" "#include "); } diff --git a/src/compiler/lr/item_set.cpp b/src/compiler/lr/item_set.cpp index fef8c66c..987cf4e5 100644 --- a/src/compiler/lr/item_set.cpp +++ b/src/compiler/lr/item_set.cpp @@ -61,6 +61,7 @@ namespace tree_sitter { } #pragma mark - container + ItemSet::const_iterator ItemSet::begin() const { return contents.begin(); } @@ -74,6 +75,7 @@ namespace tree_sitter { } #pragma mark - printing + ostream& operator<<(ostream &stream, const ItemSet &item_set) { stream << string("(item_set "); for (Item item : item_set) { diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 42991bb2..87adda57 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -43,3 +43,7 @@ TSState TSParserParseState(const TSParser *parser) { TSState TSParserLexState(const TSParser *parser) { return 5; } + +void TSParserSetLexState(const TSParser *parser, TSState lex_state) { + +} diff --git a/todo.md b/todo.md index 7ca5a701..3caf1505 100644 --- a/todo.md +++ b/todo.md @@ -1,36 +1,12 @@ TODO ==== -# complete the list of rule types - -- add repeat rules -- parse regex rules into trees of choices, sequences, repeats - -# generate lexers for sets of terminal rules (can be mix of throwaway and meaningful) - -Introduce ParseTable type which contains a vector of ParseStates. A ParseState contains a -TransitionMap of ParseActions. For a lexer, a ParseAction can be one of: - - Accept(symbol) - - Advance(state index) - -Then generate a C function for a ParseTable - -# generate parsers from sets of non-termina rules - -For a Parser, the ParseActions can be any of: - - Accept(symbol) - - Shift(state_index) - - Reduce(symbol, number of child symbols) - # normalize grammars -- add concept of throwaway-terminals (tokens that won't appear in constructed AST) -- classify rules as non-terminals or terminals -- extract strings and regexes from non-terminal rules into their own throwaway-terminals, - in order to separate lexing from parsing +- separate rules into non-terminals and terminals +- extract strings and regexes from non-terminal rules into their own terminals, + in order to separate lexing from parsing -After this, a grammar will have these fields: -- non-terminal rules -- terminal rules -- throwaway terminal rules +# refine +- add concept of throwaway terminals (tokens that won't appear in constructed AST)