Add code for setting parse states’ corresponding lex states

2013-12-28 10:23:40 -08:00 · 2013-12-28 10:23:40 -08:00 · 29a9b4643d
commit 29a9b4643d
parent a5e39d2512
6 changed files with 33 additions and 36 deletions
--- a/include/runtime.h
+++ b/include/runtime.h
@@ -36,6 +36,7 @@ void TSParserError(TSParser *parser);
 TSState TSParserParseState(const TSParser *parser);
 TSState TSParserLexState(const TSParser *parser);
 TSSymbol TSParserLookahead(const TSParser *parser);
+void TSParserSetLexState(const TSParser *parser, TSState state);

 #pragma mark - DSL

@@ -62,6 +63,9 @@ TSSymbol TSParserLookahead(const TSParser *parser);
 #define SHIFT(number) \
    { TSParserShift(&parser, number); break; }

+#define SET_LEX_STATE(state_index) \
+    { TSParserSetLexState(&parser, state_index); }
+
 #define ADVANCE(state_index) \
    { break; }

--- a/spec/fixtures/parsers/arithmetic.c
+++ b/spec/fixtures/parsers/arithmetic.c
@@ -1,5 +1,4 @@
 #include "runtime.h"
-#include <stdlib.h>
 #include <ctype.h>

 typedef enum {
@@ -15,7 +14,6 @@ typedef enum {
    ts_symbol___END__
 } ts_symbol;

-
 static void ts_lex(TSParser *parser) {
    START_LEXER();
    switch (LEX_STATE()) {
@@ -135,6 +133,7 @@ TSTree ts_parse_arithmetic(const char *input) {
    START_PARSER();
    switch (PARSE_STATE()) {
        case 0:
+            SET_LEX_STATE(0);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_left_paren:
                    SHIFT(9);
@@ -152,6 +151,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 1:
+            SET_LEX_STATE(12);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol___END__:
                    ACCEPT_INPUT();
@@ -159,6 +159,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 2:
+            SET_LEX_STATE(13);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_plus:
                    SHIFT(3);
@@ -166,6 +167,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 3:
+            SET_LEX_STATE(15);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_variable:
                    SHIFT(8);
@@ -181,11 +183,13 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 4:
+            SET_LEX_STATE(17);
            switch (LOOKAHEAD_SYM()) {
                default:
                    REDUCE(ts_symbol_expression, 3);
            }
        case 5:
+            SET_LEX_STATE(18);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_times:
                    SHIFT(6);
@@ -193,6 +197,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 6:
+            SET_LEX_STATE(20);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_left_paren:
                    SHIFT(9);
@@ -206,16 +211,19 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 7:
+            SET_LEX_STATE(21);
            switch (LOOKAHEAD_SYM()) {
                default:
                    REDUCE(ts_symbol_term, 3);
            }
        case 8:
+            SET_LEX_STATE(22);
            switch (LOOKAHEAD_SYM()) {
                default:
                    REDUCE(ts_symbol_factor, 1);
            }
        case 9:
+            SET_LEX_STATE(23);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_left_paren:
                    SHIFT(9);
@@ -233,6 +241,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 10:
+            SET_LEX_STATE(24);
            switch (LOOKAHEAD_SYM()) {
                case ts_symbol_right_paren:
                    SHIFT(11);
@@ -240,6 +249,7 @@ TSTree ts_parse_arithmetic(const char *input) {
                    PARSE_ERROR();
            }
        case 11:
+            SET_LEX_STATE(26);
            switch (LOOKAHEAD_SYM()) {
                default:
                    REDUCE(ts_symbol_factor, 3);
--- a/src/compiler/code_gen/c_code.cpp
+++ b/src/compiler/code_gen/c_code.cpp
@@ -116,12 +116,14 @@ namespace tree_sitter {
                }
            }

-            string switch_on_lookahead_sym(const ParseState &parse_state) {
+            string code_for_parse_state(const ParseState &parse_state) {
                string body = "";
                for (auto pair : parse_state.actions)
                    body += _case(symbol_id(pair.first), code_for_parse_actions(pair.second));
                body += _default(code_for_parse_actions(parse_state.default_actions));
-                return _switch("LOOKAHEAD_SYM()", body);
+                return
+                    string("SET_LEX_STATE(") + to_string(parse_state.lex_state_index) + ");\n" +
+                    _switch("LOOKAHEAD_SYM()", body);
            }

            string switch_on_lookahead_char(const LexState &parse_state) {
@@ -135,7 +137,7 @@ namespace tree_sitter {
            string switch_on_parse_state() {
                string body = "";
                for (int i = 0; i < parse_table.states.size(); i++)
-                    body += _case(std::to_string(i), switch_on_lookahead_sym(parse_table.states[i]));
+                    body += _case(std::to_string(i), code_for_parse_state(parse_table.states[i]));
                body += _default("PARSE_ERROR();");
                return _switch("PARSE_STATE()", body);
            }
@@ -154,13 +156,12 @@ namespace tree_sitter {
                    result += indent(symbol_id(rule_name)) + ",\n";
                result += indent(symbol_id(ParseTable::END_OF_INPUT));
                return result + "\n"
-                "} ts_symbol;\n";
+                "} ts_symbol;";
            }
            
            string includes() {
                return string(
                    "#include \"runtime.h\"\n"
-                    "#include <stdlib.h>\n"
                    "#include <ctype.h>");
            }
            
--- a/src/compiler/lr/item_set.cpp
+++ b/src/compiler/lr/item_set.cpp
@@ -61,6 +61,7 @@ namespace tree_sitter {
        }
        
 #pragma mark - container
+        
        ItemSet::const_iterator ItemSet::begin() const {
            return contents.begin();
        }
@@ -74,6 +75,7 @@ namespace tree_sitter {
        }
        
 #pragma mark - printing
+        
        ostream& operator<<(ostream &stream, const ItemSet &item_set) {
            stream << string("(item_set ");
            for (Item item : item_set) {
--- a/src/runtime/parser.c
+++ b/src/runtime/parser.c
@@ -43,3 +43,7 @@ TSState TSParserParseState(const TSParser *parser) {
 TSState TSParserLexState(const TSParser *parser) {
    return 5;
 }
+
+void TSParserSetLexState(const TSParser *parser, TSState lex_state) {
+    
+}
--- a/todo.md
+++ b/todo.md
@@ -1,36 +1,12 @@
 TODO
 ====

-# complete the list of rule types
-
- add repeat rules
- parse regex rules into trees of choices, sequences, repeats
-
-# generate lexers for sets of terminal rules (can be mix of throwaway and meaningful)
-
-Introduce ParseTable type which contains a vector of ParseStates. A ParseState contains a
-TransitionMap of ParseActions. For a lexer, a ParseAction can be one of:
-    - Accept(symbol)
-    - Advance(state index)
-
-Then generate a C function for a ParseTable
-
-# generate parsers from sets of non-termina rules
-
-For a Parser, the ParseActions can be any of:
-    - Accept(symbol)
-    - Shift(state_index)
-    - Reduce(symbol, number of child symbols)
-    
 # normalize grammars

- add concept of throwaway-terminals (tokens that won't appear in constructed AST)
- classify rules as non-terminals or terminals
- extract strings and regexes from non-terminal rules into their own throwaway-terminals,
-    in order to separate lexing from parsing
+- separate rules into non-terminals and terminals
+- extract strings and regexes from non-terminal rules into their own terminals,
+  in order to separate lexing from parsing

-After this, a grammar will have these fields:
- non-terminal rules
- terminal rules
- throwaway terminal rules
+# refine

+- add concept of throwaway terminals (tokens that won't appear in constructed AST)