From c40411b4d1411f2dd8e9e424cef43bcce6235773 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 22 Mar 2014 15:46:58 -0700 Subject: [PATCH] Handle unexpected tokens properly Also, add null and boolean values to json grammar --- examples/grammars/json.hpp | 8 +- examples/parsers/json.c | 157 ++++++++++++++++++++----- include/tree_sitter/parser.h | 4 + spec/runtime/languages/json/errors.txt | 19 ++- spec/runtime/languages/json/main.txt | 10 +- 5 files changed, 158 insertions(+), 40 deletions(-) diff --git a/examples/grammars/json.hpp b/examples/grammars/json.hpp index d126fad3..01767a5e 100644 --- a/examples/grammars/json.hpp +++ b/examples/grammars/json.hpp @@ -23,7 +23,10 @@ namespace test_grammars { sym("object"), sym("array"), sym("string"), - sym("number") }) }, + sym("number"), + sym("true"), + sym("false"), + sym("null"), }) }, { "object", seq({ _sym("left_brace"), comma_sep(err(seq({ @@ -43,6 +46,9 @@ namespace test_grammars { { "right_bracket", str("]") }, { "left_brace", str("{") }, { "right_brace", str("}") }, + { "null", str("null") }, + { "true", str("true") }, + { "false", str("false") }, }); } } diff --git a/examples/parsers/json.c b/examples/parsers/json.c index c6930964..a4e65d02 100644 --- a/examples/parsers/json.c +++ b/examples/parsers/json.c @@ -1,12 +1,15 @@ #include "tree_sitter/parser.h" -#define TS_SYMBOL_COUNT 15 +#define TS_SYMBOL_COUNT 18 enum { ts_sym_array, + ts_sym_false, + ts_sym_null, ts_sym_number, ts_sym_object, ts_sym_string, + ts_sym_true, ts_sym_value, ts_sym_colon, ts_sym_comma, @@ -20,9 +23,12 @@ enum { SYMBOL_NAMES = { "array", + "false", + "null", "number", "object", "string", + "true", "value", "colon", "comma", @@ -72,8 +78,14 @@ LEX_FN() { ADVANCE(15); if (LOOKAHEAD_CHAR() == '[') ADVANCE(16); - if (LOOKAHEAD_CHAR() == '{') + if (LOOKAHEAD_CHAR() == 'f') ADVANCE(17); + if (LOOKAHEAD_CHAR() == 'n') + ADVANCE(22); + if (LOOKAHEAD_CHAR() == 't') + ADVANCE(26); + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(30); LEX_ERROR(); case 9: if (!((LOOKAHEAD_CHAR() == '\"') || @@ -134,20 +146,66 @@ LEX_FN() { case 16: ACCEPT_TOKEN(ts_sym_left_bracket); case 17: - ACCEPT_TOKEN(ts_sym_left_brace); + if (LOOKAHEAD_CHAR() == 'a') + ADVANCE(18); + LEX_ERROR(); case 18: - if (LOOKAHEAD_CHAR() == ':') + if (LOOKAHEAD_CHAR() == 'l') ADVANCE(19); LEX_ERROR(); case 19: - ACCEPT_TOKEN(ts_sym_colon); + if (LOOKAHEAD_CHAR() == 's') + ADVANCE(20); + LEX_ERROR(); case 20: + if (LOOKAHEAD_CHAR() == 'e') + ADVANCE(21); + LEX_ERROR(); + case 21: + ACCEPT_TOKEN(ts_sym_false); + case 22: + if (LOOKAHEAD_CHAR() == 'u') + ADVANCE(23); + LEX_ERROR(); + case 23: + if (LOOKAHEAD_CHAR() == 'l') + ADVANCE(24); + LEX_ERROR(); + case 24: + if (LOOKAHEAD_CHAR() == 'l') + ADVANCE(25); + LEX_ERROR(); + case 25: + ACCEPT_TOKEN(ts_sym_null); + case 26: + if (LOOKAHEAD_CHAR() == 'r') + ADVANCE(27); + LEX_ERROR(); + case 27: + if (LOOKAHEAD_CHAR() == 'u') + ADVANCE(28); + LEX_ERROR(); + case 28: + if (LOOKAHEAD_CHAR() == 'e') + ADVANCE(29); + LEX_ERROR(); + case 29: + ACCEPT_TOKEN(ts_sym_true); + case 30: + ACCEPT_TOKEN(ts_sym_left_brace); + case 31: + if (LOOKAHEAD_CHAR() == ':') + ADVANCE(32); + LEX_ERROR(); + case 32: + ACCEPT_TOKEN(ts_sym_colon); + case 33: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(9); if (LOOKAHEAD_CHAR() == '}') ADVANCE(3); LEX_ERROR(); - case 21: + case 34: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(9); if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') @@ -156,42 +214,54 @@ LEX_FN() { ADVANCE(16); if (LOOKAHEAD_CHAR() == ']') ADVANCE(6); - if (LOOKAHEAD_CHAR() == '{') + if (LOOKAHEAD_CHAR() == 'f') ADVANCE(17); + if (LOOKAHEAD_CHAR() == 'n') + ADVANCE(22); + if (LOOKAHEAD_CHAR() == 't') + ADVANCE(26); + if (LOOKAHEAD_CHAR() == '{') + ADVANCE(30); LEX_ERROR(); - case 22: + case 35: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(9); LEX_ERROR(); - case 23: + case 36: ACCEPT_TOKEN(ts_sym_comma); - case 24: + case 37: ACCEPT_TOKEN(ts_sym_colon); - case 25: + case 38: ACCEPT_TOKEN(ts_sym_left_bracket); - case 26: + case 39: ACCEPT_TOKEN(ts_sym_right_bracket); - case 27: + case 40: ACCEPT_TOKEN(ts_sym_left_brace); - case 28: + case 41: ACCEPT_TOKEN(ts_sym_right_brace); case ts_lex_state_error: if (LOOKAHEAD_CHAR() == '\"') ADVANCE(9); if (LOOKAHEAD_CHAR() == ',') - ADVANCE(23); + ADVANCE(36); if ('0' <= LOOKAHEAD_CHAR() && LOOKAHEAD_CHAR() <= '9') ADVANCE(15); if (LOOKAHEAD_CHAR() == ':') - ADVANCE(24); + ADVANCE(37); if (LOOKAHEAD_CHAR() == '[') - ADVANCE(25); + ADVANCE(38); if (LOOKAHEAD_CHAR() == ']') + ADVANCE(39); + if (LOOKAHEAD_CHAR() == 'f') + ADVANCE(17); + if (LOOKAHEAD_CHAR() == 'n') + ADVANCE(22); + if (LOOKAHEAD_CHAR() == 't') ADVANCE(26); if (LOOKAHEAD_CHAR() == '{') - ADVANCE(27); + ADVANCE(40); if (LOOKAHEAD_CHAR() == '}') - ADVANCE(28); + ADVANCE(41); LEX_ERROR(); default: LEX_PANIC(); @@ -204,9 +274,12 @@ PARSE_TABLE() { STATE(0); SET_LEX_STATE(8); SHIFT(ts_sym_array, 1) + SHIFT(ts_sym_false, 1) + SHIFT(ts_sym_null, 1) SHIFT(ts_sym_number, 1) SHIFT(ts_sym_object, 1) SHIFT(ts_sym_string, 1) + SHIFT(ts_sym_true, 1) SHIFT(ts_sym_value, 2) SHIFT(ts_sym_left_brace, 3) SHIFT(ts_sym_left_bracket, 55) @@ -223,23 +296,26 @@ PARSE_TABLE() { END_STATE(); STATE(3); - SET_LEX_STATE(20); + SET_LEX_STATE(33); SHIFT(ts_sym_string, 4) SHIFT(ts_sym_right_brace, 51) SHIFT(ts_builtin_sym_error, 52) END_STATE(); STATE(4); - SET_LEX_STATE(18); + SET_LEX_STATE(31); SHIFT(ts_sym_colon, 5) END_STATE(); STATE(5); SET_LEX_STATE(8); SHIFT(ts_sym_array, 6) + SHIFT(ts_sym_false, 6) + SHIFT(ts_sym_null, 6) SHIFT(ts_sym_number, 6) SHIFT(ts_sym_object, 6) SHIFT(ts_sym_string, 6) + SHIFT(ts_sym_true, 6) SHIFT(ts_sym_value, 7) SHIFT(ts_sym_left_brace, 13) SHIFT(ts_sym_left_bracket, 19) @@ -259,22 +335,25 @@ PARSE_TABLE() { END_STATE(); STATE(8); - SET_LEX_STATE(22); + SET_LEX_STATE(35); SHIFT(ts_sym_string, 9) SHIFT(ts_builtin_sym_error, 47) END_STATE(); STATE(9); - SET_LEX_STATE(18); + SET_LEX_STATE(31); SHIFT(ts_sym_colon, 10) END_STATE(); STATE(10); SET_LEX_STATE(8); SHIFT(ts_sym_array, 6) + SHIFT(ts_sym_false, 6) + SHIFT(ts_sym_null, 6) SHIFT(ts_sym_number, 6) SHIFT(ts_sym_object, 6) SHIFT(ts_sym_string, 6) + SHIFT(ts_sym_true, 6) SHIFT(ts_sym_value, 11) SHIFT(ts_sym_left_brace, 13) SHIFT(ts_sym_left_bracket, 19) @@ -293,23 +372,26 @@ PARSE_TABLE() { END_STATE(); STATE(13); - SET_LEX_STATE(20); + SET_LEX_STATE(33); SHIFT(ts_sym_string, 14) SHIFT(ts_sym_right_brace, 43) SHIFT(ts_builtin_sym_error, 44) END_STATE(); STATE(14); - SET_LEX_STATE(18); + SET_LEX_STATE(31); SHIFT(ts_sym_colon, 15) END_STATE(); STATE(15); SET_LEX_STATE(8); SHIFT(ts_sym_array, 6) + SHIFT(ts_sym_false, 6) + SHIFT(ts_sym_null, 6) SHIFT(ts_sym_number, 6) SHIFT(ts_sym_object, 6) SHIFT(ts_sym_string, 6) + SHIFT(ts_sym_true, 6) SHIFT(ts_sym_value, 16) SHIFT(ts_sym_left_brace, 13) SHIFT(ts_sym_left_bracket, 19) @@ -334,11 +416,14 @@ PARSE_TABLE() { END_STATE(); STATE(19); - SET_LEX_STATE(21); + SET_LEX_STATE(34); SHIFT(ts_sym_array, 20) + SHIFT(ts_sym_false, 20) + SHIFT(ts_sym_null, 20) SHIFT(ts_sym_number, 20) SHIFT(ts_sym_object, 20) SHIFT(ts_sym_string, 20) + SHIFT(ts_sym_true, 20) SHIFT(ts_sym_value, 21) SHIFT(ts_sym_left_brace, 25) SHIFT(ts_sym_left_bracket, 35) @@ -362,9 +447,12 @@ PARSE_TABLE() { STATE(22); SET_LEX_STATE(8); SHIFT(ts_sym_array, 20) + SHIFT(ts_sym_false, 20) + SHIFT(ts_sym_null, 20) SHIFT(ts_sym_number, 20) SHIFT(ts_sym_object, 20) SHIFT(ts_sym_string, 20) + SHIFT(ts_sym_true, 20) SHIFT(ts_sym_value, 23) SHIFT(ts_sym_left_brace, 25) SHIFT(ts_sym_left_bracket, 35) @@ -384,23 +472,26 @@ PARSE_TABLE() { END_STATE(); STATE(25); - SET_LEX_STATE(20); + SET_LEX_STATE(33); SHIFT(ts_sym_string, 26) SHIFT(ts_sym_right_brace, 31) SHIFT(ts_builtin_sym_error, 32) END_STATE(); STATE(26); - SET_LEX_STATE(18); + SET_LEX_STATE(31); SHIFT(ts_sym_colon, 27) END_STATE(); STATE(27); SET_LEX_STATE(8); SHIFT(ts_sym_array, 6) + SHIFT(ts_sym_false, 6) + SHIFT(ts_sym_null, 6) SHIFT(ts_sym_number, 6) SHIFT(ts_sym_object, 6) SHIFT(ts_sym_string, 6) + SHIFT(ts_sym_true, 6) SHIFT(ts_sym_value, 28) SHIFT(ts_sym_left_brace, 13) SHIFT(ts_sym_left_bracket, 19) @@ -449,11 +540,14 @@ PARSE_TABLE() { END_STATE(); STATE(35); - SET_LEX_STATE(21); + SET_LEX_STATE(34); SHIFT(ts_sym_array, 20) + SHIFT(ts_sym_false, 20) + SHIFT(ts_sym_null, 20) SHIFT(ts_sym_number, 20) SHIFT(ts_sym_object, 20) SHIFT(ts_sym_string, 20) + SHIFT(ts_sym_true, 20) SHIFT(ts_sym_value, 36) SHIFT(ts_sym_left_brace, 25) SHIFT(ts_sym_left_bracket, 35) @@ -571,11 +665,14 @@ PARSE_TABLE() { END_STATE(); STATE(55); - SET_LEX_STATE(21); + SET_LEX_STATE(34); SHIFT(ts_sym_array, 20) + SHIFT(ts_sym_false, 20) + SHIFT(ts_sym_null, 20) SHIFT(ts_sym_number, 20) SHIFT(ts_sym_object, 20) SHIFT(ts_sym_string, 20) + SHIFT(ts_sym_true, 20) SHIFT(ts_sym_value, 56) SHIFT(ts_sym_left_brace, 25) SHIFT(ts_sym_left_bracket, 35) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index ce13ca8f..1624bdd0 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -351,7 +351,11 @@ static int ts_lr_parser_handle_error(ts_lr_parser *parser) { for (;;) { ts_tree_release(parser->lookahead); + size_t position = ts_lexer_position(&parser->lexer); parser->lookahead = ts_lex(&parser->lexer, ts_lex_state_error); + if (ts_lexer_position(&parser->lexer) == position) + ts_lexer_advance(&parser->lexer); + if (parser->lookahead->symbol == ts_builtin_sym_end) { parser->stack.entries[0].node = error; return 0; diff --git a/spec/runtime/languages/json/errors.txt b/spec/runtime/languages/json/errors.txt index 9a6b5362..50428707 100644 --- a/spec/runtime/languages/json/errors.txt +++ b/spec/runtime/languages/json/errors.txt @@ -1,13 +1,20 @@ -============================== +========================================== recovers from top-level errors -============================== +========================================== [} --- (ERROR) -================================== +========================================== +recovers from unexpected tokens +========================================== +barf +--- +(ERROR) + +========================================== recovers from errors inside arrays -================================== +========================================== [1,,2] --- (value (array @@ -15,9 +22,9 @@ recovers from errors inside arrays (ERROR) (value (number)))) -================================== +========================================== recovers from errors inside objects -================================== +========================================== { "key1": 1, 5 } --- (value (object (string) (value (number)) (ERROR))) diff --git a/spec/runtime/languages/json/main.txt b/spec/runtime/languages/json/main.txt index d7e16313..e3b80c42 100644 --- a/spec/runtime/languages/json/main.txt +++ b/spec/runtime/languages/json/main.txt @@ -16,14 +16,18 @@ parses empty arrays parses arrays =================== [ - 1, 2, 3, + 333, + null, + true, + false, { "stuff": "good" } ] --- (value (array (value (number)) - (value (number)) - (value (number)) + (value (null)) + (value (true)) + (value (false)) (value (object (string) (value (string)) ))