Merge pull request #7 from maxbrunsfeld/more-string-escaping

Escape backslashes and quotes in symbol name strings
2015-11-09 09:36:53 -08:00 · 2015-11-09 09:36:53 -08:00 · d6f87fbb6b
commit d6f87fbb6b
parent 84f939ab4d e11515fb74
4 changed files with 191 additions and 0 deletions
--- a/spec/compiler/compile_examples.cc
+++ b/spec/compiler/compile_examples.cc
@@ -16,6 +16,7 @@ extern const Grammar json;
 extern const Grammar golang;
 extern const Grammar c;
 extern const Grammar cpp;
+extern const Grammar anonymous_tokens;

 }  // namespace tree_sitter_examples

@@ -39,12 +40,17 @@ describe("compiling the example grammars", []() {
    });
  };

+  // example languages
  compile_grammar(tree_sitter_examples::arithmetic, "arithmetic");
  compile_grammar(tree_sitter_examples::json, "json");
  compile_grammar(tree_sitter_examples::javascript, "javascript");
  compile_grammar(tree_sitter_examples::golang, "golang");
  compile_grammar(tree_sitter_examples::c, "c");
  compile_grammar(tree_sitter_examples::cpp, "cpp");
+
+  // edge cases
+  compile_grammar(tree_sitter_examples::anonymous_tokens, "anonymous_tokens");
+
 });

 END_TEST
--- a/spec/fixtures/grammars/anonymous_tokens.cc
+++ b/spec/fixtures/grammars/anonymous_tokens.cc
@@ -0,0 +1,16 @@
+#include "tree_sitter/compiler.h"
+#include "helpers.h"
+
+namespace tree_sitter_examples {
+
+extern const Grammar anonymous_tokens = Grammar({  // edge-case fixture: every token is given inline as a string or pattern, none has a rule name of its own
+  { "program", choice({  // the single rule: exactly one of the four tokens below
+    str("\n"),  // literal line feed
+    str("\r"),  // literal carriage return
+    pattern("\\d"),  // the regex \d (backslash doubled for the C++ literal)
+    str("\"hello\"") }) },  // the seven characters "hello", double quotes included
+}).ubiquitous_tokens({  // presumably tokens that may appear anywhere between others - confirm against tree_sitter/compiler.h
+  pattern("\\s"),  // the regex \s
+});
+
+}  // namespace tree_sitter_examples
--- a/spec/fixtures/parsers/anonymous_tokens.c
+++ b/spec/fixtures/parsers/anonymous_tokens.c
@@ -0,0 +1,167 @@
+#include "tree_sitter/parser.h"
+
+#define STATE_COUNT 3
+#define SYMBOL_COUNT 7
+
+enum {  /* symbol ids for this grammar; the identifiers spell out the token text (LF, CR, BSLASH, DQUOTE, ...) */
+    sym_program = ts_builtin_sym_start,  /* numbering begins at ts_builtin_sym_start, which is declared outside this file */
+    anon_sym_LF,  /* named "\n" in ts_symbol_names */
+    anon_sym_CR,  /* named "\r" in ts_symbol_names */
+    aux_sym_SLASH_BSLASHd_SLASH,  /* the pattern token, named "/\\d/" in ts_symbol_names */
+    anon_sym_DQUOTEhello_DQUOTE,  /* the quoted hello string token */
+};
+
+static const char *ts_symbol_names[] = {  /* display string for each symbol id, written as C string literals */
+    [sym_program] = "program",
+    [ts_builtin_sym_error] = "ERROR",
+    [ts_builtin_sym_end] = "END",
+    [anon_sym_LF] = "\n",  /* line feed emitted as an escape sequence, not a raw newline */
+    [anon_sym_CR] = "\r",  /* carriage return emitted as an escape sequence */
+    [aux_sym_SLASH_BSLASHd_SLASH] = "/\\d/",  /* the pattern shown between slashes; its backslash is doubled so the literal stays valid */
+    [anon_sym_DQUOTEhello_DQUOTE] = "\"hello\"",  /* embedded double quotes are backslash-escaped */
+};
+
+static const TSNodeType ts_node_types[SYMBOL_COUNT] = {  /* node type assigned to each of the SYMBOL_COUNT symbols */
+    [sym_program] = TSNodeTypeNamed,
+    [ts_builtin_sym_error] = TSNodeTypeNamed,
+    [ts_builtin_sym_end] = TSNodeTypeHidden,
+    [anon_sym_LF] = TSNodeTypeAnonymous,  /* string tokens are Anonymous */
+    [anon_sym_CR] = TSNodeTypeAnonymous,
+    [aux_sym_SLASH_BSLASHd_SLASH] = TSNodeTypeHidden,  /* the pattern token is the only grammar token marked Hidden */
+    [anon_sym_DQUOTEhello_DQUOTE] = TSNodeTypeAnonymous,
+};
+
+static TSTree *ts_lex(TSLexer *lexer, TSStateId lex_state) {  /* generated lexer: one switch case per lex state; the upper-case macros are not defined in this file (presumably tree_sitter/parser.h) */
+    START_LEXER();
+    switch (lex_state) {
+        case 1:  /* lex state assigned to parse state 0 by ts_lex_states */
+            START_TOKEN();
+            if ((lookahead == '\t') ||
+                (lookahead == ' '))
+                ADVANCE(1);  /* tab/space loop back to state 1; '\n' and '\r' are tokens here, so they are handled separately */
+            if (lookahead == '\n')
+                ADVANCE(2);
+            if (lookahead == '\r')
+                ADVANCE(3);
+            if (lookahead == '\"')
+                ADVANCE(4);  /* opening quote of the hello string token */
+            if ('0' <= lookahead && lookahead <= '9')
+                ADVANCE(11);  /* a single decimal digit */
+            LEX_ERROR();
+        case 2:
+            START_TOKEN();
+            ACCEPT_TOKEN(anon_sym_LF);
+        case 3:
+            START_TOKEN();
+            ACCEPT_TOKEN(anon_sym_CR);
+        case 4:  /* states 4 through 9 match h, e, l, l, o and the closing quote one character at a time */
+            if (lookahead == 'h')
+                ADVANCE(5);
+            LEX_ERROR();
+        case 5:
+            if (lookahead == 'e')
+                ADVANCE(6);
+            LEX_ERROR();
+        case 6:
+            if (lookahead == 'l')
+                ADVANCE(7);
+            LEX_ERROR();
+        case 7:
+            if (lookahead == 'l')
+                ADVANCE(8);
+            LEX_ERROR();
+        case 8:
+            if (lookahead == 'o')
+                ADVANCE(9);
+            LEX_ERROR();
+        case 9:
+            if (lookahead == '\"')
+                ADVANCE(10);
+            LEX_ERROR();
+        case 10:  /* the whole quoted string has been matched */
+            ACCEPT_TOKEN(anon_sym_DQUOTEhello_DQUOTE);
+        case 11:  /* reached after one digit */
+            ACCEPT_TOKEN(aux_sym_SLASH_BSLASHd_SLASH);
+        case 12:  /* lex state assigned to parse states 1 and 2: only whitespace and lookahead 0 are handled */
+            START_TOKEN();
+            if (lookahead == 0)  /* presumably 0 marks end of input - confirm against the lexer macros */
+                ADVANCE(13);
+            if ((lookahead == '\t') ||
+                (lookahead == '\n') ||
+                (lookahead == '\r') ||
+                (lookahead == ' '))
+                ADVANCE(12);  /* here '\n' and '\r' loop back to state 12 like tab and space */
+            LEX_ERROR();
+        case 13:
+            ACCEPT_TOKEN(ts_builtin_sym_end);
+        case 14:  /* not listed in ts_lex_states; entered only via ADVANCE(14) from itself and from the error state */
+            START_TOKEN();
+            if (lookahead == 0)
+                ADVANCE(13);
+            if ((lookahead == '\t') ||
+                (lookahead == ' '))
+                ADVANCE(14);
+            if (lookahead == '\n')
+                ADVANCE(15);
+            if (lookahead == '\r')
+                ADVANCE(16);
+            if (lookahead == '\"')
+                ADVANCE(4);
+            if ('0' <= lookahead && lookahead <= '9')
+                ADVANCE(11);
+            LEX_ERROR();
+        case 15:
+            START_TOKEN();
+            ACCEPT_TOKEN(anon_sym_LF);
+        case 16:
+            START_TOKEN();
+            ACCEPT_TOKEN(anon_sym_CR);
+        case ts_lex_state_error:  /* same transitions as state 14: accepts the end marker or the start of any token */
+            START_TOKEN();
+            if (lookahead == 0)
+                ADVANCE(13);
+            if ((lookahead == '\t') ||
+                (lookahead == ' '))
+                ADVANCE(14);
+            if (lookahead == '\n')
+                ADVANCE(15);
+            if (lookahead == '\r')
+                ADVANCE(16);
+            if (lookahead == '\"')
+                ADVANCE(4);
+            if ('0' <= lookahead && lookahead <= '9')
+                ADVANCE(11);
+            LEX_ERROR();
+        default:
+            LEX_ERROR();
+    }
+}
+
+static TSStateId ts_lex_states[STATE_COUNT] = {  /* lex state (a case label in ts_lex) to use for each parse state */
+    [0] = 1,  /* parse state 0 starts in lex state 1, which recognises all four tokens */
+    [1] = 12,  /* lex state 12 handles only whitespace and lookahead 0 */
+    [2] = 12,
+};
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+
+static const TSParseAction *ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {  /* parse table indexed by [parse state][symbol]; entries not listed are left uninitialized-to-zero by the designated initializers */
+    [0] = {
+        [sym_program] = ACTIONS(SHIFT(1)),  /* after a complete program, go to state 1 */
+        [anon_sym_LF] = ACTIONS(SHIFT(2)),  /* all four tokens share state 2 */
+        [anon_sym_CR] = ACTIONS(SHIFT(2)),
+        [aux_sym_SLASH_BSLASHd_SLASH] = ACTIONS(SHIFT(2)),
+        [anon_sym_DQUOTEhello_DQUOTE] = ACTIONS(SHIFT(2)),
+    },
+    [1] = {
+        [ts_builtin_sym_end] = ACTIONS(ACCEPT_INPUT()),  /* only the end symbol has an action in state 1 */
+    },
+    [2] = {
+        [ts_builtin_sym_end] = ACTIONS(REDUCE(sym_program, 1)),  /* presumably the 1 is the number of children reduced into sym_program - confirm against parser.h */
+    },
+};
+
+#pragma GCC diagnostic pop
+
+EXPORT_LANGUAGE(ts_language_anonymous_tokens);
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -411,8 +411,10 @@ class CCodeGenerator {
  }

   string sanitize_name_for_string(string name) {
+    util::str_replace(&name, "\\", "\\\\");  // backslashes go first: the replacements below insert backslashes of their own, which must not be doubled afterwards
     util::str_replace(&name, "\n", "\\n");
     util::str_replace(&name, "\r", "\\r");
+    util::str_replace(&name, "\"", "\\\"");  // escape double quotes so the name stays valid inside a double-quoted C string literal (presumably the generated symbol-name strings)
     return name;
   }