Start work on external tokens

This commit is contained in:
Max Brunsfeld 2016-11-30 09:34:47 -08:00
parent 46854cc274
commit c966af0412
47 changed files with 723 additions and 417 deletions

View file

@ -48,6 +48,11 @@ typedef struct {
bool fragile : 1;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_tokens;
} TSLexMode;
typedef union {
TSParseAction action;
struct {
@ -64,8 +69,15 @@ typedef struct TSLanguage {
const TSSymbolMetadata *symbol_metadata;
const unsigned short *parse_table;
const TSParseActionEntry *parse_actions;
const TSStateId *lex_states;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
const TSSymbol *external_token_symbol_map;
const bool *external_token_lists;
struct {
void * (*create)();
bool (*scan)(TSLexer *, const bool *symbol_whitelist);
void (*destroy)(void *);
} external_scanner;
} TSLanguage;
/*
@ -146,21 +158,22 @@ typedef struct TSLanguage {
{ .type = TSParseActionTypeAccept } \
}
#define EXPORT_LANGUAGE(language_name) \
static TSLanguage language = { \
.symbol_count = SYMBOL_COUNT, \
.token_count = TOKEN_COUNT, \
.symbol_metadata = ts_symbol_metadata, \
.parse_table = (const unsigned short *)ts_parse_table, \
.parse_actions = ts_parse_actions, \
.lex_states = ts_lex_states, \
.symbol_names = ts_symbol_names, \
.lex_fn = ts_lex, \
}; \
\
const TSLanguage *language_name() { \
return &language; \
}
#define GET_LANGUAGE(...) \
static TSLanguage language = { \
.symbol_count = SYMBOL_COUNT, \
.token_count = TOKEN_COUNT, \
.symbol_metadata = ts_symbol_metadata, \
.parse_table = (const unsigned short *)ts_parse_table, \
.parse_actions = ts_parse_actions, \
.lex_modes = ts_lex_modes, \
.symbol_names = ts_symbol_names, \
.lex_fn = ts_lex, \
.external_token_lists = (const bool *)ts_external_token_lists, \
.external_token_symbol_map = ts_external_token_symbol_map, \
.external_scanner = {__VA_ARGS__} \
}; \
return &language \
#ifdef __cplusplus
}

View file

@ -47,6 +47,7 @@
'src/compiler/rules/character_range.cc',
'src/compiler/rules/character_set.cc',
'src/compiler/rules/choice.cc',
'src/compiler/rules/external_token.cc',
'src/compiler/rules/metadata.cc',
'src/compiler/rules/named_symbol.cc',
'src/compiler/rules/pattern.cc',

View file

@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
})),
};
AssertThat(recovery_tokens(grammar), Equals<set<Symbol::Index>>({ 1 }));
AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
});
});

View file

@ -14,10 +14,10 @@ START_TEST
describe("LexConflictManager::resolve(new_action, old_action)", []() {
LexConflictManager conflict_manager;
bool update;
Symbol sym1(0, true);
Symbol sym2(1, true);
Symbol sym3(2, true);
Symbol sym4(3, true);
Symbol sym1(0, Symbol::Terminal);
Symbol sym2(1, Symbol::Terminal);
Symbol sym3(2, Symbol::Terminal);
Symbol sym4(3, Symbol::Terminal);
LexItemSet item_set({ LexItem(sym4, blank() )});
it("favors advance actions over empty accept token actions", [&]() {

View file

@ -14,7 +14,7 @@ START_TEST
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
LexItem item1(Symbol(0, true), character({ 'a', 'b', 'c' }));
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item1.completion_status().is_string, IsFalse());
@ -23,7 +23,7 @@ describe("LexItem", []() {
params.precedence = 3;
params.has_precedence = true;
params.is_string = 1;
LexItem item2(Symbol(0, true), choice({
LexItem item2(Symbol(0, Symbol::Terminal), choice({
metadata(blank(), params),
character({ 'a', 'b', 'c' })
}));
@ -32,7 +32,7 @@ describe("LexItem", []() {
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
AssertThat(item2.completion_status().is_string, IsTrue());
LexItem item3(Symbol(0, true), repeat(character({ ' ', '\t' })));
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item3.completion_status().is_string, IsFalse());
@ -43,7 +43,7 @@ describe("LexItem", []() {
describe("LexItemSet::transitions()", [&]() {
it("handles single characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), character({ 'x' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
});
AssertThat(
@ -53,7 +53,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -67,7 +67,7 @@ describe("LexItemSet::transitions()", [&]() {
params.is_main_token = true;
LexItemSet item_set({
LexItem(Symbol(1), metadata(character({ 'x' }), params)),
LexItem(Symbol(1, Symbol::NonTerminal), metadata(character({ 'x' }), params)),
});
AssertThat(
@ -77,7 +77,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), metadata(blank(), params)),
LexItem(Symbol(1, Symbol::NonTerminal), metadata(blank(), params)),
}),
PrecedenceRange(),
true
@ -88,7 +88,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'w' }),
character({ 'x' }),
character({ 'y' }),
@ -103,7 +103,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('w'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
@ -118,7 +118,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences with nested precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
prec(3, seq({
character({ 'v' }),
prec(4, seq({
@ -140,7 +140,7 @@ describe("LexItemSet::transitions()", [&]() {
// The outer precedence is now 'active', because we are within its
// contained rule.
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
prec(4, seq({
character({ 'w' }),
@ -168,7 +168,7 @@ describe("LexItemSet::transitions()", [&]() {
Transition{
// The inner precedence is now 'active'
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, seq({
active_prec(4, character({ 'x' })),
character({ 'y' }) })),
@ -193,7 +193,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
active_prec(3, character({ 'y' })),
character({ 'z' }),
})),
@ -216,7 +216,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'z' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(3),
false
@ -227,7 +227,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles sequences where the left hand side can be blank", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ 'x' }),
blank(),
@ -244,7 +244,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('x'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'y' }),
character({ 'z' }),
})),
@ -257,7 +257,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('y'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'z' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'z' })),
}),
PrecedenceRange(),
false
@ -268,7 +268,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles blanks", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
});
AssertThat(item_set.transitions(), IsEmpty());
@ -276,11 +276,11 @@ describe("LexItemSet::transitions()", [&]() {
it("handles repeats", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), repeat1(seq({
LexItem(Symbol(1, Symbol::NonTerminal), repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))),
LexItem(Symbol(2), repeat1(character({ 'c' }))),
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
});
AssertThat(
@ -290,14 +290,14 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
character({ 'b' }),
repeat1(seq({
character({ 'a' }),
character({ 'b' }),
}))
})),
LexItem(Symbol(1), character({ 'b' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'b' })),
}),
PrecedenceRange(),
false
@ -307,8 +307,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('c'),
Transition{
LexItemSet({
LexItem(Symbol(2), repeat1(character({ 'c' }))),
LexItem(Symbol(2), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), repeat1(character({ 'c' }))),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -319,7 +319,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles repeats with precedence", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' }))))
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' }))))
});
AssertThat(
@ -329,8 +329,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(-1, repeat1(character({ 'a' })))),
LexItem(Symbol(1), active_prec(-1, blank())),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, repeat1(character({ 'a' })))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(-1, blank())),
}),
PrecedenceRange(-1),
false
@ -341,7 +341,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between overlapping character sets", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), choice({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
active_prec(2, seq({
character({ 'a', 'b', 'c', 'd' }),
character({ 'x' }),
@ -360,7 +360,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a', 'b'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
}),
PrecedenceRange(2),
false
@ -370,8 +370,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('c', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(2, character({ 'x' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(2, 3),
false
@ -381,7 +381,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), active_prec(3, character({ 'y' }))),
LexItem(Symbol(1, Symbol::NonTerminal), active_prec(3, character({ 'y' }))),
}),
PrecedenceRange(3),
false
@ -392,7 +392,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between a subset and a superset of characters", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), choice({
LexItem(Symbol(1, Symbol::NonTerminal), choice({
seq({
character({ 'b', 'c', 'd' }),
character({ 'x' }),
@ -411,7 +411,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a').include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'y' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
@ -421,8 +421,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('b', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ 'x' })),
LexItem(Symbol(1), character({ 'y' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'x' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'y' })),
}),
PrecedenceRange(),
false
@ -433,7 +433,7 @@ describe("LexItemSet::transitions()", [&]() {
it("handles choices between whitelisted and blacklisted character sets", [&]() {
LexItemSet item_set({
LexItem(Symbol(1), seq({
LexItem(Symbol(1, Symbol::NonTerminal), seq({
choice({
character({ '/' }, false),
seq({
@ -452,7 +452,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include_all().exclude('/').exclude('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ '/' })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
}),
PrecedenceRange(),
false
@ -462,8 +462,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('\\'),
Transition{
LexItemSet({
LexItem(Symbol(1), character({ '/' })),
LexItem(Symbol(1), seq({ character({ '/' }), character({ '/' }) })),
LexItem(Symbol(1, Symbol::NonTerminal), character({ '/' })),
LexItem(Symbol(1, Symbol::NonTerminal), seq({ character({ '/' }), character({ '/' }) })),
}),
PrecedenceRange(),
false
@ -474,8 +474,8 @@ describe("LexItemSet::transitions()", [&]() {
it("handles different items with overlapping character sets", [&]() {
LexItemSet set1({
LexItem(Symbol(1), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
LexItem(Symbol(2), character({ 'e', 'f', 'g', 'h', 'i' }))
LexItem(Symbol(1, Symbol::NonTerminal), character({ 'a', 'b', 'c', 'd', 'e', 'f' })),
LexItem(Symbol(2, Symbol::NonTerminal), character({ 'e', 'f', 'g', 'h', 'i' }))
});
AssertThat(set1.transitions(), Equals(LexItemSet::TransitionMap({
@ -483,7 +483,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('a', 'd'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -493,8 +493,8 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('e', 'f'),
Transition{
LexItemSet({
LexItem(Symbol(1), blank()),
LexItem(Symbol(2), blank()),
LexItem(Symbol(1, Symbol::NonTerminal), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false
@ -504,7 +504,7 @@ describe("LexItemSet::transitions()", [&]() {
CharacterSet().include('g', 'i'),
Transition{
LexItemSet({
LexItem(Symbol(2), blank()),
LexItem(Symbol(2, Symbol::NonTerminal), blank()),
}),
PrecedenceRange(),
false

View file

@ -27,23 +27,23 @@ describe("ParseItemSetBuilder", []() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(11, true), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
Production({
{Symbol(12, true), 0, AssociativityNone},
{Symbol(13, true), 0, AssociativityNone},
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({
{Symbol(2), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
})
}),
SyntaxVariable("rule2", VariableTypeNamed, {
Production({
{Symbol(14, true), 0, AssociativityNone},
{Symbol(15, true), 0, AssociativityNone},
{Symbol(14, Symbol::Terminal), 0, AssociativityNone},
{Symbol(15, Symbol::Terminal), 0, AssociativityNone},
})
}),
}, {}, {}};
@ -54,8 +54,8 @@ describe("ParseItemSetBuilder", []() {
ParseItemSet item_set({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 }),
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
@ -64,20 +64,20 @@ describe("ParseItemSetBuilder", []() {
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 })
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 1), 0),
LookaheadSet({ 11 })
},
{
ParseItem(Symbol(2), production(2, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(2, Symbol::NonTerminal), production(2, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});
@ -86,14 +86,14 @@ describe("ParseItemSetBuilder", []() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(11, true), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
Production({
{Symbol(12, true), 0, AssociativityNone},
{Symbol(13, true), 0, AssociativityNone},
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({})
}),
@ -105,8 +105,8 @@ describe("ParseItemSetBuilder", []() {
ParseItemSet item_set({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 }),
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) }),
}
});
@ -115,16 +115,16 @@ describe("ParseItemSetBuilder", []() {
AssertThat(item_set, Equals(ParseItemSet({
{
ParseItem(Symbol(0), production(0, 0), 0),
LookaheadSet({ 10 })
ParseItem(Symbol(0, Symbol::NonTerminal), production(0, 0), 0),
LookaheadSet({ Symbol(10, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 0), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 0), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
{
ParseItem(Symbol(1), production(1, 1), 0),
LookaheadSet({ 11 })
ParseItem(Symbol(1, Symbol::NonTerminal), production(1, 1), 0),
LookaheadSet({ Symbol(11, Symbol::Terminal) })
},
})));
});

View file

@ -133,13 +133,13 @@ describe("extract_tokens", []() {
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
}, { str(" ") }, { { Symbol(1), Symbol(2) } }});
}, { str(" ") }, { { Symbol(1, Symbol::NonTerminal), Symbol(2, Symbol::NonTerminal) } }});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
AssertThat(syntax_grammar.variables.size(), Equals<size_t>(2));
AssertThat(syntax_grammar.expected_conflicts, Equals(set<set<Symbol>>({
{ Symbol(0), Symbol(1) },
{ Symbol(0, Symbol::NonTerminal), Symbol(1, Symbol::NonTerminal) },
})));
});
@ -171,7 +171,7 @@ describe("extract_tokens", []() {
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, true) })));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, Symbol::Terminal) })));
});
it("updates extra symbols according to the new symbol numbers", [&]() {
@ -186,7 +186,7 @@ describe("extract_tokens", []() {
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
{ Symbol(3, true) },
{ Symbol(3, Symbol::Terminal) },
})));
AssertThat(get<1>(result).separators, IsEmpty());

View file

@ -36,19 +36,19 @@ describe("flatten_grammar", []() {
AssertThat(result.type, Equals(VariableTypeNamed));
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(2), 101, AssociativityLeft},
{Symbol(3), 102, AssociativityRight},
{Symbol(4), 101, AssociativityLeft},
{Symbol(6), 0, AssociativityNone},
{Symbol(7), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(3, Symbol::NonTerminal), 102, AssociativityRight},
{Symbol(4, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
}),
Production({
{Symbol(1), 0, AssociativityNone},
{Symbol(2), 101, AssociativityLeft},
{Symbol(5), 101, AssociativityLeft},
{Symbol(6), 0, AssociativityNone},
{Symbol(7), 0, AssociativityNone},
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(5, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(6, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(7, Symbol::NonTerminal), 0, AssociativityNone},
})
})))
});
@ -65,8 +65,8 @@ describe("flatten_grammar", []() {
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 101, AssociativityLeft},
{Symbol(2), 101, AssociativityLeft},
{Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
{Symbol(2, Symbol::NonTerminal), 101, AssociativityLeft},
})
})))
@ -80,7 +80,7 @@ describe("flatten_grammar", []() {
AssertThat(result.productions, Equals(vector<Production>({
Production({
{Symbol(1), 101, AssociativityLeft},
{Symbol(1, Symbol::NonTerminal), 101, AssociativityLeft},
})
})))
});

View file

@ -9,7 +9,7 @@ START_TEST
describe("Repeat", []() {
describe("constructing repeats", [&]() {
it("doesn't create redundant repeats", [&]() {
auto sym = make_shared<Symbol>(1);
auto sym = make_shared<Symbol>(1, Symbol::NonTerminal);
auto repeat = Repeat::build(sym);
auto outer_repeat = Repeat::build(repeat);

View file

@ -0,0 +1,13 @@
#include <stdbool.h>
void *ts_language_external_scanner_example_external_scanner_create() {
puts("HELLO FROM EXTERNAL SCANNER");
return 0;
}
bool ts_language_external_scanner_example_external_scanner_scan() {
return true;
}
void ts_language_external_scanner_example_external_scanner_destroy() {
}

View file

@ -67,7 +67,8 @@ static int get_modified_time(const string &path) {
const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name) {
const string &language_name,
string external_scanner_path = "") {
string language_function_name = "ts_language_" + language_name;
string header_dir = getenv("PWD") + string("/include");
int source_mtime = get_modified_time(source_filename);
@ -119,7 +120,9 @@ const TSLanguage *load_language(const string &source_filename,
return language_fn();
}
const TSLanguage *load_compile_result(const string &name, const TSCompileResult &compile_result) {
const TSLanguage *load_compile_result(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
if (compile_result.error_type != TSCompileErrorTypeNone) {
Assert::Failure(string("Compilation failed ") + compile_result.error_message);
return nullptr;
@ -135,7 +138,7 @@ const TSLanguage *load_compile_result(const string &name, const TSCompileResult
source_file << compile_result.code;
source_file.close();
const TSLanguage *language = load_language(source_filename, lib_filename, name);
auto language = load_language(source_filename, lib_filename, name, external_scanner_path);
free(compile_result.code);
return language;
}

View file

@ -5,7 +5,8 @@
#include "tree_sitter/runtime.h"
#include <string>
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &);
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
std::string external_scanner_path = "");
const TSLanguage *get_test_language(const std::string &language_name);
#endif // HELPERS_LOAD_LANGUAGE_H_

View file

@ -9,6 +9,7 @@ namespace tree_sitter {
using std::ostream;
using std::string;
using std::to_string;
using rules::Symbol;
rule_ptr character(const set<uint32_t> &ranges) {
return character(ranges, true);
@ -28,11 +29,11 @@ namespace tree_sitter {
}
rule_ptr i_sym(size_t index) {
return make_shared<rules::Symbol>(index);
return make_shared<Symbol>(index, Symbol::NonTerminal);
}
rule_ptr i_token(size_t index) {
return make_shared<rules::Symbol>(index, true);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr metadata(rule_ptr rule, rules::MetadataParams params) {

View file

@ -10,16 +10,7 @@ namespace tree_sitter {
ostream &operator<<(ostream &stream, const Grammar &grammar) {
stream << string("#<grammar");
stream << string(" rules: {");
bool started = false;
for (auto pair : grammar.rules) {
if (started)
stream << string(", ");
stream << pair.first;
stream << string(" => ");
stream << pair.second;
started = true;
}
stream << " rules: " << grammar.rules;
return stream << string("}>");
}

View file

@ -507,6 +507,71 @@ describe("compile_grammar", []() {
});
});
describe("external scanners", [&]() {
  it("can call out to arbitrary scanner functions during parsing", [&]() {
    // A grammar whose `string` rule is recognized either entirely by the
    // external scanner (percent_string) or by external start/end delimiters
    // around an internal `identifier` token.
    // NOTE: the last CHOICE member previously had a trailing comma, which is
    // invalid strict JSON and would make ts_compile_grammar reject the
    // grammar — contradicting the IsNull() assertion below.
    string grammar = R"JSON({
      "name": "external_scanner_example",

      "externals": [
        "percent_string",
        "percent_string_start",
        "percent_string_end"
      ],

      "rules": {
        "string": {
          "type": "CHOICE",
          "members": [
            {
              "type": "EXTERNAL_TOKEN",
              "name": "percent_string"
            },
            {
              "type": "SEQ",
              "members": [
                {
                  "type": "EXTERNAL_TOKEN",
                  "name": "percent_string_start"
                },
                {
                  "type": "SYMBOL",
                  "name": "identifier"
                },
                {
                  "type": "EXTERNAL_TOKEN",
                  "name": "percent_string_end"
                }
              ]
            }
          ]
        },

        "identifier": {
          "type": "PATTERN",
          "value": "\\a+"
        }
      }
    })JSON";

    TSCompileResult result = ts_compile_grammar(grammar.c_str());
    // Compilation must succeed before we can load the language.
    AssertThat(result.error_message, IsNull());

    // Link the generated parser together with the example C scanner fixture.
    ts_document_set_language(document, load_compile_result(
      "external_scanner_example",
      result,
      "spec/fixtures/external_scanners/external_scan.c"
    ));

    // Entire string recognized externally.
    ts_document_set_input_string(document, "%|hi|");
    ts_document_parse(document);
    assert_root_node("(string)");

    // External delimiters around an internally-lexed identifier.
    ts_document_set_input_string(document, "%(1 #{two} three)");
    ts_document_parse(document);
    assert_root_node("(string (identifier))");
  });
});
describe("when the grammar's start symbol is a token", [&]() {
it("parses the token", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(

View file

@ -80,10 +80,10 @@ START_TEST
describe("The Corpus", []() {
vector<string> test_languages({
"javascript",
// "javascript",
"json",
"c",
"cpp",
// "c",
// "cpp",
});
for (auto &language_name : test_languages) {

View file

@ -64,7 +64,7 @@ class LexTableBuilder {
private:
void add_lex_state_for_parse_state(ParseState *parse_state) {
parse_state->lex_state_id =
add_lex_state(item_set_for_tokens(parse_state->expected_inputs()));
add_lex_state(item_set_for_terminals(parse_state->terminal_entries));
}
LexStateId add_lex_state(const LexItemSet &item_set) {
@ -112,24 +112,27 @@ class LexTableBuilder {
void mark_fragile_tokens() {
for (ParseState &state : parse_table->states) {
for (auto &entry : state.terminal_entries) {
auto homonyms = conflict_manager.possible_homonyms.find(entry.first);
if (homonyms != conflict_manager.possible_homonyms.end())
for (Symbol::Index homonym : homonyms->second)
if (state.terminal_entries.count(homonym)) {
entry.second.reusable = false;
break;
}
Symbol symbol = entry.first;
if (symbol.is_token()) {
auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
if (homonyms != conflict_manager.possible_homonyms.end())
for (Symbol::Index homonym : homonyms->second)
if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
entry.second.reusable = false;
break;
}
if (!entry.second.reusable)
continue;
if (!entry.second.reusable)
continue;
auto extensions = conflict_manager.possible_extensions.find(entry.first);
if (extensions != conflict_manager.possible_extensions.end())
for (Symbol::Index extension : extensions->second)
if (state.terminal_entries.count(extension)) {
entry.second.depends_on_lookahead = true;
break;
}
auto extensions = conflict_manager.possible_extensions.find(symbol.index);
if (extensions != conflict_manager.possible_extensions.end())
for (Symbol::Index extension : extensions->second)
if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
entry.second.depends_on_lookahead = true;
break;
}
}
}
}
}
@ -150,24 +153,27 @@ class LexTableBuilder {
}
}
LexItemSet item_set_for_tokens(const set<Symbol> &symbols) {
LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
LexItemSet result;
for (const Symbol &symbol : symbols)
for (const rule_ptr &rule : rules_for_symbol(symbol))
for (const rule_ptr &separator_rule : separator_rules)
result.entries.insert(LexItem(
symbol,
Metadata::separator(
Seq::build({
separator_rule,
Metadata::main_token(rule) }))));
for (const auto &pair : terminals) {
Symbol symbol = pair.first;
if (symbol.is_token()) {
for (const rule_ptr &rule : rules_for_symbol(symbol)) {
for (const rule_ptr &separator_rule : separator_rules) {
result.entries.insert(LexItem(
symbol,
Metadata::separator(
Seq::build({
separator_rule,
Metadata::main_token(rule) }))));
}
}
}
}
return result;
}
vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
if (!symbol.is_token)
return {};
if (symbol == rules::END_OF_INPUT())
return { CharacterSet().include(0).copy() };

View file

@ -52,7 +52,10 @@ class ParseTableBuilder {
allow_any_conflict(false) {}
pair<ParseTable, CompileError> build() {
Symbol start_symbol = Symbol(0, grammar.variables.empty());
Symbol start_symbol = grammar.variables.empty() ?
Symbol(0, Symbol::Terminal) :
Symbol(0, Symbol::NonTerminal);
Production start_production({
ProductionStep(start_symbol, 0, rules::AssociativityNone),
});
@ -63,7 +66,7 @@ class ParseTableBuilder {
add_parse_state(ParseItemSet({
{
ParseItem(rules::START(), start_production, 0),
LookaheadSet({ END_OF_INPUT().index }),
LookaheadSet({ END_OF_INPUT() }),
},
}));
@ -107,21 +110,21 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (const Symbol::Index index : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, Symbol(index, true));
for (const Symbol symbol : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, symbol);
}
for (const Symbol &symbol : grammar.extra_tokens) {
if (!error_state.terminal_entries.count(symbol.index)) {
error_state.terminal_entries[symbol.index].actions.push_back(ParseAction::ShiftExtra());
if (!error_state.terminal_entries.count(symbol)) {
error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra());
}
}
for (size_t i = 0; i < grammar.variables.size(); i++) {
add_out_of_context_parse_state(&error_state, Symbol(i, false));
add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::NonTerminal));
}
error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(ParseAction::Recover(0));
error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0));
parse_table.states[0] = error_state;
}
@ -130,10 +133,10 @@ class ParseTableBuilder {
const ParseItemSet &item_set = recovery_states[symbol];
if (!item_set.entries.empty()) {
ParseStateId state = add_parse_state(item_set);
if (symbol.is_token) {
error_state->terminal_entries[symbol.index].actions.assign({ ParseAction::Recover(state) });
} else {
if (symbol.is_non_terminal()) {
error_state->nonterminal_entries[symbol.index] = state;
} else {
error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) });
}
}
}
@ -152,9 +155,9 @@ class ParseTableBuilder {
}
string add_actions(const ParseItemSet &item_set, ParseStateId state_id) {
map<Symbol::Index, ParseItemSet> terminal_successors;
map<Symbol, ParseItemSet> terminal_successors;
map<Symbol::Index, ParseItemSet> nonterminal_successors;
set<Symbol::Index> lookaheads_with_conflicts;
set<Symbol> lookaheads_with_conflicts;
for (const auto &pair : item_set.entries) {
const ParseItem &item = pair.first;
@ -168,7 +171,7 @@ class ParseTableBuilder {
ParseAction::Reduce(item.lhs(), item.step_index, *item.production);
int precedence = item.precedence();
for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
for (Symbol lookahead : *lookahead_symbols.entries) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
// Only add the highest-precedence Reduce actions to the parse table.
@ -203,10 +206,10 @@ class ParseTableBuilder {
Symbol symbol = item.production->at(item.step_index).symbol;
ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
if (symbol.is_token) {
terminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
} else {
if (symbol.is_non_terminal()) {
nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
} else {
terminal_successors[symbol].entries[new_item] = lookahead_symbols;
}
}
}
@ -214,7 +217,7 @@ class ParseTableBuilder {
// Add a Shift action for each possible successor state. Shift actions for
// terminal lookaheads can conflict with Reduce actions added previously.
for (auto &pair : terminal_successors) {
Symbol::Index lookahead = pair.first;
Symbol lookahead = pair.first;
ParseItemSet &next_item_set = pair.second;
ParseStateId next_state_id = add_parse_state(next_item_set);
ParseState &state = parse_table.states[state_id];
@ -223,7 +226,7 @@ class ParseTableBuilder {
if (!allow_any_conflict) {
if (had_existing_action)
lookaheads_with_conflicts.insert(lookahead);
recovery_states[Symbol(lookahead, true)].add(next_item_set);
recovery_states[lookahead].add(next_item_set);
}
}
@ -234,10 +237,10 @@ class ParseTableBuilder {
ParseStateId next_state = add_parse_state(next_item_set);
parse_table.set_nonterminal_action(state_id, lookahead, next_state);
if (!allow_any_conflict)
recovery_states[Symbol(lookahead, false)].add(next_item_set);
recovery_states[Symbol(lookahead, Symbol::NonTerminal)].add(next_item_set);
}
for (Symbol::Index lookahead : lookaheads_with_conflicts) {
for (Symbol lookahead : lookaheads_with_conflicts) {
string conflict = handle_conflict(item_set, state_id, lookahead);
if (!conflict.empty()) return conflict;
}
@ -245,9 +248,9 @@ class ParseTableBuilder {
ParseAction shift_extra = ParseAction::ShiftExtra();
ParseState &state = parse_table.states[state_id];
for (const Symbol &extra_symbol : grammar.extra_tokens) {
if (!state.terminal_entries.count(extra_symbol.index) ||
if (!state.terminal_entries.count(extra_symbol) ||
state.has_shift_action() || allow_any_conflict) {
parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra);
parse_table.add_terminal_action(state_id, extra_symbol, shift_extra);
}
}
@ -257,7 +260,6 @@ class ParseTableBuilder {
void mark_fragile_actions() {
for (ParseState &state : parse_table.states) {
for (auto &entry : state.terminal_entries) {
const Symbol symbol(entry.first, true);
auto &actions = entry.second.actions;
for (ParseAction &action : actions) {
@ -359,7 +361,7 @@ class ParseTableBuilder {
}
string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
Symbol::Index lookahead) {
Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
int reduction_precedence = entry.actions.front().precedence();
set<ParseItem> shift_items;
@ -468,7 +470,7 @@ class ParseTableBuilder {
description += " " + symbol_name(earliest_starting_item.production->at(i).symbol);
}
description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
description += " \u2022 " + symbol_name(lookahead) + " \u2026";
description += "\n\n";
description += "Possible interpretations:\n\n";
@ -487,7 +489,7 @@ class ParseTableBuilder {
description += " " + symbol_name(step.symbol);
}
description += ")";
description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
description += " \u2022 " + symbol_name(lookahead) + " \u2026";
description += "\n";
}
}
@ -564,14 +566,22 @@ class ParseTableBuilder {
return "END_OF_INPUT";
else
return "";
} else if (symbol.is_token) {
const Variable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else
return "'" + variable.name + "'";
} else {
return grammar.variables[symbol.index].name;
}
switch (symbol.type) {
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else
return "'" + variable.name + "'";
}
case Symbol::NonTerminal: {
return grammar.variables[symbol.index].name;
}
case Symbol::External: {
return grammar.external_tokens[symbol.index];
}
}
}

View file

@ -12,8 +12,8 @@ using rules::Symbol;
LookaheadSet::LookaheadSet() : entries(nullptr) {}
LookaheadSet::LookaheadSet(const set<Symbol::Index> &symbols)
: entries(make_shared<set<Symbol::Index>>(symbols)) {}
LookaheadSet::LookaheadSet(const set<Symbol> &symbols)
: entries(make_shared<set<Symbol>>(symbols)) {}
bool LookaheadSet::empty() const {
return !entries.get() || entries->empty();
@ -23,7 +23,7 @@ bool LookaheadSet::operator==(const LookaheadSet &other) const {
return *entries == *other.entries;
}
bool LookaheadSet::contains(const Symbol::Index &symbol) const {
bool LookaheadSet::contains(const Symbol &symbol) const {
return entries->find(symbol) != entries->end();
}
@ -31,15 +31,15 @@ bool LookaheadSet::insert_all(const LookaheadSet &other) {
if (!other.entries.get())
return false;
if (!entries.get())
entries = make_shared<set<Symbol::Index>>();
entries = make_shared<set<Symbol>>();
size_t previous_size = entries->size();
entries->insert(other.entries->begin(), other.entries->end());
return entries->size() > previous_size;
}
bool LookaheadSet::insert(const Symbol::Index &symbol) {
bool LookaheadSet::insert(const Symbol &symbol) {
if (!entries.get())
entries = make_shared<set<Symbol::Index>>();
entries = make_shared<set<Symbol>>();
return entries->insert(symbol).second;
}

View file

@ -11,15 +11,15 @@ namespace build_tables {
class LookaheadSet {
public:
LookaheadSet();
explicit LookaheadSet(const std::set<rules::Symbol::Index> &);
explicit LookaheadSet(const std::set<rules::Symbol> &);
bool empty() const;
bool operator==(const LookaheadSet &) const;
bool contains(const rules::Symbol::Index &) const;
bool contains(const rules::Symbol &) const;
bool insert_all(const LookaheadSet &);
bool insert(const rules::Symbol::Index &);
bool insert(const rules::Symbol &);
std::shared_ptr<std::set<rules::Symbol::Index>> entries;
std::shared_ptr<std::set<rules::Symbol>> entries;
};
} // namespace build_tables

View file

@ -41,7 +41,7 @@ bool ParseItem::operator<(const ParseItem &other) const {
}
Symbol ParseItem::lhs() const {
return Symbol(variable_index);
return Symbol(variable_index, Symbol::NonTerminal);
}
bool ParseItem::is_done() const {
@ -105,38 +105,6 @@ size_t ParseItemSet::unfinished_item_signature() const {
return result;
}
ParseItemSet::ActionMap ParseItemSet::actions() const {
ParseItemSet::ActionMap result;
for (const auto &pair : entries) {
const ParseItem &item = pair.first;
const LookaheadSet &lookahead_symbols = pair.second;
if (item.step_index == item.production->size()) {
int precedence = item.precedence();
for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
Action &action = result.terminal_actions[lookahead];
if (precedence > action.completion_precedence) {
action.completions.assign({ &item });
} else if (precedence == action.completion_precedence) {
action.completions.push_back({ &item });
}
}
} else {
Symbol symbol = item.production->at(item.step_index).symbol;
ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);
if (symbol.is_token) {
result.terminal_actions[symbol.index].continuation.entries[new_item] = lookahead_symbols;
} else {
result.nonterminal_continuations[symbol.index].entries[new_item] = lookahead_symbols;
}
}
}
return result;
}
void ParseItemSet::add(const ParseItemSet &other) {
for (const auto &pair : other.entries)
entries[pair.first].insert_all(pair.second);

View file

@ -41,16 +41,6 @@ class ParseItemSet {
ParseItemSet();
explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);
struct Completion;
struct Action;
struct ActionMap {
std::map<rules::Symbol::Index, Action> terminal_actions;
std::map<rules::Symbol::Index, ParseItemSet> nonterminal_continuations;
};
ActionMap actions() const;
bool operator==(const ParseItemSet &) const;
void add(const ParseItemSet &);
size_t unfinished_item_signature() const;
@ -58,22 +48,6 @@ class ParseItemSet {
std::map<ParseItem, LookaheadSet> entries;
};
struct ParseItemSet::Completion {
const ParseItem *item;
int precedence;
rules::Associativity associativity;
bool operator<(const ParseItemSet::Completion &other) {
return precedence < other.precedence;
}
};
struct ParseItemSet::Action {
ParseItemSet continuation;
std::vector<const ParseItem *> completions;
int completion_precedence;
};
} // namespace build_tables
} // namespace tree_sitter

View file

@ -27,12 +27,12 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
set<Symbol::Index> processed_non_terminals;
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
Symbol symbol(i, true);
first_sets.insert({symbol, LookaheadSet({ static_cast<Symbol::Index>(i) })});
Symbol symbol(i, Symbol::Terminal);
first_sets.insert({symbol, LookaheadSet({ symbol })});
}
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol(i);
Symbol symbol(i, Symbol::NonTerminal);
LookaheadSet first_set;
processed_non_terminals.clear();
@ -42,10 +42,10 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
Symbol current_symbol = symbols_to_process.back();
symbols_to_process.pop_back();
if (current_symbol.is_token) {
first_set.insert(current_symbol.index);
if (!current_symbol.is_non_terminal()) {
first_set.insert(current_symbol);
} else if (processed_non_terminals.insert(current_symbol.index).second) {
for (const Production &production : grammar.productions(current_symbol)) {
for (const Production &production : grammar.variables[current_symbol.index].productions) {
if (!production.empty()) {
symbols_to_process.push_back(production[0].symbol);
}
@ -59,11 +59,11 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
vector<ParseItemSetComponent> components_to_process;
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol symbol(i);
Symbol symbol(i, Symbol::NonTerminal);
map<ParseItem, pair<LookaheadSet, bool>> cache_entry;
components_to_process.clear();
for (const Production &production : grammar.productions(symbol)) {
for (const Production &production : grammar.variables[i].productions) {
components_to_process.push_back(ParseItemSetComponent{
ParseItem(symbol, production, 0),
LookaheadSet(),
@ -87,7 +87,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
if (component_is_new) {
Symbol next_symbol = item.next_symbol();
if (next_symbol.is_built_in() || next_symbol.is_token)
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in())
continue;
LookaheadSet next_lookaheads;
@ -102,7 +102,7 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
propagates_lookaheads = false;
}
for (const Production &production : grammar.productions(next_symbol)) {
for (const Production &production : grammar.variables[next_symbol.index].productions) {
components_to_process.push_back(ParseItemSetComponent{
ParseItem(next_symbol, production, 0),
next_lookaheads,
@ -130,7 +130,7 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
const LookaheadSet &lookaheads = pair.second;
const Symbol &next_symbol = item.next_symbol();
if (!next_symbol.is_token && !next_symbol.is_built_in()) {
if (next_symbol.is_non_terminal() && !next_symbol.is_built_in()) {
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
if (next_step == item.production->size()) {

View file

@ -47,8 +47,8 @@ class FirstCharacters : public CharacterAggregator<true, false> {};
class LastCharacters : public CharacterAggregator<false, true> {};
class AllCharacters : public CharacterAggregator<true, true> {};
set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
set<Symbol::Index> result;
set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
set<Symbol> result;
AllCharacters all_separator_characters;
for (const rule_ptr &separator : grammar.separators)
@ -79,7 +79,7 @@ set<Symbol::Index> recovery_tokens(const LexicalGrammar &grammar) {
!all_characters.result.intersects(all_separator_characters.result);
if ((has_distinct_start && has_distinct_end) || has_no_separators)
result.insert(i);
result.insert(Symbol(i, Symbol::Terminal));
}
return result;

View file

@ -11,7 +11,7 @@ struct LexicalGrammar;
namespace build_tables {
std::set<rules::Symbol::Index> recovery_tokens(const LexicalGrammar &);
std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -14,6 +14,7 @@
namespace tree_sitter {
namespace generate_code {
using std::function;
using std::map;
using std::pair;
@ -22,6 +23,7 @@ using std::string;
using std::to_string;
using std::vector;
using util::escape_char;
using rules::Symbol;
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
@ -73,9 +75,8 @@ class CCodeGenerator {
const LexicalGrammar lexical_grammar;
map<string, string> sanitized_names;
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
vector<pair<size_t, set<rules::Symbol>>> in_progress_symbols;
vector<set<Symbol::Index>> external_token_id_sets;
size_t next_parse_action_list_index;
size_t next_in_progress_symbol_list_index;
public:
CCodeGenerator(string name, const ParseTable &parse_table,
@ -87,19 +88,25 @@ class CCodeGenerator {
lex_table(lex_table),
syntax_grammar(syntax_grammar),
lexical_grammar(lexical_grammar),
next_parse_action_list_index(0),
next_in_progress_symbol_list_index(0) {}
next_parse_action_list_index(0) {}
string code() {
buffer = "";
add_includes();
add_state_and_symbol_counts();
add_warning_pragma();
add_stats();
add_symbol_enum();
add_symbol_names_list();
add_symbol_node_types_list();
add_symbol_metadata_list();
add_lex_function();
add_lex_states_list();
add_lex_modes_list();
if (!syntax_grammar.external_tokens.empty())
add_external_token_enum();
add_external_token_symbol_map();
add_external_scan_modes_list();
add_parse_table();
add_parser_export();
@ -112,10 +119,17 @@ class CCodeGenerator {
line();
}
void add_state_and_symbol_counts() {
void add_warning_pragma() {
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
}
void add_stats() {
line("#define STATE_COUNT " + to_string(parse_table.states.size()));
line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1));
line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
line();
}
@ -124,7 +138,7 @@ class CCodeGenerator {
indent([&]() {
size_t i = 1;
for (const auto &entry : parse_table.symbols) {
const rules::Symbol &symbol = entry.first;
const Symbol &symbol = entry.first;
if (!symbol.is_built_in()) {
line(symbol_id(symbol) + " = " + to_string(i) + ",");
i++;
@ -146,11 +160,11 @@ class CCodeGenerator {
line();
}
void add_symbol_node_types_list() {
void add_symbol_metadata_list() {
line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {");
indent([&]() {
for (const auto &entry : parse_table.symbols) {
const rules::Symbol &symbol = entry.first;
const Symbol &symbol = entry.first;
line("[" + symbol_id(symbol) + "] = {");
indent([&]() {
switch (symbol_type(symbol)) {
@ -198,13 +212,80 @@ class CCodeGenerator {
line();
}
void add_lex_states_list() {
line("static TSStateId ts_lex_states[STATE_COUNT] = {");
void add_lex_modes_list() {
add_external_tokens_id({});
line("static TSLexMode ts_lex_modes[STATE_COUNT] = {");
indent([&]() {
size_t state_id = 0;
for (const auto &state : parse_table.states)
line("[" + to_string(state_id++) + "] = " +
to_string(state.lex_state_id) + ",");
for (const auto &state : parse_table.states) {
line("[" + to_string(state_id++) + "] = {.lex_state = ");
add(to_string(state.lex_state_id));
set<Symbol::Index> external_token_indices;
for (const auto &pair : state.terminal_entries) {
Symbol symbol = pair.first;
if (symbol.is_external())
external_token_indices.insert(symbol.index);
}
if (!external_token_indices.empty())
add(", .external_tokens = " + add_external_tokens_id(external_token_indices));
add("},");
}
});
line("};");
line();
}
string add_external_tokens_id(set<Symbol::Index> external_token_ids) {
for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++)
if (external_token_id_sets[i] == external_token_ids)
return to_string(i);
external_token_id_sets.push_back(external_token_ids);
return to_string(external_token_id_sets.size() - 1);
}
void add_external_token_enum() {
line("enum {");
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++)
line(external_token_id(i) + ",");
});
line("};");
line();
}
void add_external_token_symbol_map() {
line("TSSymbol ts_external_token_symbol_map[EXTERNAL_TOKEN_COUNT] = {");
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ",");
}
});
line("};");
line();
}
void add_external_scan_modes_list() {
line("static bool ts_external_token_lists[");
add(to_string(external_token_id_sets.size()));
add("][EXTERNAL_TOKEN_COUNT] = {");
indent([&]() {
size_t i = 0;
for (const auto &external_token_ids : external_token_id_sets) {
if (!external_token_ids.empty()) {
line("[" + to_string(i) + "] = {");
indent([&]() {
for (Symbol::Index id : external_token_ids) {
line("[" + external_token_id(id) + "] = true,");
}
});
line("},");
}
i++;
}
});
line("};");
line();
@ -214,9 +295,6 @@ class CCodeGenerator {
add_parse_action_list_id(ParseTableEntry{ {}, false, false });
size_t state_id = 0;
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {");
indent([&]() {
@ -224,12 +302,12 @@ class CCodeGenerator {
line("[" + to_string(state_id++) + "] = {");
indent([&]() {
for (const auto &entry : state.nonterminal_entries) {
line("[" + symbol_id(rules::Symbol(entry.first)) + "] = STATE(");
line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE(");
add(to_string(entry.second));
add("),");
}
for (const auto &entry : state.terminal_entries) {
line("[" + symbol_id(rules::Symbol(entry.first, true)) + "] = ACTIONS(");
line("[" + symbol_id(entry.first) + "] = ACTIONS(");
add(to_string(add_parse_action_list_id(entry.second)));
add("),");
}
@ -242,12 +320,37 @@ class CCodeGenerator {
line();
add_parse_action_list();
line();
line("#pragma GCC diagnostic pop");
line();
}
void add_parser_export() {
line("EXPORT_LANGUAGE(ts_language_" + name + ");");
if (!syntax_grammar.external_tokens.empty()) {
string external_scanner_name = "ts_language_" + name + "_external_scanner";
line("void *" + external_scanner_name + "_create();");
line("bool " + external_scanner_name + "_scan();");
line("void " + external_scanner_name + "_destroy();");
line();
line("const TSLanguage *ts_language_" + name + "() {");
indent([&]() {
if (!syntax_grammar.external_tokens.empty()) {
line("GET_LANGUAGE(");
indent([&]() {
line(external_scanner_name + "_create,");
line(external_scanner_name + "_scan,");
line(external_scanner_name + "_destroy,");
});
line(");");
}
});
line("}");
} else {
line("const TSLanguage *ts_language_" + name + "() {");
indent([&]() {
line("GET_LANGUAGE();");
});
line("}");
}
line();
}
@ -379,22 +482,13 @@ class CCodeGenerator {
return result;
}
size_t add_in_progress_symbol_list_id(const set<rules::Symbol> &symbols) {
for (const auto &pair : in_progress_symbols) {
if (pair.second == symbols) {
return pair.first;
}
}
size_t result = next_in_progress_symbol_list_index;
in_progress_symbols.push_back({ result, symbols });
next_in_progress_symbol_list_index += 1 + symbols.size();
return result;
}
// Helper functions
string symbol_id(const rules::Symbol &symbol) {
string external_token_id(Symbol::Index index) {
return "ts_external_token_" + syntax_grammar.external_tokens[index];
}
string symbol_id(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return "ts_builtin_sym_end";
@ -411,25 +505,31 @@ class CCodeGenerator {
}
}
string symbol_name(const rules::Symbol &symbol) {
string symbol_name(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return "END";
return entry_for_symbol(symbol).first;
}
VariableType symbol_type(const rules::Symbol &symbol) {
VariableType symbol_type(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return VariableTypeHidden;
return entry_for_symbol(symbol).second;
}
pair<string, VariableType> entry_for_symbol(const rules::Symbol &symbol) {
if (symbol.is_token) {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
} else {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
pair<string, VariableType> entry_for_symbol(const Symbol &symbol) {
switch (symbol.type) {
case Symbol::NonTerminal: {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External: {
return { syntax_grammar.external_tokens[symbol.index], VariableTypeAnonymous };
}
}
}

View file

@ -12,6 +12,7 @@ struct Grammar {
std::vector<std::pair<std::string, rule_ptr>> rules;
std::vector<rule_ptr> extra_tokens;
std::vector<std::vector<std::string>> expected_conflicts;
std::vector<std::string> external_tokens;
};
} // namespace tree_sitter

View file

@ -119,6 +119,16 @@ ParseRuleResult parse_rule(json_value *rule_json) {
}
}
if (type == "EXTERNAL_TOKEN") {
json_value token_name_json = rule_json->operator[]("name");
if (token_name_json.type != json_string) {
error_message = "External token name must be a string";
goto error;
}
return { external_token(token_name_json.u.string.ptr), "" };
}
if (type == "PATTERN") {
json_value value_json = rule_json->operator[]("value");
if (value_json.type == json_string) {
@ -210,7 +220,7 @@ ParseGrammarResult parse_grammar(const string &input) {
string error_message;
string name;
Grammar grammar;
json_value name_json, rules_json, extras_json, conflicts_json;
json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
char parse_error[json_error_max];
@ -302,6 +312,25 @@ ParseGrammarResult parse_grammar(const string &input) {
}
}
external_tokens_json = grammar_json->operator[]("externals");
if (external_tokens_json.type != json_none) {
if (external_tokens_json.type != json_array) {
error_message = "External tokens must be an array";
goto error;
}
for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) {
json_value *token_name_json = external_tokens_json.u.array.values[i];
if (token_name_json->type != json_string) {
error_message = "External token values must be strings";
goto error;
}
string token_name = token_name_json->u.string.ptr;
grammar.external_tokens.push_back(token_name);
}
}
json_value_free(grammar_json);
return { name, grammar, "" };

View file

@ -1,6 +1,7 @@
#include "compiler/parse_table.h"
#include <string>
#include "compiler/precedence_range.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
@ -28,7 +29,7 @@ ParseAction::ParseAction()
extra(false),
fragile(false),
state_index(-1),
symbol(Symbol(-1)),
symbol(rules::NONE()),
consumed_symbol_count(0),
production(nullptr) {}
@ -43,11 +44,11 @@ ParseAction ParseAction::Accept() {
}
ParseAction ParseAction::Shift(ParseStateId state_index) {
return ParseAction(ParseActionTypeShift, state_index, Symbol(-1), 0, nullptr);
return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr);
}
ParseAction ParseAction::Recover(ParseStateId state_index) {
return ParseAction(ParseActionTypeRecover, state_index, Symbol(-1), 0,
return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0,
nullptr);
}
@ -150,9 +151,7 @@ bool ParseState::has_shift_action() const {
set<Symbol> ParseState::expected_inputs() const {
set<Symbol> result;
for (auto &entry : terminal_entries)
result.insert(Symbol(entry.first, true));
for (auto &entry : nonterminal_entries)
result.insert(Symbol(entry.first, false));
result.insert(entry.first);
return result;
}
@ -182,33 +181,24 @@ ParseStateId ParseTable::add_state() {
return states.size() - 1;
}
ParseAction &ParseTable::set_terminal_action(ParseStateId state_id,
Symbol::Index index,
ParseAction action) {
states[state_id].terminal_entries[index].actions.clear();
return add_terminal_action(state_id, index, action);
}
ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
Symbol::Index index,
Symbol lookahead,
ParseAction action) {
Symbol symbol(index, true);
if (action.type == ParseActionTypeShift && action.extra)
symbols[symbol].extra = true;
symbols[lookahead].extra = true;
else
symbols[symbol].structural = true;
symbols[lookahead].structural = true;
ParseTableEntry &entry = states[state_id].terminal_entries[index];
ParseTableEntry &entry = states[state_id].terminal_entries[lookahead];
entry.actions.push_back(action);
return *entry.actions.rbegin();
}
void ParseTable::set_nonterminal_action(ParseStateId state_id,
Symbol::Index index,
Symbol::Index lookahead,
ParseStateId next_state_id) {
Symbol symbol(index, false);
symbols[symbol].structural = true;
states[state_id].nonterminal_entries[index] = next_state_id;
symbols[Symbol(lookahead, Symbol::NonTerminal)].structural = true;
states[state_id].nonterminal_entries[lookahead] = next_state_id;
}
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
@ -226,12 +216,12 @@ bool ParseTable::merge_state(size_t i, size_t j) {
return false;
for (auto &entry : state.terminal_entries) {
Symbol::Index index = entry.first;
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(index);
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
@ -242,25 +232,25 @@ bool ParseTable::merge_state(size_t i, size_t j) {
}
}
set<Symbol::Index> symbols_to_merge;
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol::Index index = entry.first;
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(index)) {
if (mergeable_symbols.count(index) == 0 && !Symbol::is_built_in(index))
if (!state.terminal_entries.count(lookahead)) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(index);
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol::Index &index : symbols_to_merge)
state.terminal_entries[index] = other.terminal_entries.find(index)->second;
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}

View file

@ -76,7 +76,7 @@ class ParseState {
void each_referenced_state(std::function<void(ParseStateId *)>);
bool has_shift_action() const;
std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
std::map<rules::Symbol, ParseTableEntry> terminal_entries;
std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
LexStateId lex_state_id;
size_t shift_actions_signature;
@ -91,15 +91,14 @@ class ParseTable {
public:
std::set<rules::Symbol> all_symbols() const;
ParseStateId add_state();
ParseAction &add_terminal_action(ParseStateId state_id, int, ParseAction);
ParseAction &set_terminal_action(ParseStateId state_id, int index, ParseAction);
void set_nonterminal_action(ParseStateId state_id, int index, ParseStateId);
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
bool merge_state(size_t i, size_t j);
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
std::set<rules::Symbol::Index> mergeable_symbols;
std::set<rules::Symbol> mergeable_symbols;
};
} // namespace tree_sitter

View file

@ -39,7 +39,7 @@ class ExpandRepeats : public rules::IdentityRuleFn {
rule_ptr inner_rule = apply(rule->content);
size_t index = aux_rules.size();
string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
Symbol repeat_symbol(offset + index);
Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back(
Variable(helper_rule_name, VariableTypeAuxiliary,
@ -65,6 +65,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
result.variables = grammar.variables;
result.extra_tokens = grammar.extra_tokens;
result.expected_conflicts = grammar.expected_conflicts;
result.external_tokens = grammar.external_tokens;
ExpandRepeats expander(result.variables.size());
for (auto &variable : result.variables)

View file

@ -11,6 +11,7 @@
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/external_token.h"
#include "compiler/rules/pattern.h"
#include "compiler/prepare_grammar/token_description.h"
#include "compiler/prepare_grammar/is_token.h"
@ -38,7 +39,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
map<Symbol, Symbol> replacements;
Symbol replace_symbol(const Symbol &symbol) {
if (symbol.is_built_in() || symbol.is_token)
if (!symbol.is_non_terminal())
return symbol;
auto replacement_pair = replacements.find(symbol);
@ -49,7 +50,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
for (const auto &pair : replacements)
if (pair.first.index < symbol.index)
new_index--;
return Symbol(new_index);
return Symbol(new_index, Symbol::NonTerminal);
}
};
@ -60,14 +61,14 @@ class TokenExtractor : public rules::IdentityRuleFn {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
return make_shared<Symbol>(i, true);
return make_shared<Symbol>(i, Symbol::Terminal);
}
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back(Variable(token_description(rule), entry_type, rule));
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, true);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr apply_to(const rules::String *rule) {
@ -78,6 +79,10 @@ class TokenExtractor : public rules::IdentityRuleFn {
return apply_to_token(rule, VariableTypeAuxiliary);
}
rule_ptr apply_to(const rules::ExternalToken *rule) {
return apply_to_token(rule, VariableTypeAuxiliary);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->params.is_token)
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
@ -90,7 +95,7 @@ class TokenExtractor : public rules::IdentityRuleFn {
vector<Variable> tokens;
};
static CompileError ubiq_token_err(const string &message) {
static CompileError extra_token_error(const string &message) {
return CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: " + message);
}
@ -122,11 +127,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
size_t i = 0;
for (const Variable &variable : processed_variables) {
auto symbol = variable.rule->as<Symbol>();
if (symbol && symbol->is_token && !symbol->is_built_in() &&
extractor.token_usage_counts[symbol->index] == 1) {
if (symbol && symbol->is_token() && extractor.token_usage_counts[symbol->index] == 1) {
lexical_grammar.variables[symbol->index].type = variable.type;
lexical_grammar.variables[symbol->index].name = variable.name;
symbol_replacer.replacements.insert({ Symbol(i), *symbol });
symbol_replacer.replacements.insert({ Symbol(i, Symbol::NonTerminal), *symbol });
} else {
syntax_grammar.variables.push_back(variable);
}
@ -158,7 +162,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
bool used_elsewhere_in_grammar = false;
for (const Variable &variable : lexical_grammar.variables) {
if (variable.rule->operator==(*rule)) {
syntax_grammar.extra_tokens.insert(Symbol(i, true));
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
used_elsewhere_in_grammar = true;
}
i++;
@ -175,17 +179,20 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
auto symbol = rule->as<Symbol>();
if (!symbol)
return make_tuple(syntax_grammar, lexical_grammar,
ubiq_token_err(rule->to_string()));
extra_token_error(rule->to_string()));
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (!new_symbol.is_token)
if (!new_symbol.is_token()) {
return make_tuple(
syntax_grammar, lexical_grammar,
ubiq_token_err(syntax_grammar.variables[new_symbol.index].name));
extra_token_error(syntax_grammar.variables[new_symbol.index].name));
}
syntax_grammar.extra_tokens.insert(new_symbol);
}
syntax_grammar.external_tokens = grammar.external_tokens;
return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
}

View file

@ -92,6 +92,7 @@ pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &gr
SyntaxGrammar result;
result.expected_conflicts = grammar.expected_conflicts;
result.extra_tokens = grammar.extra_tokens;
result.external_tokens = grammar.external_tokens;
bool is_start = true;
for (const Variable &variable : grammar.variables) {

View file

@ -1,13 +1,12 @@
#ifndef COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
#define COMPILER_PREPARE_GRAMMAR_INITIAL_SYNTAX_GRAMMAR_H_
#include <vector>
#include <string>
#include <set>
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/syntax_grammar.h"
#include "compiler/variable.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -16,6 +15,7 @@ struct InitialSyntaxGrammar {
std::vector<Variable> variables;
std::set<rules::Symbol> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<std::string> external_tokens;
};
} // namespace prepare_grammar

View file

@ -7,6 +7,7 @@
#include "compiler/rules/visitor.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/external_token.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {
@ -17,6 +18,7 @@ using std::vector;
using std::set;
using std::pair;
using std::make_shared;
using rules::Symbol;
class InternSymbols : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
@ -30,17 +32,34 @@ class InternSymbols : public rules::IdentityRuleFn {
return result;
}
// Resolve an ExternalToken rule to its interned Symbol. If the name is not
// listed in the grammar's external tokens, record it so the caller can
// report an error, and substitute a Blank rule to keep the tree well-formed.
rule_ptr apply_to(const rules::ExternalToken *rule) {
auto result = symbol_for_external_token(rule->name);
if (!result.get()) {
missing_external_token_name = rule->name;
return rules::Blank::build();
}
return result;
}
public:
std::shared_ptr<rules::Symbol> symbol_for_rule_name(string rule_name) {
for (size_t i = 0; i < grammar.rules.size(); i++)
if (grammar.rules[i].first == rule_name)
return make_shared<rules::Symbol>(i);
return make_shared<Symbol>(i, Symbol::NonTerminal);
return nullptr;
}
// Look up `name` in the grammar's external-token list and return a Symbol of
// type External whose index is the token's position in that list.
// Returns nullptr when no external token has that name.
std::shared_ptr<rules::Symbol> symbol_for_external_token(string name) {
for (size_t i = 0; i < grammar.external_tokens.size(); i++)
if (grammar.external_tokens[i] == name)
return make_shared<rules::Symbol>(i, Symbol::External);
return nullptr;
}
explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
const Grammar grammar;
string missing_rule_name;
string missing_external_token_name;
};
CompileError missing_rule_error(string rule_name) {
@ -48,14 +67,22 @@ CompileError missing_rule_error(string rule_name) {
"Undefined rule '" + rule_name + "'");
}
// Build the compile error reported when a grammar references an external
// token that is not declared in its external-token list.
CompileError missing_external_token_error(string token_name) {
  string message = "Undefined external token '" + token_name + "'";
  return CompileError(TSCompileErrorTypeUndefinedSymbol, message);
}
pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
InternedGrammar result;
result.external_tokens = grammar.external_tokens;
InternSymbols interner(grammar);
for (auto &pair : grammar.rules) {
auto new_rule = interner.apply(pair.second);
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
if (!interner.missing_external_token_name.empty())
return { result, missing_external_token_error(interner.missing_external_token_name) };
result.variables.push_back(Variable(
pair.first, pair.first[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
@ -66,6 +93,8 @@ pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
auto new_rule = interner.apply(rule);
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
if (!interner.missing_external_token_name.empty())
return { result, missing_external_token_error(interner.missing_external_token_name) };
result.extra_tokens.push_back(new_rule);
}

View file

@ -15,6 +15,7 @@ struct InternedGrammar {
std::vector<Variable> variables;
std::vector<rule_ptr> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<std::string> external_tokens;
};
} // namespace prepare_grammar

View file

@ -22,6 +22,7 @@ rule_ptr prec_left(int precedence, const rule_ptr &);
rule_ptr prec_right(const rule_ptr &);
rule_ptr prec_right(int precedence, const rule_ptr &);
rule_ptr token(const rule_ptr &rule);
rule_ptr external_token(const std::string &);
} // namespace std

View file

@ -4,15 +4,15 @@ namespace tree_sitter {
namespace rules {
Symbol END_OF_INPUT() {
return Symbol(-1, true);
return Symbol(-1, Symbol::Terminal);
}
Symbol START() {
return Symbol(-2);
return Symbol(-2, Symbol::NonTerminal);
}
Symbol NONE() {
return Symbol(-3);
return Symbol(-3, Symbol::NonTerminal);
}
} // namespace rules

View file

@ -0,0 +1,39 @@
#include "compiler/rules/external_token.h"
#include <string>
#include "compiler/rules/visitor.h"
namespace tree_sitter {
namespace rules {
using std::string;
using std::hash;
// A grammar rule that refers to a token produced by a user-supplied
// external scanner, identified by name until symbols are interned.
ExternalToken::ExternalToken(const string &name) : name(name) {}
// Factory: wrap a name in a shared ExternalToken rule.
rule_ptr ExternalToken::build(const string &name) {
return std::make_shared<ExternalToken>(name);
}
// Two ExternalToken rules are equal iff they carry the same name.
bool ExternalToken::operator==(const Rule &rule) const {
auto other = rule.as<ExternalToken>();
return other && other->name == name;
}
// Hash is derived solely from the name, consistent with operator==.
size_t ExternalToken::hash_code() const {
return hash<string>()(name);
}
rule_ptr ExternalToken::copy() const {
return std::make_shared<ExternalToken>(*this);
}
// NOTE(review): prints the same "(sym '…')" form as named-symbol rules —
// confirm whether a distinct "(external …)" spelling was intended.
string ExternalToken::to_string() const {
return string("(sym '") + name + "')";
}
// Visitor dispatch (see Visitor::visit(const ExternalToken *)).
void ExternalToken::accept(Visitor *visitor) const {
visitor->visit(this);
}
} // namespace rules
} // namespace tree_sitter

View file

@ -0,0 +1,27 @@
#ifndef COMPILER_RULES_EXTERNAL_TOKEN_H_
#define COMPILER_RULES_EXTERNAL_TOKEN_H_
#include <string>
#include "compiler/rule.h"
namespace tree_sitter {
namespace rules {
// Rule node representing a token recognized by an external scanner rather
// than the generated lexer. It is identified by name; intern_symbols later
// resolves the name to a Symbol of type External.
class ExternalToken : public Rule {
public:
explicit ExternalToken(const std::string &);
// Convenience factory returning a shared rule pointer.
static rule_ptr build(const std::string &);
bool operator==(const Rule &other) const;
size_t hash_code() const;
rule_ptr copy() const;
std::string to_string() const;
void accept(Visitor *visitor) const;
// The external token's name as written in the grammar.
std::string name;
};
} // namespace rules
} // namespace tree_sitter
#endif // COMPILER_RULES_EXTERNAL_TOKEN_H_

View file

@ -13,6 +13,7 @@
#include "compiler/rules/pattern.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/external_token.h"
#include "compiler/rules/built_in_symbols.h"
namespace tree_sitter {
@ -105,4 +106,8 @@ rule_ptr token(const rule_ptr &rule) {
return metadata(rule, params);
}
// Public grammar-DSL helper: declare a token supplied by an external scanner.
rule_ptr external_token(const string &name) {
return rules::ExternalToken::build(name);
}
} // namespace tree_sitter

View file

@ -11,12 +11,10 @@ using std::string;
using std::to_string;
using util::hash_combine;
Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {}
Symbol::Symbol(Symbol::Index index, bool is_token) : index(index), is_token(is_token) {}
Symbol::Symbol(Symbol::Index index, Symbol::Type type) : index(index), type(type) {}
bool Symbol::operator==(const Symbol &other) const {
return (other.index == index) && (other.is_token == is_token);
return (other.index == index) && (other.type == type);
}
bool Symbol::operator==(const Rule &rule) const {
@ -27,7 +25,7 @@ bool Symbol::operator==(const Rule &rule) const {
size_t Symbol::hash_code() const {
size_t result = 0;
hash_combine(&result, index);
hash_combine(&result, is_token);
hash_combine<int>(&result, type);
return result;
}
@ -36,14 +34,20 @@ rule_ptr Symbol::copy() const {
}
string Symbol::to_string() const {
string name = is_token ? "token" : "sym";
return "(" + name + " " + std::to_string(index) + ")";
switch (type) {
case Symbol::Terminal:
return "(terminal " + std::to_string(index) + ")";
case Symbol::NonTerminal:
return "(non-terminal " + std::to_string(index) + ")";
case Symbol::External:
return "(external " + std::to_string(index) + ")";
}
}
bool Symbol::operator<(const Symbol &other) const {
if (is_token && !other.is_token)
if (type < other.type)
return true;
if (!is_token && other.is_token)
if (other.type < type)
return false;
return (index < other.index);
}
@ -56,6 +60,18 @@ bool Symbol::is_built_in() const {
return is_built_in(index);
}
bool Symbol::is_token() const {
return type == Symbol::Terminal;
}
bool Symbol::is_external() const {
return type == Symbol::External;
}
bool Symbol::is_non_terminal() const {
return type == Symbol::NonTerminal;
}
void Symbol::accept(Visitor *visitor) const {
visitor->visit(this);
}

View file

@ -11,9 +11,13 @@ class Symbol : public Rule {
public:
typedef int Index;
typedef enum {
Terminal,
NonTerminal,
External,
} Type;
explicit Symbol(Index index);
Symbol(Index index, bool is_token);
Symbol(Index index, Type type);
bool operator==(const Symbol &other) const;
bool operator==(const Rule &other) const;
@ -26,9 +30,12 @@ class Symbol : public Rule {
bool operator<(const Symbol &other) const;
static bool is_built_in(Index);
bool is_built_in() const;
bool is_token() const;
bool is_external() const;
bool is_non_terminal() const;
Index index;
bool is_token;
Type type;
};
} // namespace rules

View file

@ -16,6 +16,7 @@ class String;
class Symbol;
class Pattern;
class Metadata;
class ExternalToken;
class Visitor {
public:
@ -29,6 +30,7 @@ class Visitor {
virtual void visit(const String *rule) = 0;
virtual void visit(const NamedSymbol *rule) = 0;
virtual void visit(const Symbol *rule) = 0;
virtual void visit(const ExternalToken *rule) = 0;
virtual ~Visitor();
};
@ -86,6 +88,10 @@ class RuleFn : private Visitor {
return default_apply((const Rule *)rule);
}
virtual T apply_to(const ExternalToken *rule) {
return default_apply((const Rule *)rule);
}
void visit(const Blank *rule) {
value_ = apply_to(rule);
}
@ -126,6 +132,10 @@ class RuleFn : private Visitor {
value_ = apply_to(rule);
}
void visit(const ExternalToken *rule) {
value_ = apply_to(rule);
}
private:
T value_;
};
@ -170,6 +180,9 @@ class RuleFn<void> : private Visitor {
virtual void apply_to(const Symbol *rule) {
return default_apply((const Rule *)rule);
}
virtual void apply_to(const ExternalToken *rule) {
return default_apply((const Rule *)rule);
}
void visit(const Blank *rule) {
apply_to(rule);
@ -201,6 +214,9 @@ class RuleFn<void> : private Visitor {
void visit(const Symbol *rule) {
apply_to(rule);
}
void visit(const ExternalToken *rule) {
apply_to(rule);
}
};
class IdentityRuleFn : public RuleFn<rule_ptr> {

View file

@ -13,8 +13,6 @@ using std::pair;
using std::vector;
using std::set;
static const vector<Production> NO_PRODUCTIONS;
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)
: name(name), productions(productions), type(type) {}
@ -28,13 +26,4 @@ bool ProductionStep::operator==(const ProductionStep &other) const {
associativity == other.associativity;
}
const vector<Production> &SyntaxGrammar::productions(
const rules::Symbol &symbol) const {
if (symbol.is_built_in() || symbol.is_token) {
return NO_PRODUCTIONS;
} else {
return variables[symbol.index].productions;
}
}
} // namespace tree_sitter

View file

@ -33,11 +33,10 @@ struct SyntaxVariable {
typedef std::set<rules::Symbol> ConflictSet;
struct SyntaxGrammar {
const std::vector<Production> &productions(const rules::Symbol &) const;
std::vector<SyntaxVariable> variables;
std::set<rules::Symbol> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<std::string> external_tokens;
};
} // namespace tree_sitter

View file

@ -161,7 +161,7 @@ static void parser__pop_reusable_node_leaf(ReusableNode *reusable_node) {
static bool parser__can_reuse(Parser *self, TSStateId state, Tree *tree,
TableEntry *table_entry) {
if (tree->first_leaf.lex_state == self->language->lex_states[state])
if (tree->first_leaf.lex_state == self->language->lex_modes[state].lex_state)
return true;
if (!table_entry->is_reusable)
return false;
@ -209,7 +209,7 @@ static bool parser__condense_stack(Parser *self) {
}
static Tree *parser__lex(Parser *self, TSStateId parse_state) {
TSStateId start_state = self->language->lex_states[parse_state];
TSStateId start_state = self->language->lex_modes[parse_state].lex_state;
TSStateId current_state = start_state;
Length start_position = self->lexer.current_position;
LOG("lex state:%d", start_state);
@ -729,6 +729,9 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) {
LOG("new_parse");
}
if (self->language->external_scanner.create)
self->language->external_scanner.create();
ts_lexer_set_input(&self->lexer, input);
ts_stack_clear(self->stack);
self->reusable_node = (ReusableNode){ previous_tree, 0 };