【踩坑记录】用 tree-sitter 编写 parser 时,external scanner 规则无法处理空白字符
问题描述
用 external scanner 编写了规则 `square_start` 和 `square_end`,分别标志方括号的起始和结束(中间允许出现方括号,但左右方括号的数量必须匹配)。
/*
 * Token kinds handled by the external scanner.
 *
 * The numeric values are used as indices into the valid_symbols array that
 * is passed to the scan function and are stored in lexer->result_symbol.
 * NOTE(review): the order presumably has to mirror the `externals` list of
 * the grammar -- confirm against grammar.js before reordering.
 */
enum TokenType {
    SquareStart, /* '[' accepted while squareCount is 0 */
    SquareEnd,   /* ']' accepted while squareCount is 1 */
    SquareLeft,  /* '[' (or an escaped "\[") inside a bracket pair */
    SquareRight, /* ']' (or an escaped "\]") inside a bracket pair */
};
/*
 * Persistent state of the external scanner. The whole struct is copied
 * byte-for-byte by the serialize/deserialize functions.
 */
typedef struct BracketState {
    /*
     * Number of currently open square brackets: set to 1 by
     * scan_square_start, incremented by scan_square_left, decremented by
     * scan_square_right and reset to 0 by scan_square_end.
     */
    uint32_t squareCount;

    /* Initialised to 0 on creation; not read by any scanner shown here. */
    uint32_t parenStart;
} BracketState;
// Forward declarations of the per-token scanners defined below.
// Each one inspects lexer->lookahead, returns true when it recognised its
// token (after setting lexer->result_symbol) and false otherwise.
static bool scan_square_start(TSLexer *lexer, BracketState *state);
static bool scan_square_end(TSLexer *lexer, BracketState *state);
static bool scan_square_left(TSLexer *lexer, BracketState *state);
static bool scan_square_right(TSLexer *lexer, BracketState *state);
void *tree_sitter_cangjie_external_scanner_create(void) {
BracketState* state = (BracketState*)ts_malloc(sizeof(BracketState));
state->squareCount = 0;
state->parenStart = 0;
return state;
}
// Releases the scanner state by handing payload straight to ts_free.
// payload is presumably the pointer returned by the create function.
void tree_sitter_cangjie_external_scanner_destroy(void *payload) {
ts_free(payload);
}
/*
 * Copies the raw bytes of the BracketState behind payload into buffer.
 *
 * Returns the number of bytes written (sizeof(BracketState)), or 0 when
 * either pointer is NULL and nothing was written.
 */
unsigned tree_sitter_cangjie_external_scanner_serialize(void *payload,
                                                        char *buffer) {
    const unsigned state_size = sizeof(BracketState);

    if (payload != NULL && buffer != NULL) {
        memcpy(buffer, payload, state_size);
        return state_size;
    }

    // Invalid arguments: report that no bytes were produced
    return 0;
}
void tree_sitter_cangjie_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
if (length < sizeof(BracketState)) {
return; // Invalid arguments or insufficient data
}
memcpy(payload, buffer, sizeof(BracketState));
}
/*
 * Entry point of the external scanner.
 *
 * Picks the first token kind, in the order SquareStart, SquareEnd,
 * SquareLeft, SquareRight, that valid_symbols marks as acceptable and
 * returns the result of the matching scanner. Only that one scanner is
 * tried: if it fails, the remaining kinds are not attempted.
 *
 * The lookahead character is inspected exactly as it is; nothing is
 * skipped here, so a blank in front of a bracket makes the chosen scanner
 * return false.
 *
 * Returns true when a token was recognised, false otherwise (including
 * when none of the four kinds is valid).
 */
bool tree_sitter_cangjie_external_scanner_scan(void *payload, TSLexer *lexer,
                                               const bool *valid_symbols) {
    BracketState *brackets = (BracketState *)payload;

    if (valid_symbols[SquareStart]) {
        return scan_square_start(lexer, brackets);
    }
    if (valid_symbols[SquareEnd]) {
        return scan_square_end(lexer, brackets);
    }
    if (valid_symbols[SquareLeft]) {
        return scan_square_left(lexer, brackets);
    }
    if (valid_symbols[SquareRight]) {
        return scan_square_right(lexer, brackets);
    }

    return false;
}
/*
 * Recognises the opening '[' of a bracket pair.
 *
 * Succeeds only when the lookahead is '[' and no bracket is currently open
 * (squareCount == 0). On success the '[' is consumed, the token is reported
 * as SquareStart and squareCount becomes 1.
 */
bool scan_square_start(TSLexer *lexer, BracketState *state) {
    const bool at_open_bracket = lexer->lookahead == '[';
    const bool outside_brackets = state->squareCount == 0;

    if (!(at_open_bracket && outside_brackets)) {
        return false;
    }

    lexer->advance(lexer, false);
    state->squareCount = 1;
    lexer->result_symbol = SquareStart;
    return true;
}
/*
 * Recognises the closing ']' of a bracket pair.
 *
 * Succeeds only when the lookahead is ']' and exactly one bracket is open
 * (squareCount == 1). On success the ']' is consumed, the token is reported
 * as SquareEnd and squareCount returns to 0.
 */
bool scan_square_end(TSLexer *lexer, BracketState *state) {
    const bool at_close_bracket = lexer->lookahead == ']';
    const bool one_bracket_open = state->squareCount == 1;

    if (!(at_close_bracket && one_bracket_open)) {
        return false;
    }

    lexer->advance(lexer, false);
    state->squareCount = 0;
    lexer->result_symbol = SquareEnd;
    return true;
}
/*
 * Recognises a '[' that appears inside a bracket pair.
 *
 * - A plain '[' is consumed, reported as SquareLeft and increments
 *   squareCount.
 * - A backslash followed by '[' is also reported as SquareLeft, but leaves
 *   squareCount untouched. The backslash is advanced over with skip = true.
 * - Anything else (including a backslash not followed by '[') yields false.
 */
bool scan_square_left(TSLexer *lexer, BracketState *state) {
    switch (lexer->lookahead) {
    case '[':
        lexer->advance(lexer, false);
        state->squareCount++;
        lexer->result_symbol = SquareLeft;
        return true;

    case '\\':
        // Step over the backslash, then require the escaped bracket itself
        lexer->advance(lexer, true);
        if (lexer->lookahead != '[') {
            return false;
        }
        lexer->advance(lexer, false);
        lexer->result_symbol = SquareLeft;
        return true;

    default:
        return false;
    }
}
/*
 * Recognises a ']' that appears inside a bracket pair.
 *
 * - A plain ']' is consumed, reported as SquareRight and decrements
 *   squareCount.
 * - A backslash followed by ']' is also reported as SquareRight, but leaves
 *   squareCount untouched. The backslash is advanced over with skip = true.
 * - Anything else (including a backslash not followed by ']') yields false.
 *
 * NOTE(review): squareCount is unsigned, so the decrement wraps around if
 * this is ever reached with squareCount == 0 -- confirm the grammar cannot
 * request SquareRight in that state.
 */
bool scan_square_right(TSLexer *lexer, BracketState *state) {
    switch (lexer->lookahead) {
    case ']':
        lexer->advance(lexer, false);
        state->squareCount--;
        lexer->result_symbol = SquareRight;
        return true;

    case '\\':
        // Step over the backslash, then require the escaped bracket itself
        lexer->advance(lexer, true);
        if (lexer->lookahead != ']') {
            return false;
        }
        lexer->advance(lexer, false);
        lexer->result_symbol = SquareRight;
        return true;

    default:
        return false;
    }
}
测试一个简单的规则:
// Minimal test rule: the whole file is a single square_start token
// followed by a single square_end token.
source_file: $ => seq(
$.square_start, $.square_end
)
`[]` 可以匹配,但是 `[ ]`(两个方括号之间有一个空格)却匹配不成功。
问题解决
我在 `extras` 中添加了空白字符规则:
// Declares runs of space, tab, vertical tab and form feed as extras
// (line breaks are not part of this pattern).
extras: $ => [
/[ \t\v\f]+/,
],
但是 external scanner 不会自动应用上述 `extras` 规则,空白字符需要在 scanner 中手动处理:在 `tree_sitter_your_language_external_scanner_scan` 函数的开头添加下面的代码:
// Skip leading blanks (space, tab, form feed, vertical tab) before looking
// for a bracket. skip = true marks them as whitespace, so they are not
// included in the text range of the token returned afterwards.
while (lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
       lexer->lookahead == '\f' || lexer->lookahead == '\v') {
    lexer->advance(lexer, true);
}
再次运行结果正常,问题解决✓