【踩坑记录】用 tree-sitter 编写 parser 时,external scanner 的规则无法处理空白字符

问题描述

用 external scanner 编写了规则 square_start 和 square_end,分别标志方括号的起始和结束(中间允许出现方括号,但左右方括号的数量要匹配)。

// Token kinds produced by the external scanner.
// NOTE(review): the numeric values presumably have to follow the order of the
// grammar's `externals` list — confirm against grammar.js.
enum TokenType {
  SquareStart = 0,  // '[' seen while no bracket is open (squareCount == 0)
  SquareEnd = 1,    // ']' seen while exactly one bracket is open (squareCount == 1)
  SquareLeft = 2,   // '[' (counted) or a backslash followed by '[' (not counted)
  SquareRight = 3,  // ']' (counted) or a backslash followed by ']' (not counted)
};

// Persistent scanner state; it is copied byte-for-byte by the serialize and
// deserialize callbacks, so the field order is part of the serialized layout.
struct BracketState {
  uint32_t squareCount;  // number of currently open square brackets
  uint32_t parenStart;   // initialised to 0 on creation; not used by the scan functions shown here
};

typedef struct BracketState BracketState;

// Forward declarations of the per-token scan helpers defined further down.
// Each helper inspects lexer->lookahead, and returns true after consuming
// input and setting lexer->result_symbol, or false when its token does not match.
static bool scan_square_start(TSLexer *lexer, BracketState *state);
static bool scan_square_end(TSLexer *lexer, BracketState *state);
static bool scan_square_left(TSLexer *lexer, BracketState *state);
static bool scan_square_right(TSLexer *lexer, BracketState *state);

// Allocates the scanner state with ts_malloc and returns it with both
// counters set to zero. The returned pointer is the `payload` handed to the
// other scanner callbacks.
void *tree_sitter_cangjie_external_scanner_create(void) {
  BracketState *fresh = (BracketState *)ts_malloc(sizeof(BracketState));

  fresh->parenStart = 0;
  fresh->squareCount = 0;

  return fresh;
}

// Releases the scanner state by passing `payload` (the pointer returned by
// the create callback) to ts_free.
void tree_sitter_cangjie_external_scanner_destroy(void *payload) {
  ts_free(payload);
}

// Copies the raw BracketState bytes from `payload` into `buffer`.
//
// Returns the number of bytes written (sizeof(BracketState)), or 0 when
// either pointer is NULL and nothing was written.
unsigned tree_sitter_cangjie_external_scanner_serialize(void *payload,
                                                        char *buffer) {
  const unsigned state_size = sizeof(BracketState);

  if (payload != NULL && buffer != NULL) {
    memcpy(buffer, payload, state_size);
    return state_size;
  }

  // Invalid arguments: report that no state was stored.
  return 0;
}

void tree_sitter_cangjie_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
  if (length < sizeof(BracketState)) {
    return; // Invalid arguments or insufficient data
  }
  memcpy(payload, buffer, sizeof(BracketState));
}

// Entry point of the external scanner.
//
// Looks at `valid_symbols` in the fixed priority order SquareStart,
// SquareEnd, SquareLeft, SquareRight and delegates to the helper of the FIRST
// valid symbol only; the helper's result is returned as-is, and later symbols
// are not tried even if that helper fails. Returns false when none of the four
// symbols is valid.
bool tree_sitter_cangjie_external_scanner_scan(void *payload, TSLexer *lexer,
                                               const bool *valid_symbols) {
  BracketState *brackets = (BracketState *)payload;

  if (valid_symbols[SquareStart]) {
    return scan_square_start(lexer, brackets);
  }
  if (valid_symbols[SquareEnd]) {
    return scan_square_end(lexer, brackets);
  }
  if (valid_symbols[SquareLeft]) {
    return scan_square_left(lexer, brackets);
  }
  if (valid_symbols[SquareRight]) {
    return scan_square_right(lexer, brackets);
  }

  return false;
}

// Recognises the outermost opening bracket.
//
// Matches only when the lookahead is '[' and no bracket is currently open
// (squareCount == 0). On a match it consumes the '[', emits SquareStart and
// sets squareCount to 1. Returns false otherwise without touching the state.
bool scan_square_start(TSLexer *lexer, BracketState *state) {
  const bool at_open_bracket = lexer->lookahead == '[';
  const bool nothing_open = state->squareCount == 0;

  if (!(at_open_bracket && nothing_open)) {
    return false;
  }

  lexer->advance(lexer, false);
  lexer->result_symbol = SquareStart;
  state->squareCount = 1;
  return true;
}

// Recognises the outermost closing bracket.
//
// Matches only when the lookahead is ']' and exactly one bracket is open
// (squareCount == 1). On a match it consumes the ']', emits SquareEnd and
// resets squareCount to 0. Returns false otherwise without touching the state.
bool scan_square_end(TSLexer *lexer, BracketState *state) {
  const bool at_close_bracket = lexer->lookahead == ']';
  const bool only_outermost_open = state->squareCount == 1;

  if (!(at_close_bracket && only_outermost_open)) {
    return false;
  }

  lexer->advance(lexer, false);
  lexer->result_symbol = SquareEnd;
  state->squareCount = 0;
  return true;
}

// Recognises an inner opening bracket.
//
// Two forms produce SquareLeft:
//   - a plain '[': consumed, and squareCount is incremented;
//   - a backslash followed by '[': the backslash is advanced over with the
//     second argument of advance() set to true, then '[' is consumed;
//     squareCount is left unchanged.
// Returns false for any other lookahead, and also when a backslash is not
// followed by '[' (the backslash has already been advanced over by then).
bool scan_square_left(TSLexer *lexer, BracketState *state) {
  if (lexer->lookahead == '[') {
    lexer->advance(lexer, false);
    lexer->result_symbol = SquareLeft;
    state->squareCount += 1;
    return true;
  }

  if (lexer->lookahead != '\\') {
    return false;
  }

  // Escaped form: step over the backslash, then require '['.
  lexer->advance(lexer, true);
  if (lexer->lookahead != '[') {
    return false;
  }

  lexer->advance(lexer, false);
  lexer->result_symbol = SquareLeft;
  return true;
}

// Recognises an inner closing bracket.
//
// Two forms produce SquareRight:
//   - a plain ']': consumed, and squareCount is decremented;
//   - a backslash followed by ']': the backslash is advanced over with the
//     second argument of advance() set to true, then ']' is consumed;
//     squareCount is left unchanged.
// Returns false for any other lookahead, and also when a backslash is not
// followed by ']' (the backslash has already been advanced over by then).
// NOTE(review): squareCount is unsigned and is decremented without a check;
// presumably the grammar only allows this token while a bracket is open — confirm.
bool scan_square_right(TSLexer *lexer, BracketState *state) {
  if (lexer->lookahead == ']') {
    lexer->advance(lexer, false);
    lexer->result_symbol = SquareRight;
    state->squareCount -= 1;
    return true;
  }

  if (lexer->lookahead != '\\') {
    return false;
  }

  // Escaped form: step over the backslash, then require ']'.
  lexer->advance(lexer, true);
  if (lexer->lookahead != ']') {
    return false;
  }

  lexer->advance(lexer, false);
  lexer->result_symbol = SquareRight;
  return true;
}

测试一个简单的规则:

// Minimal test rule: the whole file is one square_start token followed by one square_end token.
source_file: $ => seq(
	$.square_start, $.square_end
)

[](方括号之间没有空格)可以匹配,但是 [ ](方括号之间有一个空格)却匹配不成功。

问题解决

我在extras中添加了:

// Extras: a run of one or more spaces, tabs, vertical tabs or form feeds.
extras: $ => [
	/[ \t\v\f]+/,
],

但是 extras 中的规则不会自动作用于 external scanner,空白字符需要在 scanner 中手动跳过:在 tree_sitter_your_language_external_scanner_scan 函数(本文中即 tree_sitter_cangjie_external_scanner_scan)的开头添加下面的代码:

// Skip leading blanks (the same characters as the grammar's extras rule).
// The second argument of advance() is true so that the blanks are treated as
// skipped whitespace and are not included in the token the scanner emits.
while (lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
       lexer->lookahead == '\f' || lexer->lookahead == '\v') {
	lexer->advance(lexer, true);
}

再次运行结果正常,问题解决✓

posted @ 2024-12-27 17:20  laditor  阅读(0)  评论(0)  编辑  收藏  举报