用 ICU 探测字符集

编译命令：g++ charset.cpp -licui18n -licuuc

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <string>
#include <vector>

#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
//#include <unicode/urename.h>
#include <unicode/utypes.h>

bool what_charset(const char *data, int len, char **detected)
{
UCharsetDetector* csd;
const UCharsetMatch **csm;
int match, matchCount = 0;
UErrorCode status = U_ZERO_ERROR;
csd = ucsdet_open(&status);
if(status != U_ZERO_ERROR)
return false;
ucsdet_setText(csd, data, len, &status);
if(status != U_ZERO_ERROR)
return false;

csm = ucsdet_detectAll(csd, &matchCount, &status);
if(status != U_ZERO_ERROR)
return false;
#if 1 //打印出探测的可能的编码
for(match = 0; match < matchCount; match += 1)
{
const char *name = ucsdet_getName(csm[match], &status);
const char *lang = ucsdet_getLanguage(csm[match], &status);
int confidence = ucsdet_getConfidence(csm[match], &status);
if (lang == NULL || strlen(lang) == 0)
lang = "**";
printf("%s (%s) %d\n", name, lang, confidence);
}
#endif
if(matchCount > 0)
{
*detected = strdup(ucsdet_getName(csm[0], &status)); //分配了内
if(status != U_ZERO_ERROR)
return false;
}

printf("charset = %s n", *detected);
ucsdet_close(csd);

return true;
}

int main(int argc, char* argv[])
{

std::string data = "This is a 测试用数据";
char buf[128];
char *str[1];
str[0] = buf;
int convert_flag = 0;
int subject_length = data.length();

what_charset((const char *)data.c_str(), data.length(), str);
std::string encoding = str[0];
if(encoding != "UTF-8")
{
int clen = 0;
convert_flag = 1;
UErrorCode error = U_ZERO_ERROR;
char *u8_data = (char *)malloc(subject_length * 2);
if(u8_data == NULL)
{
return -1;
}
clen = ucnv_convert("UTF-8", (char *)encoding.c_str(), (char *)u8_data, subject_length*2, (char *)data.c_str(), subject_length, &error);
if(error != U_ZERO_ERROR)
{
free((void *)u8_data);
return -1;
}
}

return 0;
}

posted on 2023-12-13 14:57 北京开发阅读(80) 评论(0) 收藏举报

刷新页面返回顶部

用icu探测字符集

公告