Ubuntu下C++使用icu库检测字符编码
Ubuntu下C++使用icu库检测字符编码。需先安装libicu-dev库:
sudo apt install libicu-dev
C++代码如下:
//g++ -o x x.cpp -licuuc -licui18n #include <stdio.h> #include <string.h> #include <unicode/ucnv.h> #include <unicode/utypes.h> #include <unicode/ucsdet.h> #define BUF_MAX 4096 /* * data, 传入参数, 需要探测的字符串 * len, 传入参数, 探测字符串长度 * detected 传出参数, 探测的最有可能的字符编码名称, 调用者需要释放该字段 **/ bool detectTextEncoding(const char *data, int32_t len, char **detected) { UCharsetDetector *csd; const UCharsetMatch **csm; int32_t match, matchCount = 0; UErrorCode status = U_ZERO_ERROR; csd = ucsdet_open(&status); if (status != U_ZERO_ERROR) return false; ucsdet_setText(csd, data, len, &status); if (status != U_ZERO_ERROR) return false; csm = ucsdet_detectAll(csd, &matchCount, &status); if (status != U_ZERO_ERROR) return false; #if 0 //打印出探测的可能的编码 for(match = 0; match < matchCount; match += 1) { const char *name = ucsdet_getName(csm[match], &status); const char *lang = ucsdet_getLanguage(csm[match], &status); int32_t confidence = ucsdet_getConfidence(csm[match], &status); if (lang == NULL || strlen(lang) == 0) lang = "**"; printf("%s (%s) %d\n", name, lang, confidence); } #endif if (matchCount > 0) { *detected = strdup(ucsdet_getName(csm[0], &status)); //分配了内存, 需要释放 if (status != U_ZERO_ERROR) return false; } printf("charset = %s\n", *detected); ucsdet_close(csd); return true; } /* * toConverterName, 转换后的字符编码 * fromConverterName, 转换前的字符编码 * target, 存储转换后的字符串, 传出参数 * targetCapacity, 存储容量,target的大小 * source, 需要转换的字符串 * sourceLength, source的大小 **/ int convert(const char *toConverterName, const char *fromConverterName, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength) { UErrorCode error = U_ZERO_ERROR; ucnv_convert(toConverterName, fromConverterName, target, targetCapacity, source, sourceLength, &error); return error; } int main(int argc, char **argv) { if (argc <= 1) { printf("Usage: %s [filename]...\n", argv[0]); return -1; } FILE *file; char *filename = argv[1]; file = fopen(filename, "rb"); if (file == NULL) { printf("Cannot open file \"%s\"\n\n", filename); return -1; } int len = 0; char *detected = NULL; char *buffer = new char[BUF_MAX]; char *target = new char[BUF_MAX * 2]; while (true) { memset(buffer, 0, BUF_MAX); memset(target, 0, BUF_MAX * 2); len = (int32_t) fread(buffer, sizeof(char), BUF_MAX, file); if (detected == NULL) { if (!detectTextEncoding(buffer, len, &detected)) //编码探测 break; } //转换为utf8字符编码 if (convert("UTF-8", detected, target, BUF_MAX * 2, (const char *) buffer, len) != U_ZERO_ERROR) { printf("ucnv_convert error"); break; } printf("%s", target); //打印出转换的文件的字符串 if (len < BUF_MAX) break; } delete[] buffer; delete[] target; delete[] detected; fclose(file); return 0; }
测试一下,正常检测出了当前文件编码: