iconv编码转换
当前测试pc的编码为:UTF-8
/* Open a conversion descriptor; note that the target encoding (tocode) comes first. */
iconv_t iconv_open(const char *tocode, const char *fromcode);
/*
 * Convert bytes from *inbuf into *outbuf using descriptor cd.
 * The examples in these notes treat a negative return as failure and
 * report it with perror(). The call updates the pointers and the
 * remaining-byte counters passed by address, so pass copies of any
 * pointer that is still needed afterwards (e.g. for free()).
 */
size_t iconv(iconv_t cd,
char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft);
/* Release a descriptor obtained from iconv_open(). */
int iconv_close(iconv_t cd);
utf-8转gb2312
/*
 * Example: convert the UTF-8 string "王浡" to GB2312 and dump the
 * converted bytes in hex.
 *
 * Every resource acquired here (output buffer, conversion descriptor)
 * is released on all paths, including the error paths.
 */
size_t i = 0;
/* Writable array rather than a string literal: iconv() takes char **. */
char inbuf[] = "王浡";
size_t inbytesleft = strlen(inbuf);
/* Generous upper bound for the converted text. */
size_t outbytesleft = 3 * inbytesleft;
/* Remember the capacity so the number of bytes written can be derived. */
size_t bytes = outbytesleft;
char *outbuf = calloc(outbytesleft, 1);
if (outbuf == NULL)
{
    perror("calloc");
    return -1;
}
/* iconv() advances the pointers it is given, so hand it copies and keep
 * inbuf/outbuf pointing at the start for printing and free(). */
char *ib = inbuf;
char *ob = outbuf;
/* "浡" is not representable in GB2312, so this conversion fails;
 * use "gbk" as the target encoding to convert it successfully. */
iconv_t cd = iconv_open("gb2312", "utf-8");
if (cd == (iconv_t)-1)
{
    perror("iconv_open");
    free(outbuf);
    return -1;
}
/* iconv() returns size_t; failure is reported as (size_t)-1. */
size_t ret = iconv(cd, &ib, &inbytesleft, &ob, &outbytesleft);
if (ret == (size_t)-1)
{
    /* Report first: the cleanup calls below may overwrite errno. */
    perror("iconv");
    iconv_close(cd);
    free(outbuf);
    return -1;
}
/* bytes written = capacity - space left */
bytes -= outbytesleft;
printf("inbuf: %s\n", inbuf);
printf("inbytesleft: %zu\n", strlen(inbuf));
printf("outbuf: ");
for (i = 0; i < bytes; i++)
{
    printf("0x%x ", (unsigned char)outbuf[i]);
}
printf("\n");
printf("outbytesleft: %zu\n", outbytesleft);
iconv_close(cd);
free(outbuf);
# ./a.out
iconv: Invalid or incomplete multibyte or wide character
GB2312编码适用于汉字处理、汉字通信等系统之间的信息交换,通行于中国大陆,但只收录了6763个常用汉字,不包含"浡",所以上面的转换失败
GBK编码支持国际标准和国家标准中的全部中日韩汉字
将iconv_open的目标编码由gb2312换成gbk即可正常转换
# ./a.out
inbuf: 王浡
inbytesleft: 6
outbuf: 0xcd 0xf5 0x9b 0xc2
outbytesleft: 14
注意:iconv会修改传入的inbuf、outbuf指针(以及inbytesleft、outbytesleft),调用后它们不再指向缓冲区起始位置;
因此应像示例那样传入副本指针(ib、ob),保留原始地址用于打印和free
unicode转utf-8
int i = 0;
char inbuf[] = {0x8b, 0x73, 0x61, 0x6d};
size_t inbytesleft = sizeof(inbuf);
size_t outbytesleft = 3 * inbytesleft;
char *outbuf = (char*)malloc(outbytesleft);
memset(outbuf, 0, outbytesleft);
size_t bytes = outbytesleft;
char *ib = inbuf;
char *ob = outbuf;
iconv_t cd = iconv_open("utf-8", "unicode");
int ret = iconv(cd, &ib, &inbytesleft, &ob, &outbytesleft);
if(ret < 0)
{
perror("iconv");
return -1;
}
bytes -= outbytesleft;
printf("inbytesleft: %zu\n", strlen(inbuf));
printf("outbuf: %s\n\t", outbuf);
for(i = 0; i < bytes; i++)
{
printf("0x%x ", *(unsigned char*)&outbuf[i]);
}
printf("\n");
printf("outbytesleft: %zu\n", outbytesleft);
iconv_close(cd);
free(outbuf);
./a.out
inbytesleft: 4
outbuf: 王浡
0xe7 0x8e 0x8b 0xe6 0xb5 0xa1
outbytesleft: 6
转换过程:
unicode(16进制 2个字节) utf-8(2进制 3个字节)
0800 - FFFF 1110xxxx 10xxxxxx 10xxxxxx
王:11100111 10001110 10001011 --> 01110011 10001011 --> 0x73 0x8b
浡:11100110 10110101 10100001 --> 01101101 01100001 --> 0x6d 0x61
注:上面的0x73 0x8b、0x6d 0x61是高字节在前的写法;示例代码中的inbuf按小端(低字节在前)存放,即0x8b 0x73 0x61 0x6d
命令
Usage: iconv [OPTION...] [FILE...]
-f:输入编码
-t:输出编码
-l:列举所有已知的字符集
-c:输出中忽略无效字符
-o:输出文件
-s:关闭警告
# iconv -f GB2312 -t utf-8 text.gb2312 -o text.utf8