iconv编码转换

当前测试pc的编码为:UTF-8

/* Create a conversion descriptor that converts text from encoding `fromcode`
 * to encoding `tocode`. Note the order: the target encoding comes first. */
iconv_t iconv_open(const char *tocode, const char *fromcode);
/* Convert bytes from *inbuf into *outbuf using descriptor `cd`.
 * The call moves *inbuf and *outbuf forward and updates the two byte
 * counters, so pass copies of any pointer that is still needed afterwards. */
size_t iconv(iconv_t cd,
                    char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft);
/* Release a conversion descriptor obtained from iconv_open(). */
int iconv_close(iconv_t cd);

utf-8转gb2312

int i = 0;
char *inbuf = "王浡";
size_t inbytesleft = strlen(inbuf);
size_t outbytesleft = 3 * inbytesleft;
char *outbuf = (char*)malloc(outbytesleft);
memset(outbuf, 0, outbytesleft);
size_t bytes = outbytesleft;

char *ib = inbuf;
char *ob = outbuf;

//iconv_t cd = iconv_open("gbk", "utf-8");
iconv_t cd = iconv_open("gb2312", "utf-8");

int ret = iconv(cd, &ib, &inbytesleft, &ob, &outbytesleft);
if(ret < 0)
{
    perror("iconv");
    return -1;
}

bytes -= outbytesleft;

printf("inbuf: %s\n", inbuf);
printf("inbytesleft: %zu\n", strlen(inbuf));
printf("outbuf: ");
for(i = 0; i < bytes; i++)
{
    printf("0x%x ", *(unsigned char*)&outbuf[i]);
}
printf("\n");
printf("outbytesleft: %zu\n", outbytesleft);

iconv_close(cd);
free(outbuf);
# ./a.out 
iconv: Invalid or incomplete multibyte or wide character

GB2312编码适用于汉字处理、汉字通信等系统之间的信息交换,通行于中国大陆
GBK编码支持国际标准和国家标准中的全部中日韩汉字
“浡”不在GB2312字符集中(其GBK编码为0x9bc2,超出GB2312的编码范围),所以上面的转换失败;将gb2312换成gbk即可

# ./a.out     
inbuf: 王浡
inbytesleft: 6
outbuf: 0xcd 0xf5 0x9b 0xc2 
outbytesleft: 14

注意:iconv会移动传入的inbuf、outbuf指针,并更新inbytesleft、outbytesleft,因此应传入指针的副本(如上面的ib、ob),保留原始指针用于打印和free

unicode转utf-8

int i = 0;
char inbuf[] = {0x8b, 0x73, 0x61, 0x6d};
size_t inbytesleft = sizeof(inbuf);
size_t outbytesleft = 3 * inbytesleft;
char *outbuf = (char*)malloc(outbytesleft);
memset(outbuf, 0, outbytesleft);
size_t bytes = outbytesleft;

char *ib = inbuf;
char *ob = outbuf;

iconv_t cd = iconv_open("utf-8", "unicode");

int ret = iconv(cd, &ib, &inbytesleft, &ob, &outbytesleft);
if(ret < 0)
{
    perror("iconv");
    return -1;
}

bytes -= outbytesleft;

printf("inbytesleft: %zu\n", strlen(inbuf));
printf("outbuf: %s\n\t", outbuf);
for(i = 0; i < bytes; i++)
{
    printf("0x%x ", *(unsigned char*)&outbuf[i]);
}
printf("\n");
printf("outbytesleft: %zu\n", outbytesleft);

iconv_close(cd);
free(outbuf);
./a.out    
inbytesleft: 4
outbuf: 王浡
        0xe7 0x8e 0x8b 0xe6 0xb5 0xa1 
outbytesleft: 6

转换过程

unicode(16进制 2个字节)      utf-8(2进制 3个字节)
0800 - FFFF                 1110xxxx 10xxxxxx 10xxxxxx

王:11100111 10001110 10001011    --> 01110011 10001011   --> 0x73 0x8b
浡:11100110 10110101 10100001    --> 01101101 01100001   --> 0x6d 0x61

命令

Usage: iconv [OPTION...] [FILE...]

-f:输入编码
-t:输出编码
-l:列举所有已知的字符集
-c:输出中忽略无效字符
-o:输出文件
-s:关闭警告

# iconv -f GB2312 -t utf-8 text.gb2312 -o text.utf8

官网http://www.gnu.org/savannah-checkouts/gnu/libiconv/

posted @ 2017-05-26 17:46  thomas_blog  阅读(988)  评论(0)  编辑  收藏  举报