/* ucs2-utf8: convert character sets between UTF-8 and UCS-2 (ONLY for Simplified Chinese and ASCII, on little-endian computers). */
/*
 * Conversion between NUL-terminated UTF-8 and NUL-terminated UCS-2 strings.
 * Source: blog post by dreamboy2000, 2021-09-07 19:11.
 *
 * UCS-2 value range      UTF-8 encoding
 * 0x0000 - 0x007F        0xxxxxxx
 * 0x0080 - 0x07FF        110xxxxx 10xxxxxx
 * 0x0800 - 0xFFFF        1110xxxx 10xxxxxx 10xxxxxx
 *
 * Worked example, code point U+4E00:
 *   UTF-8  E4 B8 80  =  1110 0100 | 10 111000 | 10 000000
 *   payload bits     =       0100 |    111000 |    000000
 *   UCS-2            =  0100 1110 0000 0000 = 0x4E00
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned short ucs2; /* one UCS-2 code unit (16 bits) */
typedef unsigned char utf8;  /* one UTF-8 code unit (8 bits) */

/* Substituted for anything that cannot be converted (U+FFFD). */
enum { UCS2_REPLACEMENT = 0xFFFD };

/* Returns non-zero when b has the form 10xxxxxx (a UTF-8 continuation byte). */
static int is_continuation(utf8 b)
{
    return (b & 0xC0) == 0x80;
}

/*
 * Convert a UTF-8 string into UCS-2.
 *
 * @Params
 *  dst : Output buffer. The pointers are not checked. The result is always
 *        terminated with a zero code unit. At most one code unit is written
 *        per input byte, so room for strlen(src) + 1 code units is enough.
 *  src : NUL-terminated UTF-8 input.
 *
 * Bytes that do not start a well-formed 1-, 2- or 3-byte sequence (truncated
 * or overlong sequences, encoded surrogates, stray continuation bytes and
 * 4-byte sequences, which UCS-2 cannot represent) each produce one
 * UCS2_REPLACEMENT code unit.
 *
 * @Return value:
 *  dst.
 *
 * see also ucs2utf.
 */
ucs2 *utf2ucs(ucs2 *dst, const utf8 *src)
{
    size_t in = 0;
    size_t out = 0;

    while (src[in] != 0) {
        const utf8 lead = src[in];
        unsigned int cp = UCS2_REPLACEMENT;
        size_t used = 1;

        if (lead < 0x80) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            /* src[in + 1] is at worst the terminator, so the read is in bounds. */
            if (is_continuation(src[in + 1])) {
                const unsigned int v =
                    ((lead & 0x1Fu) << 6) | (src[in + 1] & 0x3Fu);
                if (v >= 0x80) { /* reject overlong forms */
                    cp = v;
                    used = 2;
                }
            }
        } else if ((lead & 0xF0) == 0xE0) {
            /* Short-circuit: src[in + 2] is only read once src[in + 1] is
             * known to be a (non-NUL) continuation byte. */
            if (is_continuation(src[in + 1]) && is_continuation(src[in + 2])) {
                const unsigned int v = ((lead & 0x0Fu) << 12)
                                     | ((src[in + 1] & 0x3Fu) << 6)
                                     | (src[in + 2] & 0x3Fu);
                /* reject overlong forms and the surrogate range */
                if (v >= 0x800 && !(v >= 0xD800 && v <= 0xDFFF)) {
                    cp = v;
                    used = 3;
                }
            }
        }

        dst[out++] = (ucs2)cp;
        in += used;
    }

    dst[out] = 0;
    return dst;
}

/*
 * Convert a UCS-2 string into UTF-8.
 *
 * @Params
 *  dst : Output buffer. The pointers are not checked. The result is always
 *        NUL-terminated. At most three bytes are written per input code
 *        unit, so room for 3 * (number of code units) + 1 bytes is enough.
 *  src : Input, terminated by a zero code unit.
 *
 * Surrogate code units (0xD800 - 0xDFFF) are not characters in UCS-2 and are
 * written as UCS2_REPLACEMENT.
 *
 * @Return value:
 *  dst.
 *
 * see also utf2ucs.
 */
utf8 *ucs2utf(utf8 *dst, const ucs2 *src)
{
    size_t in;
    size_t out = 0;

    for (in = 0; src[in] != 0; in++) {
        unsigned int cp = src[in];

        if (cp >= 0xD800 && cp <= 0xDFFF) {
            cp = UCS2_REPLACEMENT;
        }

        if (cp < 0x80) {
            dst[out++] = (utf8)cp;
        } else if (cp < 0x800) {
            /* The lead byte carries the HIGH five payload bits. */
            dst[out++] = (utf8)(0xC0 | ((cp >> 6) & 0x1F));
            dst[out++] = (utf8)(0x80 | (cp & 0x3F));
        } else {
            dst[out++] = (utf8)(0xE0 | ((cp >> 12) & 0x0F));
            dst[out++] = (utf8)(0x80 | ((cp >> 6) & 0x3F));
            dst[out++] = (utf8)(0x80 | (cp & 0x3F));
        }
    }

    dst[out] = 0;
    return dst;
}

/****************************************************/
/*
 * Test driver: converts argv[1] from UTF-8 to UCS-2 and back, then prints the
 * result. Define UCS2_UTF8_NO_MAIN to compile the converters without it.
 */
#ifndef UCS2_UTF8_NO_MAIN
int main(int argc, const char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <utf-8 text>\n",
                argc > 0 ? argv[0] : "ucs2-utf8");
        return 1;
    }

    /* Worst-case sizes: one code unit per input byte, then three bytes per
     * code unit on the way back, each plus a terminator. */
    const size_t len = strlen(argv[1]);
    ucs2 *wide = malloc((len + 1) * sizeof *wide);
    utf8 *narrow = malloc(3 * len + 1);
    int rc = 1;

    if (wide != NULL && narrow != NULL) {
        utf2ucs(wide, (const utf8 *)argv[1]);
        ucs2utf(narrow, wide);
        printf("%s\n", (const char *)narrow);
        rc = 0;
    } else {
        fprintf(stderr, "out of memory\n");
    }

    free(narrow);
    free(wide);
    return rc;
}
#endif /* UCS2_UTF8_NO_MAIN */