字符编码之UCS-2与Utf-8
很多操作系统都直接支持UTF-8字符串操作,只有Windows这个异类内部使用16位宽字符的Unicode编码,也就是所谓的UCS-2(现在实际上是UTF-16)
如果写关于跨平台的代码,那么避免不了要做编码转化
这里贴一下今天写的Unicode(UCS-2)与UTF-8互相转换的代码
Ucs2BeToUcs2Le负责将大端转化为小端
Ucs2ToUtf8负责将Unicode转化为Utf-8
Utf8ToUcs2负责将Utf-8转化为Unicode
本转换函数只考虑了3个字节及以下的UTF-8编码(即基本多文种平面内的字符),需要4个字节编码的同学请自行google了啊
/*
 * Convert Unicode big endian to Unicode little endian.
 *
 * Swaps the two bytes of each 16-bit code unit in place. The swap is
 * unconditional, so the same routine also converts in the other direction.
 *
 * ucs2bige  buffer of 16-bit code units, modified in place; may be NULL
 * size      maximum number of code units to process
 *
 * Processing stops at the first zero code unit or after `size` units,
 * whichever comes first. Returns the number of code units swapped
 * (0 when ucs2bige is NULL).
 */
unsigned Ucs2BeToUcs2Le(unsigned short *ucs2bige, unsigned int size)
{
    unsigned int swapped = 0;

    if (!ucs2bige) {
        return 0;
    }

    /* Test the bound before dereferencing so that size == 0 reads nothing. */
    while (swapped < size && ucs2bige[swapped] != 0) {
        unsigned int unit = ucs2bige[swapped];

        ucs2bige[swapped] =
            (unsigned short)(((unit & 0x00FFu) << 8) | ((unit >> 8) & 0x00FFu));
        swapped++;
    }

    return swapped;
}

/*
 * Convert Ucs-2 to Utf-8.
 *
 * ucs2       input code units; may be NULL. If the first unit reads as
 *            0xFFFE (a byte-swapped byte order mark) the whole input is
 *            byte-swapped IN PLACE before conversion.
 * ucs2_size  maximum number of input code units to read
 * utf8       output buffer, or NULL to only compute the required size
 * utf8_size  capacity of utf8 in bytes (ignored when utf8 is NULL)
 *
 * Conversion stops at the first zero code unit, after ucs2_size units, or
 * when the next character would not fit completely into utf8. A character
 * is never written partially and no terminating NUL is appended.
 * A code unit equal to 0xFFFE is skipped; a byte order mark in native
 * order (0xFEFF) is converted like any other character.
 * Every code unit is encoded on its own in at most 3 bytes; surrogate
 * pairs are not combined.
 *
 * Returns the number of bytes written, or the number of bytes required
 * when utf8 is NULL.
 */
unsigned int Ucs2ToUtf8(unsigned short *ucs2, unsigned int ucs2_size,
                        unsigned char *utf8, unsigned int utf8_size)
{
    unsigned int length = 0;
    unsigned int i;

    if (!ucs2 || ucs2_size == 0) {
        return 0;
    }

    if (ucs2[0] == 0xFFFE) {
        Ucs2BeToUcs2Le(ucs2, ucs2_size);
    }

    for (i = 0; i < ucs2_size && ucs2[i] != 0; i++) {
        unsigned int unit = ucs2[i];
        unsigned int need;

        if (unit == 0xFFFE) {
            /* Skipped in both modes so the size query matches the output. */
            continue;
        }

        if (unit < 0x0080) {
            need = 1;
        } else if (unit < 0x0800) {
            need = 2;
        } else {
            need = 3;
        }

        if (utf8) {
            unsigned char *out;

            /* length never exceeds utf8_size, so this cannot wrap. */
            if (need > utf8_size - length) {
                break;
            }

            out = utf8 + length;
            if (need == 1) {
                /* 1 byte UTF-8 character. */
                out[0] = (unsigned char)unit;
            } else if (need == 2) {
                /* 2 bytes UTF-8 character. */
                out[0] = (unsigned char)(0xC0 | (unit >> 6));
                out[1] = (unsigned char)(0x80 | (unit & 0x3F));
            } else {
                /* 3 bytes UTF-8 character. */
                out[0] = (unsigned char)(0xE0 | (unit >> 12));
                out[1] = (unsigned char)(0x80 | ((unit >> 6) & 0x3F));
                out[2] = (unsigned char)(0x80 | (unit & 0x3F));
            }
        }

        length += need;
    }

    return length;
}

/*
 * Convert Utf-8 to Ucs-2.
 *
 * utf8       input bytes; may be NULL
 * utf8_size  maximum number of input bytes to read
 * ucs2       output buffer, or NULL to only count the resulting code units
 * ucs2_size  capacity of ucs2 in code units (ignored when ucs2 is NULL)
 *
 * Only 1-, 2- and 3-byte sequences are decoded. Conversion stops at the
 * first zero byte, after utf8_size bytes, when the output buffer is full,
 * or when a multi-byte sequence is cut off by utf8_size. A byte that cannot
 * start a supported sequence (stray continuation byte, 4-byte lead), or a
 * lead byte whose continuation bytes are malformed, is skipped without
 * producing output. No terminating zero is appended.
 *
 * Returns the number of code units written, or the number that would be
 * produced when ucs2 is NULL.
 */
unsigned int Utf8ToUcs2(unsigned char *utf8, unsigned int utf8_size,
                        unsigned short *ucs2, unsigned int ucs2_size)
{
    unsigned int length = 0;
    unsigned int pos = 0;

    if (!utf8) {
        return 0;
    }

    while (pos < utf8_size && utf8[pos] != 0) {
        unsigned char lead = utf8[pos];
        unsigned int seq_len;
        unsigned int value;
        unsigned int k;
        int well_formed = 1;

        if ((lead & 0x80) == 0) {
            seq_len = 1;
            value = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            seq_len = 2;
            value = lead & 0x1Fu;
        } else if ((lead & 0xF0) == 0xE0) {
            seq_len = 3;
            value = lead & 0x0Fu;
        } else {
            /* Not a supported lead byte: step over it so the loop advances. */
            pos++;
            continue;
        }

        /* pos < utf8_size here, so the subtraction cannot wrap. */
        if (seq_len > utf8_size - pos) {
            break;
        }

        for (k = 1; k < seq_len; k++) {
            unsigned char cont = utf8[pos + k];

            /* This also rejects the NUL terminator, so it is never skipped. */
            if ((cont & 0xC0) != 0x80) {
                well_formed = 0;
                break;
            }
            value = (value << 6) | (cont & 0x3Fu);
        }

        if (!well_formed) {
            pos++;
            continue;
        }

        if (ucs2) {
            if (length >= ucs2_size) {
                break;
            }
            ucs2[length] = (unsigned short)value;
        }

        length++;
        pos += seq_len;
    }

    return length;
}