字符编码之UCS-2与Utf-8

很多操作系统都直接支持 UTF-8 字符串操作,只有 MS 这个异类用的是它所谓的 "Unicode",也就是 UCS-2(每个字符固定占两个字节)。

如果写关于跨平台的代码,那么避免不了要做编码转化

这里贴一下今天写的把Unicode转化为Utf-8的代码

Ucs2BeToUcs2Le负责将大端转化为小端
Ucs2ToUtf8负责将Unicode转化为Utf-8
Utf8ToUcs2负责将Utf-8转化为Unicode

本转化函数只考虑了3个字节及以下的 UTF-8 编码(即 UCS-2 能表示的基本多文种平面字符),需要处理4字节编码的同学请自行 google 了啊
  /*
   * Swap the two bytes of every 16-bit code unit in place, turning UCS-2 text
   * stored in the opposite byte order into the host's byte order.
   *
   * ucs2bige  buffer to convert; modified in place. May be NULL.
   * size      maximum number of code units to process.
   *
   * Processing stops after `size` units or at the first zero unit, whichever
   * comes first (a zero unit is the same in both byte orders, so it is treated
   * as the terminator and left untouched).
   *
   * Returns the number of code units that were swapped; 0 if ucs2bige is NULL.
   */
  unsigned Ucs2BeToUcs2Le(unsigned short *ucs2bige, unsigned int size)
  {
      unsigned int done = 0;

      if (!ucs2bige) {
          return 0;
      }

      /* Check the bound before dereferencing so that a buffer holding exactly
       * `size` non-zero units is never read one element past its end. */
      while (done < size && ucs2bige[done] != 0) {
          unsigned short unit = ucs2bige[done];

          ucs2bige[done] = (unsigned short)(((unit & 0xFF) << 8) | ((unit >> 8) & 0xFF));
          done++;
      }

      return done;
  }
 26 
 /*
  * Convert UCS-2 code units to UTF-8.
  *
  * ucs2       input code units; may be NULL (returns 0). If the first unit is
  *            0xFFFE (a byte-swapped byte-order mark), the whole input buffer
  *            is byte-swapped IN PLACE via Ucs2BeToUcs2Le before converting.
  * ucs2_size  maximum number of input code units to read; reading also stops
  *            at the first zero unit.
  * utf8       output buffer, or NULL to only compute the required size.
  * utf8_size  capacity of utf8 in bytes (ignored when utf8 is NULL).
  *
  * Each unit is encoded on its own as 1, 2 or 3 bytes; surrogate pairs are not
  * combined. Units equal to 0xFFFE are dropped in both modes, so the size
  * reported with utf8 == NULL matches what a later write pass produces. A
  * 0xFEFF byte-order mark is encoded like any other unit. No terminating NUL
  * is written.
  *
  * When utf8 is non-NULL, conversion stops before the first character whose
  * encoding does not fit entirely in the remaining space, so the buffer is
  * never overrun and never ends in a partial sequence.
  *
  * Returns the number of UTF-8 bytes required (utf8 == NULL) or written.
  */
 unsigned int Ucs2ToUtf8(unsigned short *ucs2, unsigned int ucs2_size,
         unsigned char *utf8, unsigned int utf8_size)
 {
     const unsigned short *in = ucs2;
     unsigned int remaining = ucs2_size;
     unsigned int length = 0;

     if (!ucs2) {
         return 0;
     }

     /* Only look at the first unit when the caller says there is one. */
     if (ucs2_size && *ucs2 == 0xFFFE) {
         Ucs2BeToUcs2Le(ucs2, ucs2_size);
     }

     /* Test the remaining count before dereferencing to stay inside the input. */
     while (remaining && *in) {
         unsigned short unit = *in++;
         unsigned int need;

         remaining--;

         if (unit == 0xFFFE) {
             continue;
         }

         need = (unit < 0x0080) ? 1 : (unit < 0x0800) ? 2 : 3;

         if (utf8) {
             /* length never exceeds utf8_size here, so this cannot wrap. */
             if (need > utf8_size - length) {
                 break;
             }

             if (need == 1) {
                 utf8[length] = (unsigned char)unit;
             } else if (need == 2) {
                 utf8[length]     = (unsigned char)(0xC0 | (unit >> 6));
                 utf8[length + 1] = (unsigned char)(0x80 | (unit & 0x3F));
             } else {
                 utf8[length]     = (unsigned char)(0xE0 | (unit >> 12));
                 utf8[length + 1] = (unsigned char)(0x80 | ((unit >> 6) & 0x3F));
                 utf8[length + 2] = (unsigned char)(0x80 | (unit & 0x3F));
             }
         }

         length += need;
     }

     return length;
 }
102 
/*
 * Decode a single UTF-8 sequence of 1 to 3 bytes starting at in[0].
 *
 * `avail` is the number of readable bytes at `in` and must be at least 1.
 * On success the decoded value is stored in *unit and the number of bytes
 * consumed (1, 2 or 3) is returned. Returns 0 when the lead byte is a stray
 * continuation byte or starts a sequence longer than 3 bytes, when fewer than
 * the required bytes are available, or when a trailing byte is not of the
 * form 10xxxxxx.
 */
static unsigned int utf8_sequence_to_ucs2(const unsigned char *in,
        unsigned int avail, unsigned short *unit)
{
    unsigned char lead = in[0];

    if ((lead & 0x80) == 0) {
        *unit = lead;
        return 1;
    }

    if ((lead & 0xE0) == 0xC0) {
        if (avail < 2 || (in[1] & 0xC0) != 0x80) {
            return 0;
        }
        *unit = (unsigned short)(((lead & 0x1F) << 6) | (in[1] & 0x3F));
        return 2;
    }

    if ((lead & 0xF0) == 0xE0) {
        /* Short-circuit order matters: in[2] is only read once in[1] is known
         * to be a continuation byte (and therefore not the terminator). */
        if (avail < 3 || (in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80) {
            return 0;
        }
        *unit = (unsigned short)(((lead & 0x0F) << 12)
                | ((in[1] & 0x3F) << 6)
                | (in[2] & 0x3F));
        return 3;
    }

    return 0;
}

/*
 * Convert UTF-8 to UCS-2 code units in host byte order.
 *
 * utf8       input bytes; may be NULL (returns 0).
 * utf8_size  maximum number of input bytes to read; reading also stops at the
 *            first NUL byte.
 * ucs2       output buffer, or NULL to only count the code units required.
 * ucs2_size  capacity of ucs2 in code units (ignored when ucs2 is NULL).
 *
 * Only sequences of up to 3 bytes are supported. Conversion stops at the first
 * sequence that is malformed, cut short by utf8_size or the terminator, or
 * longer than 3 bytes. Overlong forms and encoded surrogates are not rejected.
 * No terminating zero unit is written.
 *
 * Returns the number of code units required (ucs2 == NULL) or written.
 */
unsigned int Utf8ToUcs2(unsigned char *utf8, unsigned int utf8_size,
        unsigned short *ucs2, unsigned int ucs2_size)
{
    const unsigned char *in = utf8;
    unsigned int remaining = utf8_size;
    unsigned int length = 0;

    if (!utf8) {
        return 0;
    }

    /* Test the remaining count before dereferencing to stay inside the input. */
    while (remaining && *in) {
        unsigned short unit;
        unsigned int used;

        if (ucs2 && length >= ucs2_size) {
            break;
        }

        used = utf8_sequence_to_ucs2(in, remaining, &unit);
        if (used == 0) {
            /* Nothing can be consumed safely; stopping here avoids both an
             * endless loop and reading past the end of the input. */
            break;
        }

        if (ucs2) {
            ucs2[length] = unit;
        }
        length++;

        in += used;
        remaining -= used;
    }

    return length;
}
posted @ 2012-07-03 13:37  Jojodru  阅读(14093)  评论(0编辑  收藏  举报