字符编码

处理文本时经常会遇到汉字编码问题。常见编码中一个汉字所占的字节数如下:

  • UTF-8:   变长编码,常用汉字占3字节(ASCII字符占1字节)
  • UNICODE: 此处指UTF-16,常用汉字占2字节
  • GB2312:  汉字占2字节(ASCII字符占1字节)

例子:
“你”字的UTF-8编码: E4 BD A0        11100100 10111101 10100000
“你”的Unicode编码: 4F 60            01001111 01100000

按照UTF-8的编码规则,分解如下:xxxx0100 xx111101 xx100000,把除了x之外的数字拼接在一起,就变成“你”的Unicode编码了。
注意UTF-8的最前面3个1,表示整个UTF-8串是由3个字节构成的。
汉字等多字节字符经过UTF-8编码之后,每个字节的最高位始终为1,因此不会与ASCII范围内的敏感字符(例如 '\0'、'/'、'\\')相混淆。

 

 1 class CChineseEncode{
 2 public:
 3     static void UTF8_To_Unicode(wchar_t *pOut, char *pText);
 4     static void Unicode_To_UTF8(char *pOut, wchar_t *pText);
 5     static void Unicode_To_GB2312(char *pOut, wchar_t uData);
 6     static void GB2312_To_Unicode(wchar_t *pOut, char *gbBuffer);
 7     static void GB2312_To_UTF8(std::string& pOut, char *pText, int pLen);
 8 };
 9 
10 void CChineseEncode::Unicode_To_UTF8(char *pOut, wchar_t *pText)
11 {
12     char *pchar = (char *)pText;
13     pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
14     pOut[1] = (0xE0 | ((pchar[1] & 0xF0) << 2)) + ((pchar[0] & 0xc0) >>6);
15     pOut[1] = (0xE0 | (pchar[0] & 0x3F));
16     return ;
17 }
18 
19 void CChineseEncode::Unicode_To_GB2312(char *pOut, wchar_t uData)
20 {
21     WideCharToMultiByte(CP_ACP, NULL, &uData, 1, pOut, sizeof(wchar_t), NULL, NULL);
22     return ;
23 }
24 
25 void CChineseEncode::GB2312_To_Unicode(wchar_t *pOut, char *gbBuffer)
26 {
27     MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, 2, pOut, 1);
28     return ;
29 }
30 
31 void CChineseEncode::GB2312_To_UTF8(string &pOut, char *pText, int pLen)
32 {
33     char buf[4];
34     int nLength = pLen * 3;
35     char *rst = new char[nLength];
36     
37     memset(buf, 0, 4);
38     memset(rst, 0, nLength);
39     
40     int i = 0, j = 0;
41     while(i < pLen)
42     {
43         if( *(pText + i) >= 0)
44             rst[j++] = pText[i++];
45         else
46         {
47             wchar_t pbuffer;
48             GB2312_To_Unicode(&pbuffer, pText + i);
49             Unicode short int tmp = 0;
50             tmp = rst[j] = buf[0];
51             tmp = rst[j+1] = buf[1];
52             tmp = rst[j+2] = buf[2];
53             j += 3;
54             i += 2;
55         }
56     }
57     rst[j] = '';
58     
59     pOut = rst;
60     delete[] rst;
61     return ;
62 }
63 
64 void CChineseEncode::UTF8_To_GB2312(string &pOut, char *pText, int pLen)
65 {
66     char *newbuf = new char[pLen];
67     char Ctemp[4];
68     memset(Ctemp, 0, 4);
69     int i = 0, j = 0;
70     
71     while(i < pLen)
72     {
73         if(pText > 0)
74             newBuf[j++] = pText[i++];
75         else
76         {
77             WCHAR Wtemp;
78             UTF8_To_Unicode(&Wtemp, pText + i);
79             Unicode_To_GB2312(Ctemp, Wtemp);
80             newBuf[j] = Ctemp[0];
81             newBuf[j+1] = Ctemp[1];
82             
83             i+=3;
84             j+=2;
85         }
86     }
87     
88     newBuf[j] = '';
89     pOut = newBuf;
90     delete[] newBuf;
91     return ;
92 }

 

posted @ 2013-09-10 23:28  foundwant  阅读(301)  评论(0)  编辑  收藏  举报