黃偉榮的學習筆記

軟體的世界變化萬千,小小的我只能在這洪流奮發向上以求立足。
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

[個人作品]CharSetConvert 字元集轉換

Posted on 2007-08-30 01:49  黃偉榮  阅读(562)  评论(0)  编辑  收藏  举报
啟源
    有一次得知新倉頡也能輸入特殊字元,只是要用內碼方式輸入(按 ` ),可我哪知道內碼是多少,就想說寫個工具來用用。

開發環境:Windows Server 2003
開發工具:Visual Studio 2005
開發語言:C++/CLI
開發時間:95年11月(96年8月有作小修改)

Source Download



說明
主要就三種字元集,三種輸出方法的轉換

重要代碼
Convert.h

轉成內碼
static System::String^ ToCode(Encoding^ encode,int toBase,System::String^  text){

            StringBuilder 
^ sB=gcnew StringBuilder();

            Int32 pad;
            
switch (toBase)
            
{
            
case 2:
                pad
=8;
                
break;
            
case 10:
                pad
=6;
                
break;
            
case 16:
                pad
=2;
                
break;                
            }


            
for(int i=0;i<text->Length;i++)
            
{
                array
<Byte >^ b;

                sB
->AppendFormat("{0} : ", text[i]);

                b
=encode->GetBytes(Convert::ToString(text[i]));

                
// 乘256的原因是值是陣列,要轉成數字
                if(encode->CodePage==950){
                    
//Big5
                    if(b->Length==1)
                    
{                        
                        sB
->AppendLine(Convert::ToString(b[0],toBase)->PadLeft(pad,'0'));
                    }

                    
else
                    
{                        
                        
//Big5 陣列大到小
                        sB->AppendLine(Convert::ToString(b[0]*256+b[1],toBase)->PadLeft(pad*2,'0')); 
                    }

                }

                
else if(encode->CodePage==1200//Unicode
                {    
                    
//陣列小到大
                    sB->AppendLine(Convert::ToString(b[0]+b[1]*256,toBase)->PadLeft(pad,'0')); 
                }

                
else if(encode->CodePage==65001)
                
{
                    
//UTF8
                    for ( int j = 0; j < b->Length; j++ ) //UTF8 陣列 以直接輸出方式
                    {                        
                        sB
->Append(Convert::ToString(b[j],toBase)->PadLeft(pad,'0'));
                    }

                    sB
->AppendLine();
                }
else
                
{
                    sB
->AppendLine("不支援此字元集");
                }

            }

            
return sB->ToString();
        }
Convert.h

使用Encoding的GetBytes取得的Bytes

Big5中ASCII 為一個 Byte,中文字為二個Byte,第一個Byte*256 + 第二個Byte 才會等於內碼格式
Unicode都是二個Byte,第一個Byte + 第二個Byte*256  才會等於內碼格式
UTF8不清楚內碼直接以格式輸出       

轉成文字
static System::String^ ToWord(Encoding^ encode,int fromBase,System::String^  text)
        
{
            text 
= text->Replace(" ",""); //去空白

            StringBuilder 
^ sB=gcnew StringBuilder();

            Int32 step,end;

            
switch (fromBase)
            
{
            
case 2:
                step
=8;
                
break;
            
case 10:
                step
=6;
                
break;
            
case 16:
                step
= encode->CodePage==65001 ? 2 : 4;
                
break;                
            }


            end 
= text->Length / step;

            
if (encode->CodePage==950 || encode->CodePage==1200//Unicode & Big5
            {
                
for (Int32 i = 0; i < end ; i++)
                
{
                    array
<Byte>^ buffer = gcnew array<Byte>(2); 

                    String
^ ori = text->Substring(i*step,step);

                    Int32 by 
= Convert::ToInt32(text->Substring(i*step,step),fromBase);

                    
if (encode->CodePage==950)
                    
{
                        
if (by <= 256)
                        
{
                            buffer 
= gcnew array<Byte>(1);
                            buffer[
0= by;
                        }

                        
else
                        
{
                            buffer 
= gcnew array<Byte>(2);
                            buffer[
0= (Byte)(by / 256);
                            buffer[
1= (Byte)(by % 256);
                        }
                        
                    }

                    
else
                    
{
                        buffer 
= gcnew array<Byte>(2);
                        buffer[
0= (Byte)(by % 256);
                        buffer[
1= (Byte)(by / 256);
                    }


                    String
^ result = encode->GetString(buffer);
                    sB
->AppendLine(ori + " : " + result);
                }

            }

            
else if (encode->CodePage==65001//UTF-8 是動態的字元集 1-4 Byte 比較麻煩
            {
                array
<Byte>^ buffer = gcnew array<Byte>(end); 
                
for (Int32 i = 0; i < end ; i++)
                
{
                    buffer[i] 
= Convert::ToByte(text->Substring(i*step,step),fromBase);
                }
    

                String
^ result = encode->GetString(buffer);                
                sB
->AppendLine(result);
            }

            
else
            
{
                sB
->AppendLine("不支援此字元集");
            }



            
return sB->ToString();
        }

Convert.h

使用Encoding的GetString取得文字

Big5中ASCII 為一個 Byte,中文字為二個Byte:第一個Byte為 Value / 256,第二個Byte為 Value % 256
Unicode都是二個Byte:第一個Byte為 Value % 256,第二個Byte為 Value / 256
UTF8 Value轉成Bytes

參考資料
Wiki UTF-8   http://zh.wikipedia.org/wiki/UTF-8
Binary, Hexadecimal, UTF-8 Converter http://www.stanford.edu/~hc10/misc/binhexuni.html