[個人作品]CharSetConvert 字元集轉換

Posted on 2007-08-30 01:49 黃偉榮阅读(563) 评论(0) 编辑收藏举报

啟源
有一次知道新倉頡也能輸入特殊字元，只是要用內碼方法輸入(按 ` )，可我哪知道內碼是多少，就想說來寫個工具來用用。

開發環境:Windows 2003
開發工具:Visual Studio 2005
開發語言:C++/CLI
開發時間:95年11月(96年8月有作小修改)

Source Download

說明
主要就三種字元集，三種輸出方法的轉換

重要代碼
Convert.h

轉成內碼

// Converts every character of `text` into its encoded value under `encode`,
// written in base `toBase`, and returns one "<char> : <code>" line per
// character.
//
// encode : target character set. Code pages 950 (Big5), 1200 (Unicode) and
//          65001 (UTF-8) are handled; for any other code page each line
//          carries the "unsupported character set" message instead of a code.
// toBase : radix handed to Convert::ToString. 2, 10 and 16 select a
//          zero-padding width of 8, 6 and 2 digits respectively; any other
//          value gets no padding (previously `pad` was read uninitialized),
//          and Convert::ToString decides whether that radix is accepted.
// text   : characters to convert; each UTF-16 code unit is converted on its
//          own.
static System::String^ ToCode(Encoding^ encode,int toBase,System::String^ text){
    StringBuilder^ sB = gcnew StringBuilder();

    // Zero-padding width for one printed unit in the requested base.
    Int32 pad = 0;
    switch (toBase)
    {
    case 2:
        pad = 8;
        break;
    case 10:
        pad = 6;
        break;
    case 16:
        pad = 2;
        break;
    }

    // The code page does not change while iterating, so read it once.
    int codePage = encode->CodePage;

    for (int i = 0; i < text->Length; i++)
    {
        sB->AppendFormat("{0} : ", text[i]);

        array<Byte>^ b = encode->GetBytes(Convert::ToString(text[i]));

        // The bytes come back as an array; multiplying one of them by 256
        // folds a byte pair into a single number.
        if (codePage == 950)
        {
            // Big5
            if (b->Length == 1)
            {
                sB->AppendLine(Convert::ToString(b[0], toBase)->PadLeft(pad, '0'));
            }
            else
            {
                // Big5 stores the high byte first.
                sB->AppendLine(Convert::ToString(b[0] * 256 + b[1], toBase)->PadLeft(pad * 2, '0'));
            }
        }
        else if (codePage == 1200)
        {
            // Unicode stores the low byte first.
            // NOTE(review): this pads to `pad`, not `pad * 2` like the
            // double-byte Big5 branch, so small values print narrower than a
            // full 16-bit code (e.g. two hex digits) -- confirm this is intended.
            sB->AppendLine(Convert::ToString(b[0] + b[1] * 256, toBase)->PadLeft(pad, '0'));
        }
        else if (codePage == 65001)
        {
            // UTF-8: the bytes are written out one after another, unmerged.
            for (int j = 0; j < b->Length; j++)
            {
                sB->Append(Convert::ToString(b[j], toBase)->PadLeft(pad, '0'));
            }
            sB->AppendLine();
        }
        else
        {
            sB->AppendLine("不支援此字元集");
        }
    }

    return sB->ToString();
}

Convert.h

使用Encoding的GetBytes取得的Bytes

Big5中ASCII 為一個 Byte，中文字為二個Byte，第一個Byte*256 + 第二個Byte 才會等於內碼格式
Unicode都是二個Byte，第一個Byte + 第二個Byte*256 才會等於內碼格式
UTF8不清楚內碼直接以格式輸出

轉成文字

// Converts a string of numeric codes written in base `fromBase` back into
// text using the character set `encode`.
//
// encode   : source character set. Code pages 950 (Big5) and 1200 (Unicode)
//            yield one "<digits> : <char>" line per code; 65001 (UTF-8)
//            decodes all bytes together into a single line; any other code
//            page yields the "unsupported character set" message.
// fromBase : radix of the digits; must be 2, 10 or 16. It selects how many
//            digits make up one code: 8 for base 2, 6 for base 10, and for
//            base 16 either 2 (UTF-8) or 4 (otherwise).
// text     : the digits. Spaces are removed first; trailing digits that do
//            not fill a whole group are ignored.
//
// Throws ArgumentException when fromBase is not 2, 10 or 16 (previously the
// group width was left uninitialized and then used as a divisor).
static System::String^ ToWord(Encoding^ encode,int fromBase,System::String^ text)
{
    text = text->Replace(" ",""); // strip spaces

    StringBuilder^ sB = gcnew StringBuilder();
    int codePage = encode->CodePage;

    // Number of digits that make up one code in the given base.
    Int32 step;
    switch (fromBase)
    {
    case 2:
        step = 8;
        break;
    case 10:
        step = 6;
        break;
    case 16:
        step = (codePage == 65001) ? 2 : 4;
        break;
    default:
        throw gcnew ArgumentException("fromBase must be 2, 10 or 16", "fromBase");
    }

    // Count of complete digit groups; integer division drops a partial tail.
    Int32 end = text->Length / step;

    if (codePage == 950 || codePage == 1200) // Big5 & Unicode
    {
        for (Int32 i = 0; i < end; i++)
        {
            String^ ori = text->Substring(i * step, step);
            Int32 by = Convert::ToInt32(ori, fromBase);

            array<Byte>^ buffer;
            if (codePage == 950)
            {
                // Values that fit in one byte are emitted as a single byte.
                // The bound is strict: 256 does not fit in one byte and would
                // wrap to 0.
                if (by < 256)
                {
                    buffer = gcnew array<Byte>(1);
                    buffer[0] = static_cast<Byte>(by);
                }
                else
                {
                    // Big5: high byte first.
                    buffer = gcnew array<Byte>(2);
                    buffer[0] = static_cast<Byte>(by / 256);
                    buffer[1] = static_cast<Byte>(by % 256);
                }
            }
            else
            {
                // Unicode: low byte first.
                buffer = gcnew array<Byte>(2);
                buffer[0] = static_cast<Byte>(by % 256);
                buffer[1] = static_cast<Byte>(by / 256);
            }

            String^ result = encode->GetString(buffer);
            sB->AppendLine(ori + " : " + result);
        }
    }
    else if (codePage == 65001)
    {
        // UTF-8 characters have a variable byte length, so every group is
        // parsed as one byte and the whole buffer is decoded in one go.
        array<Byte>^ buffer = gcnew array<Byte>(end);
        for (Int32 i = 0; i < end; i++)
        {
            buffer[i] = Convert::ToByte(text->Substring(i * step, step), fromBase);
        }

        String^ result = encode->GetString(buffer);
        sB->AppendLine(result);
    }
    else
    {
        sB->AppendLine("不支援此字元集");
    }

    return sB->ToString();
}

Convert.h

使用Encoding的GetString將Bytes轉回文字

Big5中ASCII 為一個 Byte，中文字為二個Byte：第一個Byte為 Value / 256，第二個Byte為 Value % 256
Unicode都是二個Byte：第一個Byte為 Value % 256，第二個Byte為 Value / 256
UTF8 Value轉成Bytes

參考資料
Wiki UTF-8 http://zh.wikipedia.org/wiki/UTF-8
Binary, Hexadecimal, UTF-8 Converter http://www.stanford.edu/~hc10/misc/binhexuni.html

刷新页面返回顶部

黃偉榮的學習筆記

[個人作品]CharSetConvert 字元集轉換