前言
我们知道,.NET Framework 在内部将文本(string)存储为 Unicode UTF-16。在 .NET Framework Base Class Library 中,System.Text.Encoding 类及其派生类提供了对字符编码的支持。Encoding 类的静态 GetEncodings 方法返回包含所有编码的数组。
源程序代码
让我们写个 C# 程序来查看一下 BCL 所支持的所有字符编码吧。下面就是 EncodingTester.cs:
using System;
using System.Data;
using System.Text;

namespace Skyiv.Tester
{
    /// <summary>
    /// Console program that collects one table row for every encoding returned by
    /// <see cref="Encoding.GetEncodings"/> and hands the result to HtmlMaker, which
    /// is asked to save it as "EncodingTester.html".
    /// </summary>
    sealed class EncodingTester
    {
        /// <summary>
        /// Entry point. Gathers the environment summary and the encoding table,
        /// then saves the report. Any exception is written to the console
        /// instead of being propagated.
        /// </summary>
        static void Main()
        {
            try
            {
                var info = Utilities.GetEnvironmentInfo();
                // Take the count from the table that is actually rendered, so the
                // summary line always agrees with the rows below it and
                // Encoding.GetEncodings() is enumerated only once.
                var table = GetEncodingInfo();
                info.Add(Tuple.Create("字符编码数量", table.Rows.Count.ToString()));
                var html = new HtmlMaker("字符编码信息", info, table.DefaultView);
                html.TitleVisible = false;
                html.Save("EncodingTester.html");
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }

        /// <summary>
        /// Builds a table with exactly one row per entry of
        /// <see cref="Encoding.GetEncodings"/>.
        /// </summary>
        /// <returns>
        /// A table with the columns CodePage, WebName, EncodingName, Class and Memo.
        /// </returns>
        static DataTable GetEncodingInfo()
        {
            var table = new DataTable();
            table.Columns.Add("CodePage", typeof(int));
            table.Columns.Add("WebName", typeof(string));
            table.Columns.Add("EncodingName", typeof(string));
            table.Columns.Add("Class", typeof(string));
            table.Columns.Add("Memo", typeof(string));
            foreach (var info in Encoding.GetEncodings())
            {
                var dr = table.NewRow();
                var encoding = info.GetEncoding();
                dr[0] = encoding.CodePage;
                dr[1] = encoding.WebName;
                dr[2] = encoding.EncodingName;
                dr[3] = GetTypeName(encoding);
                dr[4] = GetMemo(encoding);
                table.Rows.Add(dr);
            }
            return table;
        }

        /// <summary>
        /// Summarizes the boolean traits of an encoding as space-terminated tags,
        /// followed by its preamble bytes (if any) as "BOM:xx-yy".
        /// </summary>
        /// <param name="encoding">The encoding to describe.</param>
        /// <returns>The concatenated tags; empty when no trait applies.</returns>
        static string GetMemo(Encoding encoding)
        {
            var memo = new StringBuilder();
            if (!encoding.IsReadOnly) memo.Append("Writeable ");
            if (encoding.IsSingleByte) memo.Append("1-Byte ");
            if (encoding.IsBrowserDisplay) memo.Append("BrDisp ");
            if (encoding.IsBrowserSave) memo.Append("BrSave ");
            if (encoding.IsMailNewsDisplay) memo.Append("MNDisp ");
            if (encoding.IsMailNewsSave) memo.Append("MNSave ");
            if (encoding.IsAlwaysNormalized()) memo.Append("Norm ");
            var bs = encoding.GetPreamble();
            if (bs.Length != 0) memo.Append("BOM:").Append(BitConverter.ToString(bs));
            return memo.ToString();
        }

        /// <summary>
        /// Returns a compact display name for the runtime type of an encoding:
        /// a leading "System.Text." is shortened to ":", and the name is wrapped
        /// in parentheses when <see cref="Type.IsNotPublic"/> is true.
        /// </summary>
        /// <param name="encoding">The encoding whose implementing type is named.</param>
        /// <returns>The abbreviated type name.</returns>
        static string GetTypeName(Encoding encoding)
        {
            var type = encoding.GetType();
            var name = type.ToString();
            var prefix = "System.Text.";
            // Type names are identifiers, not linguistic text: compare ordinally so
            // the match is culture-independent and consistent with the ordinal
            // Substring(prefix.Length) that follows.
            if (name.StartsWith(prefix, StringComparison.Ordinal)) name = ":" + name.Substring(prefix.Length);
            if (type.IsNotPublic) name = "(" + name + ")";
            return name;
        }
    }
}
编译响应文件如下所示:
-r:System.Data.dll -r:System.Drawing.dll EncodingTester.cs ExtensionMethods.cs HtmlMaker.cs HtmlTable.cs RuntimeFramework.cs Utilities.cs
在 Windows 操作系统中编译和运行
在 Windows Vista 操作系统的 .NET Framework 4 环境中编译和运行:
E:\CS\EncodingTester> csc @build.rsp Microsoft(R) Visual C# 2010 编译器 4.0.30319.1 版 版权所有(C) Microsoft Corporation。保留所有权利。 E:\CS\EncodingTester> EncodingTester E:\CS\EncodingTester>
这个程序的运行结果是产生一个 EncodingTester.html 文件,其内容如下所示:
操作系统 | Microsoft Windows NT 6.0.6002 Service Pack 2 |
公共语言运行库 | 4.0.30319.1 [Net 4.0.30319.1] |
默认字符编码 | 简体中文(GB2312) [936:gb2312] |
字节顺序 | Little-Endian |
字符编码数量 | 140 |
CodePage | WebName | EncodingName | Class | Memo |
---|---|---|---|---|
37 | IBM037 | IBM EBCDIC (美国-加拿大) | (:SBCSCodePageEncoding) | 1-Byte Norm |
437 | IBM437 | OEM 美国 | (:SBCSCodePageEncoding) | 1-Byte Norm |
500 | IBM500 | IBM EBCDIC (国际) | (:SBCSCodePageEncoding) | 1-Byte Norm |
708 | ASMO-708 | 阿拉伯字符(ASMO-708) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave |
720 | DOS-720 | 阿拉伯字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave Norm |
737 | ibm737 | 希腊字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
775 | ibm775 | 波罗的海字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
850 | ibm850 | 西欧字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
852 | ibm852 | 中欧字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave Norm |
855 | IBM855 | OEM 西里尔语 | (:SBCSCodePageEncoding) | 1-Byte Norm |
857 | ibm857 | 土耳其字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte |
858 | IBM00858 | OEM 多语言拉丁语 I | (:SBCSCodePageEncoding) | 1-Byte Norm |
860 | IBM860 | 葡萄牙语(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
861 | ibm861 | 冰岛语(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
862 | DOS-862 | 希伯来字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave Norm |
863 | IBM863 | 加拿大法语(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
864 | IBM864 | 阿拉伯字符(864) | (:SBCSCodePageEncoding) | 1-Byte |
865 | IBM865 | 北欧字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
866 | cp866 | 西里尔字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave Norm |
869 | ibm869 | 现代希腊字符(DOS) | (:SBCSCodePageEncoding) | 1-Byte Norm |
870 | IBM870 | IBM EBCDIC (多语言拉丁语 2) | (:SBCSCodePageEncoding) | 1-Byte Norm |
874 | windows-874 | 泰语(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
875 | cp875 | IBM EBCDIC (现代希腊语) | (:SBCSCodePageEncoding) | 1-Byte |
932 | shift_jis | 日语(Shift-JIS) | (:DBCSCodePageEncoding) | BrDisp BrSave MNDisp MNSave |
936 | gb2312 | 简体中文(GB2312) | (:DBCSCodePageEncoding) | BrDisp BrSave MNDisp MNSave |
949 | ks_c_5601-1987 | 朝鲜语 | (:DBCSCodePageEncoding) | BrDisp BrSave MNDisp MNSave |
950 | big5 | 繁体中文(Big5) | (:DBCSCodePageEncoding) | BrDisp BrSave MNDisp MNSave |
1026 | IBM1026 | IBM EBCDIC (土耳其拉丁语 5) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1047 | IBM01047 | IBM 拉丁语 1 | (:SBCSCodePageEncoding) | 1-Byte Norm |
1140 | IBM01140 | IBM EBCDIC (美国-加拿大-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1141 | IBM01141 | IBM EBCDIC (德国-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1142 | IBM01142 | IBM EBCDIC (丹麦-挪威-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1143 | IBM01143 | IBM EBCDIC (芬兰-瑞典-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1144 | IBM01144 | IBM EBCDIC (意大利-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1145 | IBM01145 | IBM EBCDIC (西班牙-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1146 | IBM01146 | IBM EBCDIC (英国-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1147 | IBM01147 | IBM EBCDIC (法国-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1148 | IBM01148 | IBM EBCDIC (国际-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1149 | IBM01149 | IBM EBCDIC (冰岛语-欧洲) | (:SBCSCodePageEncoding) | 1-Byte Norm |
1200 | utf-16 | Unicode | :UnicodeEncoding | BrSave BOM:FF-FE |
1201 | utf-16BE | Unicode (Big-Endian) | :UnicodeEncoding | BOM:FE-FF |
1250 | windows-1250 | 中欧字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1251 | windows-1251 | 西里尔字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1252 | Windows-1252 | 西欧字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1253 | windows-1253 | 希腊字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
1254 | windows-1254 | 土耳其字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1255 | windows-1255 | 希伯来字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
1256 | windows-1256 | 阿拉伯字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1257 | windows-1257 | 波罗的海字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
1258 | windows-1258 | 越南字符(Windows) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
1361 | Johab | 朝鲜语(Johab) | (:DBCSCodePageEncoding) | |
10000 | macintosh | 西欧字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10001 | x-mac-japanese | 日语(Mac) | (:DBCSCodePageEncoding) | |
10002 | x-mac-chinesetrad | 繁体中文(Mac) | (:DBCSCodePageEncoding) | |
10003 | x-mac-korean | 朝鲜语(Mac) | (:DBCSCodePageEncoding) | |
10004 | x-mac-arabic | 阿拉伯字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10005 | x-mac-hebrew | 希伯来字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10006 | x-mac-greek | 希腊字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10007 | x-mac-cyrillic | 西里尔字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte Norm |
10008 | x-mac-chinesesimp | 简体中文(Mac) | (:DBCSCodePageEncoding) | |
10010 | x-mac-romanian | 罗马尼亚语(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10017 | x-mac-ukrainian | 乌克兰语(Mac) | (:SBCSCodePageEncoding) | 1-Byte Norm |
10021 | x-mac-thai | 泰语(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10029 | x-mac-ce | 中欧字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte Norm |
10079 | x-mac-icelandic | 冰岛语(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10081 | x-mac-turkish | 土耳其字符(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
10082 | x-mac-croatian | 克罗地亚语(Mac) | (:SBCSCodePageEncoding) | 1-Byte |
12000 | utf-32 | Unicode (UTF-32) | :UTF32Encoding | BOM:FF-FE-00-00 |
12001 | utf-32BE | Unicode (UTF-32 Big-Endian) | :UTF32Encoding | BOM:00-00-FE-FF |
20000 | x-Chinese-CNS | 繁体中文(CNS) | (:DBCSCodePageEncoding) | |
20001 | x-cp20001 | TCA 台湾 | (:DBCSCodePageEncoding) | |
20002 | x-Chinese-Eten | 繁体中文(Eten) | (:DBCSCodePageEncoding) | |
20003 | x-cp20003 | IBM5550 台湾 | (:DBCSCodePageEncoding) | |
20004 | x-cp20004 | TeleText 台湾 | (:DBCSCodePageEncoding) | |
20005 | x-cp20005 | Wang 台湾 | (:DBCSCodePageEncoding) | |
20105 | x-IA5 | 西欧字符(IA5) | (:SBCSCodePageEncoding) | 1-Byte |
20106 | x-IA5-German | 德语(IA5) | (:SBCSCodePageEncoding) | 1-Byte |
20107 | x-IA5-Swedish | 瑞典语(IA5) | (:SBCSCodePageEncoding) | 1-Byte |
20108 | x-IA5-Norwegian | 挪威语(IA5) | (:SBCSCodePageEncoding) | 1-Byte |
20127 | us-ascii | US-ASCII | :ASCIIEncoding | 1-Byte MNDisp MNSave |
20261 | x-cp20261 | T.61 | (:DBCSCodePageEncoding) | |
20269 | x-cp20269 | ISO-6937 | (:SBCSCodePageEncoding) | 1-Byte |
20273 | IBM273 | IBM EBCDIC (德国) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20277 | IBM277 | IBM EBCDIC (丹麦-挪威) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20278 | IBM278 | IBM EBCDIC (芬兰-瑞典) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20280 | IBM280 | IBM EBCDIC (意大利) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20284 | IBM284 | IBM EBCDIC (西班牙) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20285 | IBM285 | IBM EBCDIC (UK) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20290 | IBM290 | IBM EBCDIC (日语片假名) | (:SBCSCodePageEncoding) | 1-Byte |
20297 | IBM297 | IBM EBCDIC (法国) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20420 | IBM420 | IBM EBCDIC (阿拉伯语) | (:SBCSCodePageEncoding) | 1-Byte |
20423 | IBM423 | IBM EBCDIC (希腊语) | (:SBCSCodePageEncoding) | 1-Byte |
20424 | IBM424 | IBM EBCDIC (希伯来语) | (:SBCSCodePageEncoding) | 1-Byte |
20833 | x-EBCDIC-KoreanExtended | IBM EBCDIC (朝鲜语扩展) | (:SBCSCodePageEncoding) | 1-Byte |
20838 | IBM-Thai | IBM EBCDIC (泰语) | (:SBCSCodePageEncoding) | 1-Byte |
20866 | koi8-r | 西里尔字符(KOI8-R) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
20871 | IBM871 | IBM EBCDIC (冰岛语) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20880 | IBM880 | IBM EBCDIC (西里尔俄语) | (:SBCSCodePageEncoding) | 1-Byte Norm |
20905 | IBM905 | IBM EBCDIC (土耳其语) | (:SBCSCodePageEncoding) | 1-Byte |
20924 | IBM00924 | IBM 拉丁语 1 | (:SBCSCodePageEncoding) | 1-Byte Norm |
20932 | EUC-JP | 日语(JIS 0208-1990 和 0212-1990) | (:DBCSCodePageEncoding) | |
20936 | x-cp20936 | 简体中文(GB2312-80) | (:DBCSCodePageEncoding) | |
20949 | x-cp20949 | 朝鲜语 Wansung | (:DBCSCodePageEncoding) | |
21025 | cp1025 | IBM EBCDIC (西里尔塞尔维亚-保加利亚语) | (:SBCSCodePageEncoding) | 1-Byte Norm |
21866 | koi8-u | 西里尔字符(KOI8-U) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28591 | iso-8859-1 | 西欧字符(ISO) | (:Latin1Encoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28592 | iso-8859-2 | 中欧字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28593 | iso-8859-3 | 拉丁语 3 (ISO) | (:SBCSCodePageEncoding) | 1-Byte MNDisp MNSave |
28594 | iso-8859-4 | 波罗的海字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28595 | iso-8859-5 | 西里尔字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28596 | iso-8859-6 | 阿拉伯字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
28597 | iso-8859-7 | 希腊字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
28598 | iso-8859-8 | 希伯来字符(ISO-Visual) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave |
28599 | iso-8859-9 | 土耳其字符(ISO) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28603 | iso-8859-13 | 爱沙尼亚语(ISO) | (:SBCSCodePageEncoding) | 1-Byte MNDisp MNSave Norm |
28605 | iso-8859-15 | 拉丁语 9 (ISO) | (:SBCSCodePageEncoding) | 1-Byte BrSave MNDisp MNSave Norm |
29001 | x-Europa | 欧罗巴 | (:SBCSCodePageEncoding) | 1-Byte |
38598 | iso-8859-8-i | 希伯来字符(ISO-Logical) | (:SBCSCodePageEncoding) | 1-Byte BrDisp BrSave MNDisp MNSave |
50220 | iso-2022-jp | 日语(JIS) | (:ISO2022Encoding) | MNDisp MNSave |
50221 | csISO2022JP | 日语(JIS-允许 1 字节假名) | (:ISO2022Encoding) | BrSave MNDisp MNSave |
50222 | iso-2022-jp | 日语(JIS-允许 1 字节假名 - SO/SI) | (:ISO2022Encoding) | |
50225 | iso-2022-kr | 朝鲜语(ISO) | (:ISO2022Encoding) | MNDisp |
50227 | x-cp50227 | 简体中文(ISO-2022) | (:DBCSCodePageEncoding) | |
51932 | euc-jp | 日语(EUC) | (:EUCJPEncoding) | BrDisp BrSave MNDisp MNSave |
51936 | EUC-CN | 简体中文(EUC) | (:DBCSCodePageEncoding) | |
51949 | euc-kr | 朝鲜语(EUC) | (:DBCSCodePageEncoding) | MNDisp MNSave |
52936 | hz-gb-2312 | 简体中文(HZ) | (:ISO2022Encoding) | BrDisp BrSave MNDisp MNSave |
54936 | GB18030 | 简体中文(GB18030) | (:GB18030Encoding) | BrDisp BrSave MNDisp MNSave |
57002 | x-iscii-de | ISCII 梵文 | (:ISCIIEncoding) | |
57003 | x-iscii-be | ISCII 孟加拉语 | (:ISCIIEncoding) | |
57004 | x-iscii-ta | ISCII 泰米尔语 | (:ISCIIEncoding) | |
57005 | x-iscii-te | ISCII 泰卢固语 | (:ISCIIEncoding) | |
57006 | x-iscii-as | ISCII 阿萨姆语 | (:ISCIIEncoding) | |
57007 | x-iscii-or | ISCII 奥里雅语 | (:ISCIIEncoding) | |
57008 | x-iscii-ka | ISCII 卡纳达语 | (:ISCIIEncoding) | |
57009 | x-iscii-ma | ISCII 马拉雅拉姆语 | (:ISCIIEncoding) | |
57010 | x-iscii-gu | ISCII 古吉拉特语 | (:ISCIIEncoding) | |
57011 | x-iscii-pa | ISCII 旁遮普语 | (:ISCIIEncoding) | |
65000 | utf-7 | Unicode (UTF-7) | :UTF7Encoding | MNDisp MNSave |
65001 | utf-8 | Unicode (UTF-8) | :UTF8Encoding | BrDisp BrSave MNDisp MNSave BOM:EF-BB-BF |
几点说明:
- BOM: Byte order mark,用以标记字符编码的字节顺序,来自 Encoding.GetPreamble 方法。
- BrDisp: 指示浏览器客户端是否可以使用当前的编码显示内容,来自 Encoding.IsBrowserDisplay 属性。
- BrSave: 指示浏览器客户端是否可以使用当前的编码保存内容,来自 Encoding.IsBrowserSave 属性。
- MNDisp: 指示邮件和新闻客户端是否可以使用当前的编码显示内容,来自 Encoding.IsMailNewsDisplay 属性。
- MNSave: 指示邮件和新闻客户端是否可以使用当前的编码保存内容,来自 Encoding.IsMailNewsSave 属性。
- Norm: 指示当前编码是否始终被规范化,来自 Encoding.IsAlwaysNormalized 方法。
- Writeable: 指示当前编码不是只读的,来自 Encoding.IsReadOnly 属性。(上表中未出现)
- 1-Byte: 指示当前的编码是否使用单字节码位,来自 Encoding.IsSingleByte 属性。
- Class 栏中类名前面的“:”表示 System.Text 命名空间。类名如果被小括号括住,表示非公共类型(Type.IsNotPublic)。
可以看出,在我们的中文版的 Windows 操作系统中,默认的字符编码就是著名的“简体中文(GB2312) [936:gb2312]”。系统所支持的字符编码数量是 140 个。其实常用的只有以下三个:
- Unicode (UTF-8) [65001:utf-8], Encoding.UTF8
- Unicode [1200:utf-16], Encoding.Unicode
- 简体中文(GB18030) [54936:GB18030], Encoding.GetEncoding("GB18030")
注意,我没有把 Windows 操作系统默认的字符编码 GB2312 列入常用的,是因为它完全可以被 GB18030 取代。
在 Linux 操作系统中编译和运行
在 Ubuntu 10.10 Server 操作系统的 mono 2.8.2 环境中编译和运行:
ben@D1520:~/work/EncodingTester$ dmcs @build.rsp ben@D1520:~/work/EncodingTester$ mono28 EncodingTester.exe
这次生成的 EncodingTester.html 文件的内容如下所示:
操作系统 | Unix 2.6.35.24 |
公共语言运行库 | 4.0.30319.1 [2.8.2 (tarball Mon Jan 10 19:16:25 CST 2011)] |
默认字符编码 | Unicode (UTF-8) [65001:utf-8] |
字节顺序 | Little-Endian |
字符编码数量 | 95 |
CodePage | WebName | EncodingName | Class | Memo |
---|---|---|---|---|
37 | IBM037 | IBM EBCDIC (US-Canada) | I18N.Rare.CP37 | 1-Byte Norm |
437 | IBM437 | OEM United States | I18N.West.CP437 | 1-Byte Norm |
500 | IBM500 | IBM EBCDIC (International) | I18N.Rare.CP500 | 1-Byte Norm |
708 | asmo-708 | Arabic (ASMO 708) | I18N.Rare.CP708 | 1-Byte Norm |
850 | ibm850 | Western European (DOS) | I18N.West.CP850 | 1-Byte Norm |
852 | ibm852 | Central European (DOS) | I18N.Rare.CP852 | 1-Byte Norm |
855 | ibm855 | Cyrillic (DOS) | I18N.Rare.CP855 | 1-Byte Norm |
857 | ibm857 | Turkish (DOS) | I18N.Rare.CP857 | 1-Byte Norm |
858 | IBM00858 | Western European (DOS with Euro) | I18N.Rare.CP858 | 1-Byte Norm |
860 | ibm860 | Portuguese (DOS) | I18N.West.CP860 | 1-Byte Norm |
861 | ibm861 | Icelandic (DOS) | I18N.West.CP861 | 1-Byte Norm |
862 | ibm862 | Hebrew (DOS) | I18N.Rare.CP862 | 1-Byte Norm |
863 | IBM863 | French Canadian (DOS) | I18N.West.CP863 | 1-Byte Norm |
864 | ibm864 | Arabic (DOS) | I18N.Rare.CP864 | 1-Byte Norm |
865 | IBM865 | Nordic (DOS) | I18N.West.CP865 | 1-Byte Norm |
866 | ibm866 | Russian (DOS) | I18N.Rare.CP866 | 1-Byte Norm |
869 | ibm869 | Greek (DOS) | I18N.Rare.CP869 | 1-Byte |
870 | ibm870 | IBM EBCDIC (Latin 2) | I18N.Rare.CP870 | 1-Byte Norm |
874 | windows-874 | Thai (Windows) | I18N.Other.CP874 | 1-Byte BrDisp BrSave MNDisp MNSave |
875 | ibm875 | IBM EBCDIC (Greek) | I18N.Rare.CP875 | 1-Byte |
932 | shift_jis | Japanese (Shift-JIS) | I18N.CJK.CP932 | BrDisp BrSave MNDisp MNSave |
936 | gb2312 | Chinese Simplified (GB2312) | (I18N.CJK.CP936) | BrDisp BrSave MNDisp MNSave |
949 | ks_c_5601-1987 | Korean (UHC) | (I18N.CJK.CP949) | BrDisp BrSave MNDisp MNSave |
950 | big5 | Chinese Traditional (Big5) | (I18N.CJK.CP950) | BrDisp BrSave MNDisp MNSave |
1026 | ibm1026 | IBM EBCDIC (Turkish) | I18N.Rare.CP1026 | 1-Byte Norm |
1047 | ibm1047 | IBM EBCDIC (Open Systems Latin 1) | I18N.Rare.CP1047 | 1-Byte Norm |
1140 | IBM01140 | IBM EBCDIC (US-Canada with Euro) | I18N.Rare.CP1140 | 1-Byte Norm |
1141 | IBM01141 | IBM EBCDIC (Germany with Euro) | I18N.Rare.CP1141 | 1-Byte Norm |
1142 | IBM01142 | IBM EBCDIC (Denmark/Norway with Euro) | I18N.Rare.CP1142 | 1-Byte Norm |
1143 | IBM01143 | IBM EBCDIC (Finland/Sweden with Euro) | I18N.Rare.CP1143 | 1-Byte Norm |
1144 | ibm1144 | IBM EBCDIC (Italy with Euro) | I18N.Rare.CP1144 | 1-Byte Norm |
1145 | ibm1145 | IBM EBCDIC (Latin America/Spain with Euro) | I18N.Rare.CP1145 | 1-Byte Norm |
1146 | ibm1146 | IBM EBCDIC (United Kingdom with Euro) | I18N.Rare.CP1146 | 1-Byte Norm |
1147 | ibm1147 | IBM EBCDIC (France with Euro) | I18N.Rare.CP1147 | 1-Byte Norm |
1148 | ibm1148 | IBM EBCDIC (International with Euro) | I18N.Rare.CP1148 | 1-Byte Norm |
1149 | ibm1149 | IBM EBCDIC (Icelandic with Euro) | I18N.Rare.CP1149 | 1-Byte Norm |
1200 | utf-16 | Unicode | :UnicodeEncoding | BrSave BOM:FF-FE |
1201 | unicodeFFFE | Unicode (Big-Endian) | :UnicodeEncoding | BOM:FE-FF |
1250 | windows-1250 | Central European (Windows) | I18N.West.CP1250 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1251 | windows-1251 | Cyrillic (Windows) | I18N.Other.CP1251 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1252 | Windows-1252 | Western European (Windows) | I18N.West.CP1252 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1253 | windows-1253 | Greek (Windows) | I18N.West.CP1253 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1254 | windows-1254 | Turkish (Windows) | I18N.MidEast.CP1254 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1255 | windows-1255 | Hebrew (Windows) | I18N.MidEast.CP1255 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1256 | windows-1256 | Arabic (Windows) | I18N.MidEast.CP1256 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1257 | windows-1257 | Baltic (Windows) | I18N.Other.CP1257 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
1258 | windows-1258 | Vietnamese (Windows) | I18N.Other.CP1258 | 1-Byte BrDisp BrSave MNDisp MNSave |
10000 | macintosh | Western European (Mac) | I18N.West.CP10000 | 1-Byte |
10079 | x-mac-icelandic | Icelandic (Mac) | I18N.West.CP10079 | 1-Byte |
12000 | utf-32 | UTF-32 | :UTF32Encoding | BOM:FF-FE-00-00 |
12001 | utf-32BE | UTF-32 (Big-Endian) | :UTF32Encoding | BOM:00-00-FE-FF |
20127 | us-ascii | US-ASCII | :ASCIIEncoding | 1-Byte MNDisp MNSave Norm |
20273 | IBM273 | IBM EBCDIC (Germany) | I18N.Rare.CP20273 | 1-Byte Norm |
20277 | IBM277 | IBM EBCDIC (Denmark/Norway) | I18N.Rare.CP20277 | 1-Byte Norm |
20278 | IBM278 | IBM EBCDIC (Finland/Sweden) | I18N.Rare.CP20278 | 1-Byte Norm |
20280 | IBM280 | IBM EBCDIC (Italy) | I18N.Rare.CP20280 | 1-Byte Norm |
20284 | IBM284 | IBM EBCDIC (Latin America/Spain) | I18N.Rare.CP20284 | 1-Byte Norm |
20285 | IBM285 | IBM EBCDIC (United Kingdom) | I18N.Rare.CP20285 | 1-Byte Norm |
20290 | IBM290 | IBM EBCDIC (Japanese Katakana Extended) | I18N.Rare.CP20290 | 1-Byte Norm |
20297 | IBM297 | IBM EBCDIC (France) | I18N.Rare.CP20297 | 1-Byte Norm |
20420 | IBM420 | IBM EBCDIC (Arabic) | I18N.Rare.CP20420 | 1-Byte Norm |
20424 | IBM424 | IBM EBCDIC (Hebrew) | I18N.Rare.CP20424 | 1-Byte Norm |
20866 | koi8-r | Cyrillic (KOI8-R) | I18N.Other.CP20866 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
20871 | IBM871 | IBM EBCDIC (Icelandic) | I18N.Rare.CP20871 | 1-Byte Norm |
21025 | IBM1025 | IBM EBCDIC (Cyrillic - Serbian, Bulgarian) | I18N.Rare.CP21025 | 1-Byte Norm |
21866 | koi8-u | Ukrainian (KOI8-U) | I18N.Other.CP21866 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28591 | iso-8859-1 | Western European (ISO) | (:Latin1Encoding) | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28592 | iso-8859-2 | Central European (ISO) | I18N.West.CP28592 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28593 | iso-8859-3 | Latin 3 (ISO) | I18N.West.CP28593 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28594 | iso-8859-4 | Baltic (ISO) | I18N.Other.CP28594 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28595 | iso-8859-5 | Cyrillic (ISO) | I18N.Other.CP28595 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28596 | iso-8859-6 | Arabic (ISO) | I18N.MidEast.CP28596 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28597 | iso-8859-7 | Greek (ISO) | I18N.West.CP28597 | 1-Byte BrDisp BrSave MNDisp MNSave |
28598 | iso-8859-8 | Hebrew (ISO) | I18N.MidEast.CP28598 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28599 | iso-8859-9 | Latin 5 (ISO) | I18N.MidEast.CP28599 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
28605 | iso-8859-15 | Latin 9 (ISO) | I18N.West.CP28605 | 1-Byte BrSave MNDisp MNSave Norm |
38598 | windows-38598 | Hebrew (ISO Alternative) | I18N.MidEast.CP38598 | 1-Byte BrDisp BrSave MNDisp MNSave Norm |
50220 | csISO2022JP | Japanese (JIS) | I18N.CJK.CP50220 | |
50221 | csISO2022JP | Japanese (JIS-Allow 1 byte Kana) | I18N.CJK.CP50221 | |
50222 | csISO2022JP | Japanese (JIS-Allow 1 byte Kana - SO/SI) | I18N.CJK.CP50222 | |
51932 | euc-jp | Japanese (EUC) | I18N.CJK.CP51932 | BrDisp BrSave MNDisp MNSave |
51949 | euc-kr | Korean (EUC) | (I18N.CJK.CP51949) | BrDisp BrSave MNDisp MNSave |
54936 | GB18030 | Chinese Simplified (GB18030) | I18N.CJK.CP54936 | BrDisp BrSave MNDisp MNSave |
57002 | x-iscii-de | ISCII Devanagari | I18N.Other.CP57002 | |
57003 | x-iscii-be | ISCII Bengali | I18N.Other.CP57003 | |
57004 | x-iscii-ta | ISCII Tamil | I18N.Other.CP57004 | |
57005 | x-iscii-te | ISCII Telugu | I18N.Other.CP57005 | |
57006 | x-iscii-as | ISCII Assamese | I18N.Other.CP57006 | |
57007 | x-iscii-or | ISCII Oriya | I18N.Other.CP57007 | |
57008 | x-iscii-ka | ISCII Kannada | I18N.Other.CP57008 | |
57009 | x-iscii-ma | ISCII Malayalam | I18N.Other.CP57009 | |
57010 | x-iscii-gu | ISCII Gujarati | I18N.Other.CP57010 | |
57011 | x-iscii-pa | ISCII Punjabi | I18N.Other.CP57011 | |
65000 | utf-7 | Unicode (UTF-7) | :UTF7Encoding | MNDisp MNSave |
65001 | utf-8 | Unicode (UTF-8) | :UTF8Encoding | BrDisp BrSave MNDisp MNSave BOM:EF-BB-BF |
可以看出,在我们的 Ubuntu 操作系统中,默认字符编码是“Unicode (UTF-8) [65001:utf-8]”。系统所支持的字符编码数量减少到 95 个。还可以看到,在 Mono 中,很多表示字符编码的类不再位于 System.Text 命名空间中,而是在 I18N 底下的命名空间中。而且大部分都是公共类型。
测试用的 Ubuntu 10.10 Server 操作系统是英文版的,但不影响它有 GB18030 等中文编码。该操作系统的安装过程请参见“从源代码编译 Mono 2.8.2 是否需要低版本的 Mono”一文。
其他源程序
我们前面只给出一个最主要的 C# 源程序 EncodingTester.cs。其余的源程序现在给出,下面就是 Utilities.cs:
using System;
using System.Text;
using System.Collections.Generic;

namespace Skyiv
{
    /// <summary>
    /// Helpers that describe the environment the program is running in.
    /// </summary>
    public static class Utilities
    {
        /// <summary>
        /// Collects four label/value pairs, in this order: operating system,
        /// runtime version, default character encoding and byte order.
        /// </summary>
        /// <returns>A new, mutable list the caller may append further entries to.</returns>
        public static List<Tuple<string, string>> GetEnvironmentInfo()
        {
            return new List<Tuple<string, string>>
            {
                Entry("操作系统", Environment.OSVersion),
                Entry("公共语言运行库", Environment.Version + " [" + RuntimeFramework.CurrentFramework + "]"),
                Entry("默认字符编码", DescribeDefaultEncoding()),
                Entry("字节顺序", (BitConverter.IsLittleEndian ? "Little" : "Big") + "-Endian"),
            };
        }

        /// <summary>
        /// Formats <see cref="Encoding.Default"/> as "EncodingName [CodePage:WebName]".
        /// </summary>
        static string DescribeDefaultEncoding()
        {
            var current = Encoding.Default;
            return string.Format("{0} [{1}:{2}]", current.EncodingName, current.CodePage, current.WebName);
        }

        /// <summary>
        /// Pairs a label with the string form of <paramref name="value"/>.
        /// </summary>
        static Tuple<string, string> Entry(string key, object value)
        {
            return Tuple.Create(key, value.ToString());
        }
    }
}
而 RuntimeFramework.cs 请参见“.NET Framework CLR 版本检测”一文。HtmlMaker.cs、HtmlTable.cs 和 ExtensionMethods.cs 请参见“以 Html 表格展现数据”一文。