通过给定的文件流,判断文件的编码类型

 1 /// <summary>
 2         /// 通过给定的文件流,判断文件的编码类型
 3         /// </summary>
 4         /// <param name="fs">文件流</param>
 5         /// <returns>文件的编码类型</returns>
 6         public static System.Text.Encoding GetEncoding(Stream fs)
 7         {
 8             byte[] Unicode = new byte[] { 0xFF, 0xFE, 0x41 };
 9             byte[] UnicodeBIG = new byte[] { 0xFE, 0xFF, 0x00 };
10             byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF }; //带BOM
11             Encoding reVal = Encoding.Default;
12 
13             BinaryReader r = new BinaryReader(fs, System.Text.Encoding.Default);
14             byte[] ss = r.ReadBytes(4);
15             if (ss[0] == 0xFE && ss[1] == 0xFF && ss[2] == 0x00)
16             {
17                 reVal = Encoding.BigEndianUnicode;
18             }
19             else if (ss[0] == 0xFF && ss[1] == 0xFE && ss[2] == 0x41)
20             {
21                 reVal = Encoding.Unicode;
22             }
23             else
24             {
25                 if (ss[0] == 0xEF && ss[1] == 0xBB && ss[2] == 0xBF)
26                 {
27                     reVal = Encoding.UTF8;
28                 }
29                 else
30                 {
31                     int i;
32                     int.TryParse(fs.Length.ToString(), out i);
33                     ss = r.ReadBytes(i);
34 
35                     if (IsUTF8Bytes(ss))
36                         reVal = Encoding.UTF8;
37                 }
38             }
39             r.Close();
40             return reVal;
41 
42         }
43 
44         /// <summary>
45         /// 判断是否是不带 BOM 的 UTF8 格式
46         /// </summary>
47         /// <param name="data"></param>
48         /// <returns></returns>
49         private static bool IsUTF8Bytes(byte[] data)
50         {
51             int charByteCounter = 1;  //计算当前正分析的字符应还有的字节数
52             byte curByte; //当前分析的字节.
53             for (int i = 0; i < data.Length; i++)
54             {
55                 curByte = data[i];
56                 if (charByteCounter == 1)
57                 {
58                     if (curByte >= 0x80)
59                     {
60                         //判断当前
61                         while (((curByte <<= 1) & 0x80) != 0)
62                         {
63                             charByteCounter++;
64                         }
65                         //标记位首位若为非0 则至少以2个1开始 如:110XXXXX...........1111110X 
66                         if (charByteCounter == 1 || charByteCounter > 6)
67                         {
68                             return false;
69                         }
70                     }
71                 }
72                 else
73                 {
74                     //若是UTF-8 此时第一位必须为1
75                     if ((curByte & 0xC0) != 0x80)
76                     {
77                         return false;
78                     }
79                     charByteCounter--;
80                 }
81             }
82             if (charByteCounter > 1)
83             {
84                 throw new Exception("非预期的byte格式!");
85             }
86             return true;
87         }
View Code

 

posted @ 2013-06-11 19:20  宁静.致远  阅读(406)  评论(0编辑  收藏  举报