通过给定的文件流,判断文件的编码类型
/// <summary>
/// Determines the text encoding of the data in the given stream.
/// A byte order mark (BOM) at the current position is checked first; when
/// none is present, the remaining content is validated as BOM-less UTF-8.
/// </summary>
/// <param name="fs">
/// Stream to inspect. It is read from its current position to the end and
/// is closed before this method returns.
/// </param>
/// <returns>
/// The encoding indicated by the BOM; otherwise <see cref="Encoding.UTF8"/>
/// when every byte forms a well-formed UTF-8 sequence (this includes empty
/// and pure-ASCII content); otherwise <see cref="Encoding.Default"/>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
public static System.Text.Encoding GetEncoding(Stream fs)
{
    if (fs == null)
    {
        throw new ArgumentNullException("fs");
    }

    // Disposing the reader also closes the underlying stream, even when an
    // exception is thrown while reading.
    using (BinaryReader reader = new BinaryReader(fs, System.Text.Encoding.Default))
    {
        // May return fewer than 4 bytes when the stream is shorter than that.
        byte[] head = reader.ReadBytes(4);

        Encoding fromBom = GetEncodingFromBom(head);
        if (fromBom != null)
        {
            return fromBom;
        }

        // No BOM: validate the whole content, *including* the bytes already
        // consumed above. Dropping them could cut a multi-byte character in
        // half and make valid UTF-8 look invalid.
        using (MemoryStream content = new MemoryStream())
        {
            content.Write(head, 0, head.Length);
            fs.CopyTo(content); // works for non-seekable streams too

            return IsUTF8Bytes(content.ToArray()) ? Encoding.UTF8 : Encoding.Default;
        }
    }
}

/// <summary>
/// Maps the leading bytes of a stream to the encoding announced by their
/// byte order mark.
/// </summary>
/// <param name="head">The first bytes of the content (any length, including zero).</param>
/// <returns>The encoding matching the BOM, or null when no known BOM is present.</returns>
private static Encoding GetEncodingFromBom(byte[] head)
{
    // UTF-32 must be tested before UTF-16: the UTF-32 LE mark (FF FE 00 00)
    // begins with the UTF-16 LE mark (FF FE).
    if (head.Length >= 4)
    {
        if (head[0] == 0xFF && head[1] == 0xFE && head[2] == 0x00 && head[3] == 0x00)
        {
            return Encoding.UTF32;
        }

        if (head[0] == 0x00 && head[1] == 0x00 && head[2] == 0xFE && head[3] == 0xFF)
        {
            return new UTF32Encoding(true, true); // big-endian, with BOM
        }
    }

    if (head.Length >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
    {
        return Encoding.UTF8;
    }

    if (head.Length >= 2)
    {
        if (head[0] == 0xFE && head[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }

        if (head[0] == 0xFF && head[1] == 0xFE)
        {
            return Encoding.Unicode;
        }
    }

    return null;
}

/// <summary>
/// Checks whether a byte sequence is structurally valid UTF-8 (without BOM).
/// </summary>
/// <param name="data">The bytes to validate.</param>
/// <returns>
/// True when every byte is either a single-byte character or part of a
/// complete lead-byte/continuation-byte sequence; false otherwise, including
/// when the data ends in the middle of a multi-byte character.
/// </returns>
private static bool IsUTF8Bytes(byte[] data)
{
    // Number of continuation bytes (10xxxxxx) still owed to the current character.
    int pendingContinuations = 0;

    for (int i = 0; i < data.Length; i++)
    {
        byte current = data[i];

        if (pendingContinuations > 0)
        {
            // Inside a multi-byte character: the byte must look like 10xxxxxx.
            if ((current & 0xC0) != 0x80)
            {
                return false;
            }

            pendingContinuations--;
            continue;
        }

        if (current < 0x80)
        {
            continue; // single-byte (ASCII) character
        }

        // Lead byte: the count of leading 1 bits is the total length of the
        // sequence, e.g. 110xxxxx = 2 bytes, 1110xxxx = 3 bytes.
        int leadingOnes = 0;
        for (int mask = 0x80; (current & mask) != 0; mask >>= 1)
        {
            leadingOnes++;
        }

        // A single leading 1 is a stray continuation byte. Lengths up to 6
        // (the legacy 1111110x form) are accepted; anything longer is invalid.
        if (leadingOnes == 1 || leadingOnes > 6)
        {
            return false;
        }

        pendingContinuations = leadingOnes - 1;
    }

    // Data that stops in the middle of a character is not valid UTF-8.
    return pendingContinuations == 0;
}
工欲善其事,必先利其器。