判断文件是否为UTF-8编码(以前收集的)

  1        private bool CheckEncoding(string strFileName)
  2        {
  3            using (FileStream stream = new FileStream(strFileName, FileMode.Open))
  4            {
  5                byte[] bs = new byte[stream.Length];
  6                stream.Read(bs, 0, bs.Length);
  7                if (utf8_probability(bs) > 0return true;
  8                else return false;
  9
 10                /*
 11                if (stream != null && stream.Length >= 2)
 12                {     
 13                    //保存文件流的前4个字节
 14                    byte byte1 = 0;
 15                    byte byte2 = 0;
 16                    byte byte3 = 0;
 17                    byte byte4 = 0;
 18                    //保存当前Seek位置
 19                    long origPos = stream.Seek(0, SeekOrigin.Begin);
 20                    stream.Seek(0, SeekOrigin.Begin);
 21                    int nByte = stream.ReadByte();
 22                    byte1 = Convert.ToByte(nByte);
 23                    byte2 = Convert.ToByte(stream.ReadByte());
 24                    if (stream.Length >= 3)
 25                    {
 26                        byte3 = Convert.ToByte(stream.ReadByte());
 27                    }
 28                    if (stream.Length >= 4)
 29                    {
 30                        byte4 = Convert.ToByte(stream.ReadByte());
 31                    }
 32
 33                    //根据文件流的前4个字节判断Encoding
 34                    //Unicode {0xFF, 0xFE};
 35                    //BE-Unicode {0xFE, 0xFF};
 36                    //UTF8 = {0xEF, 0xBB, 0xBF};
 37                    if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
 38                    {
 39                        targetEncoding = Encoding.BigEndianUnicode;
 40                    }
 41                    if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
 42                    {
 43                        targetEncoding = Encoding.Unicode;
 44                    }
 45                    if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
 46                    {
 47                        targetEncoding = Encoding.UTF8;
 48                    }
 49                    //恢复Seek位置       
 50                    stream.Seek(origPos, SeekOrigin.Begin);
 51                  
 52                }*/

 53            }

 54        }

 55        
 56        
 57        private int utf8_probability(byte[] rawtext)
 58        {
 59            int score = 0;
 60            int i, rawtextlen = 0;
 61            int goodbytes = 0, asciibytes = 0;
 62
 63            // Maybe also use UTF8 Byte Order Mark:  EF BB BF
 64
 65            // Check to see if characters fit into acceptable ranges
 66            rawtextlen = rawtext.Length;
 67            for (i = 0; i < rawtextlen; i++)
 68            {
 69                if ((rawtext[i] & (byte)0x7F== rawtext[i])
 70                {  // One byte
 71                    asciibytes++;
 72                    // Ignore ASCII, can throw off count
 73                }

 74                else
 75                {
 76                    int m_rawInt0 = Convert.ToInt16(rawtext[i]);
 77                    int m_rawInt1 = Convert.ToInt16(rawtext[i + 1]);
 78                    int m_rawInt2 = Convert.ToInt16(rawtext[i + 2]);
 79
 80                    if (256 - 64 <= m_rawInt0 && m_rawInt0 <= 256 - 33 && // Two bytes
 81                     i + 1 < rawtextlen &&
 82                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65)
 83                    {
 84                        goodbytes += 2;
 85                        i++;
 86                    }

 87                    else if (256 - 32 <= m_rawInt0 && m_rawInt0 <= 256 - 17 && // Three bytes
 88                     i + 2 < rawtextlen &&
 89                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65 &&
 90                     256 - 128 <= m_rawInt2 && m_rawInt2 <= 256 - 65)
 91                    {
 92                        goodbytes += 3;
 93                        i += 2;
 94                    }

 95                }

 96            }

 97
 98            if (asciibytes == rawtextlen) return 0; }
 99
100            score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));
101
102            // If not above 98, reduce to zero to prevent coincidental matches
103            // Allows for some (few) bad formed sequences
104            if (score > 98)
105            {
106                return score;
107            }

108            else if (score > 95 && goodbytes > 30)
109            {
110                return score;
111            }

112            else
113            {
114                return 0;
115            }

116
117        }

posted on 2007-07-02 14:17  房客  阅读(1972)  评论(2)  编辑  收藏  举报

导航