自动判断文本文件编码来读取文本文件内容(.net版本和java版本)
.net版本
using System;
using System.IO;
using System.Text;

namespace G2.Common
{
    /// <summary>
    /// Detects the text encoding of a file, first from its byte-order mark (BOM) and,
    /// when no BOM is present, by checking whether the content is structurally valid UTF-8.
    /// </summary>
    public static class TextEncodingHelper
    {
        /// <summary>Length of the longest BOM that is recognised (UTF-8: EF BB BF).</summary>
        private const int BomProbeLength = 3;

        /// <summary>Chunk size used while scanning the remainder of the stream.</summary>
        private const int ScanBufferSize = 8192;

        /// <summary>
        /// Determines the encoding of a text file.
        /// <para>
        /// Recognised BOMs: <c>EF BB BF</c> (UTF-8), <c>FE FF</c> (UTF-16 big endian) and
        /// <c>FF FE</c> (UTF-16 little endian). Without a BOM the whole file is scanned; if every
        /// byte sequence is well-formed UTF-8 (pure ASCII included) UTF-8 is returned, otherwise
        /// <see cref="Encoding.Default"/> is returned. An empty file yields <see cref="Encoding.Default"/>.
        /// </para>
        /// </summary>
        /// <param name="filename">Path of the file to inspect.</param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static System.Text.Encoding GetFileEncoding(this string filename)
        {
            if (!File.Exists(filename))
            {
                // FileNotFoundException derives from Exception, so callers that caught the
                // previously thrown base Exception keep working.
                throw new FileNotFoundException("文件\"" + filename + "\"不存在!", filename);
            }

            using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
            {
                return GetEncodingWithBomUtf8(fs, Encoding.Default);
            }
        }

        /// <summary>
        /// Detects the encoding of the data in <paramref name="fs"/>, reading from the current
        /// position to the end of the stream. The stream is consumed but not disposed; the
        /// caller keeps ownership.
        /// </summary>
        /// <param name="fs">Stream positioned at the first byte of the text.</param>
        /// <param name="defaultEncoding">Encoding returned when no BOM is found and the data is not valid UTF-8.</param>
        /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
        private static System.Text.Encoding GetEncodingWithBomUtf8(Stream fs, Encoding defaultEncoding)
        {
            var head = new byte[BomProbeLength];
            int headLength = ReadHead(fs, head);

            if (headLength >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
            {
                return Encoding.UTF8;
            }

            if (headLength >= 2 && head[0] == 0xFE && head[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode;
            }

            // Only two bytes are inspected, so a UTF-32 LE BOM (FF FE 00 00) is also reported here.
            if (headLength >= 2 && head[0] == 0xFF && head[1] == 0xFE)
            {
                return Encoding.Unicode;
            }

            if (headLength == 0)
            {
                // Nothing to analyse.
                return defaultEncoding;
            }

            // No BOM: the probe bytes are real content, so they are validated first and the
            // multi-byte state is carried over into the rest of the stream.
            int pendingContinuations = 0;
            if (!IsUtf8Bytes(head, headLength, ref pendingContinuations))
            {
                return defaultEncoding;
            }

            var buffer = new byte[ScanBufferSize];
            int read;
            while ((read = fs.Read(buffer, 0, buffer.Length)) > 0)
            {
                if (!IsUtf8Bytes(buffer, read, ref pendingContinuations))
                {
                    return defaultEncoding;
                }
            }

            // A sequence cut off by the end of the data is not valid UTF-8.
            return pendingContinuations == 0 ? Encoding.UTF8 : defaultEncoding;
        }

        /// <summary>
        /// Fills <paramref name="head"/> from the stream, looping because a single
        /// <see cref="Stream.Read(byte[], int, int)"/> call may return fewer bytes than requested.
        /// </summary>
        /// <returns>The number of bytes actually read (less than the array length only at end of stream).</returns>
        private static int ReadHead(Stream stream, byte[] head)
        {
            int total = 0;
            while (total < head.Length)
            {
                int read = stream.Read(head, total, head.Length - total);
                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return total;
        }

        /// <summary>
        /// Checks the first <paramref name="count"/> bytes of <paramref name="data"/> for
        /// UTF-8 structure (lead byte followed by the right number of 10xxxxxx bytes).
        /// </summary>
        /// <param name="data">Buffer holding the bytes to check.</param>
        /// <param name="count">Number of valid bytes in <paramref name="data"/>.</param>
        /// <param name="pendingContinuations">
        /// Continuation bytes still expected for the character in progress. Carried between
        /// calls so that a character split across two buffers is validated correctly.
        /// </param>
        /// <returns><c>false</c> as soon as a byte violates the UTF-8 structure; otherwise <c>true</c>.</returns>
        private static bool IsUtf8Bytes(byte[] data, int count, ref int pendingContinuations)
        {
            for (int i = 0; i < count; i++)
            {
                byte current = data[i];

                if (pendingContinuations > 0)
                {
                    // Inside a multi-byte character: the byte must look like 10xxxxxx.
                    if ((current & 0xC0) != 0x80)
                    {
                        return false;
                    }

                    pendingContinuations--;
                }
                else if (current >= 0x80)
                {
                    pendingContinuations = GetContinuationCount(current);
                    if (pendingContinuations < 0)
                    {
                        return false;
                    }
                }
            }

            return true;
        }

        /// <summary>
        /// Returns how many continuation bytes must follow a non-ASCII lead byte, or -1 when
        /// the byte cannot start a UTF-8 sequence. Follows the RFC 3629 lead-byte ranges:
        /// C2-DF (2-byte), E0-EF (3-byte), F0-F4 (4-byte). Stray continuation bytes (80-BF),
        /// overlong leads (C0, C1) and F5-FF are rejected.
        /// </summary>
        private static int GetContinuationCount(byte lead)
        {
            if (lead >= 0xC2 && lead <= 0xDF)
            {
                return 1;
            }

            if (lead >= 0xE0 && lead <= 0xEF)
            {
                return 2;
            }

            if (lead >= 0xF0 && lead <= 0xF4)
            {
                return 3;
            }

            return -1;
        }
    }
}
java版本
import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.util.BitSet; public class EncodeUtils { private static final Logger logger = LoggerFactory.getLogger(EncodeUtils.class); private static final int BYTE_SIZE = 8; private static final String CODE_UTF8 = "UTF-8"; private static final String CODE_UTF16 = "UTF-16";//Unicode private static final String CODE_UTF16LE = "UTF-16LE";//Unicode big endian private static final String CODE_GBK = "GBK"; //ABSU /** * 通过文件全名称获取编码集名称 */ public static String getEncode(String fullFileName) throws Exception { BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName)); return getEncode(bis, CODE_GBK); } /** * 通过文件全名称获取编码集名称 */ public static String getEncode(String fullFileName, String defaultEncoding) throws Exception { BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName)); return getEncode(bis, defaultEncoding); } /** * 通过文件缓存流获取编码集名称,文件流必须为未曾 * * @param bis 文件流 */ public static String getEncode(BufferedInputStream bis, String defaultEncoding) throws Exception { bis.mark(0); String encodeType; byte[] head = new byte[3]; bis.read(head); if (head[0] == -1 && head[1] == -2 && head[2] == (byte) 0x41) { encodeType = CODE_UTF16; } else if (head[0] == -2 && head[1] == -1 && head[2] == 0) { //encodeType = "Unicode"; encodeType = CODE_UTF16LE; } else if (head[0] == -17 && head[1] == -69 && head[2] == -65) { //带BOM的UTF8 (CODE_UTF8_BOM) encodeType = CODE_UTF8; } else { if (isUTF8(bis)) { encodeType = CODE_UTF8; } else { encodeType = defaultEncoding; } } return encodeType; } /** * 是否是无BOM的UTF8格式,不判断常规场景,只区分无BOM UTF8和GBK */ private static boolean isUTF8(BufferedInputStream bis) throws Exception { bis.reset(); //读取第一个字节 int code = bis.read(); do { BitSet bitSet = convert2BitSet(code); //判断是否为单字节 if (bitSet.get(0)) {//多字节时,再读取N个字节 if (!checkMultiByte(bis, bitSet)) {//未检测通过,直接返回 return false; } } code = 
bis.read(); } while (code != -1); return true; } /** * 检测多字节,判断是否为utf8,已经读取了一个字节 */ private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception { int count = getCountOfSequential(bitSet); byte[] bytes = new byte[count - 1];//已经读取了一个字节,不能再读取 bis.read(bytes); for (byte b : bytes) { if (!checkUtf8Byte(b)) { return false; } } return true; } /** * 检测bitSet中从开始有多少个连续的1 */ private static int getCountOfSequential(BitSet bitSet) { int count = 0; for (int i = 0; i < BYTE_SIZE; i++) { if (bitSet.get(i)) { count++; } else { break; } } return count; } /** * 检测单字节,判断是否为utf8 */ private static boolean checkUtf8Byte(byte b) throws Exception { BitSet bitSet = convert2BitSet(b); return bitSet.get(0) && !bitSet.get(1); } /** * 将整形转为BitSet */ private static BitSet convert2BitSet(int code) { BitSet bitSet = new BitSet(BYTE_SIZE); for (int i = 0; i < BYTE_SIZE; i++) { int tmp3 = code >> (BYTE_SIZE - i - 1); int tmp2 = 0x1 & tmp3; if (tmp2 == 1) { bitSet.set(i); } } return bitSet; } public static void main(String[] args) { String filePath = "C:\\110025.txt"; try { String encoding = getEncode(filePath); System.out.println(encoding); } catch (Exception ex) { logger.warn("文件检测编码出错!", ex); } } }