判断文件的字符编码
首先,不同编码的文本文件,通常是通过文件开头的两到三个字节(即BOM,字节顺序标记)来标识其编码格式的。定义如下:
ANSI: 无格式定义;
Unicode(即UTF-16 little endian): 前两个字节为FFFE;
Unicode big endian(即UTF-16 big endian): 前两个字节为FEFF;
UTF-8: 前三个字节为EFBBBF(BOM可以省略,没有BOM时只能根据文件内容来推断);
知道了各种编码格式的区别,写代码就容易了.
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Guesses the character encoding of a text file.
 *
 * <p>Detection first looks for a byte-order mark (BOM) at the start of the file and, when none
 * is present, falls back to a heuristic scan that looks for a UTF-8 multi-byte sequence. Files
 * that show no evidence of UTF-8/UTF-16 are reported as {@code "GBK"}.
 *
 * <p>The listing is written for the default package; add a {@code package} declaration that
 * matches your source tree when copying it into a project.
 */
public class demo {

    /** Charset reported when nothing in the file indicates UTF-8 or UTF-16. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Length in bytes of the longest BOM that is checked (the UTF-8 BOM, EF BB BF). */
    private static final int BOM_LENGTH = 3;

    public static void main(String[] args) {
        String txtCharset = get_charset(new File("e:/1.TXT"));
        System.out.println(txtCharset);
    }

    /**
     * Returns the name of the charset the given file most likely uses.
     *
     * <ul>
     *   <li>{@code FF FE} at the start yields {@code "UTF-16LE"};</li>
     *   <li>{@code FE FF} at the start yields {@code "UTF-16BE"};</li>
     *   <li>{@code EF BB BF} at the start yields {@code "UTF-8"};</li>
     *   <li>without a BOM, the content is scanned and {@code "UTF-8"} is returned as soon as a
     *       well-formed 3-byte or 4-byte UTF-8 sequence is found;</li>
     *   <li>otherwise (including empty and pure-ASCII files) {@code "GBK"} is returned.</li>
     * </ul>
     *
     * <p>This is a best-effort guess: if the file cannot be opened or read, the stack trace is
     * printed to standard error and the default {@code "GBK"} is returned instead of throwing.
     *
     * @param file the file to inspect
     * @return {@code "UTF-8"}, {@code "UTF-16LE"}, {@code "UTF-16BE"} or {@code "GBK"}
     */
    public static String get_charset(File file) {
        String charset = DEFAULT_CHARSET;
        // try-with-resources closes the stream on every path, including the empty-file case
        // and read failures.
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
            byte[] head = new byte[BOM_LENGTH];
            // The read limit must cover the bytes consumed before reset() is called.
            bis.mark(BOM_LENGTH);
            int read = bis.read(head, 0, BOM_LENGTH);
            if (read != -1) {
                // A short read leaves the remaining bytes of head at zero, which never
                // matches a BOM byte.
                String bomCharset = charsetFromBom(head);
                if (bomCharset != null) {
                    charset = bomCharset;
                } else {
                    // No BOM: rewind so the heuristic scan starts at the first byte.
                    bis.reset();
                    if (looksLikeUtf8(bis)) {
                        charset = "UTF-8";
                    }
                }
            }
        } catch (IOException | RuntimeException e) {
            // Deliberate best effort: report the problem and fall through to the default.
            e.printStackTrace();
        }
        return charset;
    }

    /** Maps a leading BOM to its charset name, or returns {@code null} when there is no BOM. */
    private static String charsetFromBom(byte[] head) {
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            return "UTF-16LE";
        }
        if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            return "UTF-16BE";
        }
        if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        return null;
    }

    /**
     * Scans the stream and decides on the first conclusive non-ASCII byte whether the content
     * looks like UTF-8.
     */
    private static boolean looksLikeUtf8(BufferedInputStream in) throws IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b < 0x80) {
                continue; // ASCII is identical in GBK and UTF-8: keep looking
            }
            if (b <= 0xBF) {
                return false; // a continuation byte with no lead byte: treat as GBK
            }
            if (b <= 0xDF) {
                // 2-byte pattern (0xC0-0xDF)(0x80-0xBF): also a plausible GBK character,
                // so it proves nothing; a malformed pair settles for GBK.
                if (!isContinuation(in.read())) {
                    return false;
                }
                continue;
            }
            int continuationBytes;
            if (b <= 0xEF) {
                continuationBytes = 2; // 3-byte sequence; can misfire on GBK, but rarely
            } else if (b <= 0xF4) {
                continuationBytes = 3; // 4-byte sequence (supplementary planes, e.g. emoji)
            } else {
                return false; // 0xF5-0xFF never occur in UTF-8
            }
            for (int i = 0; i < continuationBytes; i++) {
                if (!isContinuation(in.read())) {
                    return false; // malformed or truncated sequence
                }
            }
            return true;
        }
        return false; // end of file without any evidence of UTF-8
    }

    /** Tells whether {@code b} is a UTF-8 continuation byte; end of stream (-1) is not. */
    private static boolean isContinuation(int b) {
        return 0x80 <= b && b <= 0xBF;
    }
}