判断文件的字符编码

首先，带有 BOM（字节顺序标记）的文本文件，是通过文件开头的两到三个字节来标识其编码格式的。定义如下：

  ANSI：　　　　　　　　无格式定义；
  Unicode：　　　　　　前两个字节为FFFE；
  Unicode big endian：　前两字节为FEFF；　
  UTF-8：　　　　　　　前三个字节为EFBBBF（带BOM时）；　

  知道了各种编码格式的区别，写代码就容易了.

 1 package charset;
 2 
 3 import java.io.BufferedInputStream;
 4 import java.io.File;
 5 import java.io.FileInputStream;
 6 
 7 public class demo {
 8     public static void main(String[] args) {
 9         String txtCharset=get_charset(new File("e:/1.TXT"));
10         System.out.println(txtCharset);
11     }
12     public static String get_charset(File file) {
13         String charset = "GBK";
14         byte[] first3Bytes = new byte[3];
15         try {
16             boolean checked=false;
17             BufferedInputStream bis = new BufferedInputStream(
18                     new FileInputStream(file));
19             bis.mark(0);
20             int read = bis.read(first3Bytes, 0, 3);
21             if (read == -1)
22                 return charset;
23             if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
24                 charset = "UTF-16LE";
25                 checked = true;
26             } else if (first3Bytes[0] == (byte) 0xFE
27                     && first3Bytes[1] == (byte) 0xFF) {
28                 charset = "UTF-16BE";
29                 checked = true;
30             } else if (first3Bytes[0] == (byte) 0xEF
31                     && first3Bytes[1] == (byte) 0xBB
32                     && first3Bytes[2] == (byte) 0xBF) {
33                 charset = "UTF-8";
34                 checked = true;
35             }
36             bis.reset();
37             if (!checked) {
38                 // int len = 0;
39                 int loc = 0;
40 
41                 while ((read = bis.read()) != -1) {
42                     loc++;
43                     if (read >= 0xF0)
44                         break;
45                     if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的，也算是GBK
46                         break;
47                     if (0xC0 <= read && read <= 0xDF) {
48                         read = bis.read();
49                         if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
50                                                             // (0x80
51                                                             // - 0xBF),也可能在GB编码内
52                             continue;
53                         else
54                             break;
55                     } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错，但是几率较小
56                         read = bis.read();
57                         if (0x80 <= read && read <= 0xBF) {
58                             read = bis.read();
59                             if (0x80 <= read && read <= 0xBF) {
60                                 charset = "UTF-8";
61                                 break;
62                             } else
63                                 break;
64                         } else
65                             break;
66                     }
67                 }
68                 // System.out.println( loc + " " + Integer.toHexString( read )
69                 // );
70             }
71 
72             bis.close();
73         } catch (Exception e) {
74             e.printStackTrace();
75         }
76 
77         return charset;
78     }
79 }

posted on 2014-08-20 18:07 腾飞工作室阅读(440) 评论(0) 编辑收藏举报