检测编码

/**
 * Guesses the text encoding of a file.
 *
 * <p>A byte order mark in the first three bytes decides the result directly.
 * Without one, up to the first 4096 bytes are scanned for the byte sequences
 * that encode the character U+7684 ("的") in GBK, UTF-8, UTF-16LE, UTF-16BE
 * and Big5; the encoding whose sequence occurs strictly more often than every
 * other one wins.
 *
 * <p>{@link Encoding#GBK} is returned when the file is shorter than three
 * bytes, when no candidate is a strict winner, or when reading fails (the
 * failure is logged, not rethrown). The file pointer is moved by this call
 * and is not restored.
 *
 * @param file the file to inspect
 * @return the detected encoding, or {@code Encoding.GBK} as the fallback
 */
public static Encoding determineEncoding(RandomAccessFile file) {
	Encoding enc = Encoding.GBK;
	try {
		file.seek(0);
		if (file.length() < 3) {
			return enc;
		}

		// Byte order mark. readFully guarantees all three bytes are present;
		// a plain read() may legally return fewer.
		byte[] bom = new byte[3];
		file.readFully(bom);
		int b0 = bom[0] & 0xFF;
		int b1 = bom[1] & 0xFF;
		int b2 = bom[2] & 0xFF;

		if (b0 == 0xFF && b1 == 0xFE) {
			enc = Encoding.UTF16LE;
		} else if (b0 == 0xFE && b1 == 0xFF) {
			enc = Encoding.UTF16BE;
		} else if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
			enc = Encoding.UTF8;
		} else {
			// No BOM: count occurrences of U+7684 in each candidate encoding.
			int gbkCount = 0;
			int big5Count = 0;
			int utf16leCount = 0;
			int utf16beCount = 0;
			int utf8Count = 0;

			// Read up to 4096 bytes, tracking how many were actually read so
			// that only real file content is scanned.
			file.seek(0);
			byte[] bs = new byte[4096];
			int n = 0;
			while (n < bs.length) {
				int r = file.read(bs, n, bs.length - n);
				if (r < 0) {
					break; // end of file
				}
				n += r;
			}

			// Every pattern needs at least two bytes; the three-byte UTF-8
			// pattern additionally checks that a third byte is available.
			for (int i = 0; i + 1 < n; ++i) {
				int c0 = bs[i] & 0xFF;
				int c1 = bs[i + 1] & 0xFF;
				if (c0 == 0xB5 && c1 == 0xC4) {
					++gbkCount;
					++i;
				} else if (c0 == 0xE7 && c1 == 0x9A && i + 2 < n && (bs[i + 2] & 0xFF) == 0x84) {
					++utf8Count;
					i += 2;
				} else if (c0 == 0x84 && c1 == 0x76) {
					++utf16leCount;
					++i;
				} else if (c0 == 0x76 && c1 == 0x84) {
					++utf16beCount;
					++i;
				} else if (c0 == 0xAA && c1 == 0xBA) {
					++big5Count;
					++i;
				}
			}

			// A candidate must beat all four others strictly; on a tie the
			// GBK default is kept.
			if (gbkCount > utf8Count && gbkCount > big5Count
					&& gbkCount > utf16leCount && gbkCount > utf16beCount) {
				enc = Encoding.GBK;
			} else if (utf8Count > gbkCount && utf8Count > big5Count
					&& utf8Count > utf16leCount && utf8Count > utf16beCount) {
				enc = Encoding.UTF8;
			} else if (utf16leCount > gbkCount && utf16leCount > big5Count
					&& utf16leCount > utf8Count && utf16leCount > utf16beCount) {
				enc = Encoding.UTF16LE;
			} else if (utf16beCount > gbkCount && utf16beCount > big5Count
					&& utf16beCount > utf8Count && utf16beCount > utf16leCount) {
				enc = Encoding.UTF16BE;
			} else if (big5Count > gbkCount && big5Count > utf8Count
					&& big5Count > utf16leCount && big5Count > utf16beCount) {
				enc = Encoding.BIG5;
			}
		}
	} catch (Exception ex) {
		// Best effort: keep whatever was decided so far, but record the cause.
		Log.e("File ERROR", "encoding detection failed. " + ex);
	}
	return enc;
}
	


 

/**
 * Text encodings that can be reported for a file, each paired with its
 * charset name.
 */
public enum Encoding {
	GBK("GBK"),
	BIG5("BIG5"),
	UTF8("UTF-8"),
	UTF16BE("UTF-16BE"),
	UTF16LE("UTF-16LE"),
	UNKNOWN("UNKNOWN");

	/** Charset name passed to {@code String.getBytes(String)}. */
	private final String name;

	/**
	 * Number of bytes produced when U+4E2D is encoded with this charset,
	 * or 0 when the runtime does not support the charset.
	 */
	private final int maxCharLength;

	private Encoding(String name) {
		this.name = name;
		int length;
		try {
			// Unicode escape instead of a literal character so the result
			// does not depend on the encoding the source file is compiled with.
			length = "\u4e2d".getBytes(name).length;
		} catch (java.io.UnsupportedEncodingException e) {
			// Charset not available on this runtime (always the case for
			// the UNKNOWN placeholder): report 0.
			length = 0;
		}
		this.maxCharLength = length;
	}

	/**
	 * Returns the charset name of this encoding.
	 *
	 * @return the charset name, e.g. {@code "UTF-8"}
	 */
	public String getName() {
		return name;
	}

	/**
	 * Returns the encoded byte length of the probe character U+4E2D.
	 *
	 * @return the byte count, or 0 if the charset is unsupported
	 */
	public int getMaxCharLength() {
		return maxCharLength;
	}
}


 

 

posted @ 2013-04-07 20:12  javawebsoa  Views(195)  Comments(0)  Edit  收藏  举报