Java究竟怎么玩?

天地程序已定棋,人间大数待变局

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理
源码下载地址:http://download.csdn.net/source/414086

汉字编码是一项较为麻烦的事情,弄不好就会造出些谁都看不懂的乱码。比如我想做个针对汉字网站的爬虫系统,需要对非特定的页面进行数据解析处理,而此时我所访问的页面编码格式未知,如果不能正确处理页面编码,则很难获得我们理想中的数据。

通常这时候可能有几种选择:

一是根据response的ContentType获得,如果服务器支持的话此项中会返回charset数值,解析即可。但对不返回或者不支持的服务器则无能为力。

二是使用正则或自定义解析函数截取页面中‘charset=’后的数据,采取死钉战术,但万一采集的页面中没有此项或者此项有错,也就回天乏术。

三就是老老实实的解析全文,最后返回一个符合的编码格式。

此例中我演示了几种较常见编码的识别方法,通过统计编码为指定编码的或然率, 而后返回可能性最高的编码方式。在无法获得确切编码之时,这可说是一种唯一的选择。

这种识别方式主要是针对汉字编码而来,所以对应页面中的汉字数目越多,统计结果就越准确,反之则很难识别出正确结果。


Encoding.java
package org.loon.test.encoding;

/**
 * <p>
 * Title: LoonFramework
 * </p>
 * <p>
 * Description:编码基本类型集合
 * </p>
 * <p>
 * Copyright: Copyright (c) 2008
 * </p>
 * <p>
 * Company: LoonFramework
 * </p>
 * <p>
 * License: 
http://www.apache.org/licenses/LICENSE-2.0
 * </p>
 * 
 * 
@author chenpeng
 * @email:ceponline@yahoo.com.cn
 * 
@version 0.1
 
*/

public class Encoding {

    
// 支持的字符格式
    public static int GB2312 = 0;

    
public static int GBK = 1;
    
    
public static int BIG5 = 2;

    
public static int UTF8 = 3;

    
public static int UNICODE = 4;

    
public static int EUC_KR = 5;

    
public static int SJIS = 6;

    
public static int EUC_JP = 7;

    
public static int ASCII = 8;

    
public static int UNKNOWN = 9;

    
public static int TOTALT = 10;

    
public final static int SIMP = 0;

    
public final static int TRAD = 1;

    
// 解析名称用
    public static String[] javaname;

    
// 编码用
    public static String[] nicename;

    
// 应用于html中的字符集
    public static String[] htmlname;

    
public Encoding() {
        javaname 
= new String[TOTALT];
        nicename 
= new String[TOTALT];
        htmlname 
= new String[TOTALT];
        javaname[GB2312] 
= "GB2312";
        javaname[GBK] 
= "GBK";
        javaname[BIG5] 
= "BIG5";
        javaname[UTF8] 
= "UTF8";
        javaname[UNICODE] 
= "Unicode";
        javaname[EUC_KR] 
= "EUC_KR";
        javaname[SJIS] 
= "SJIS";
        javaname[EUC_JP] 
= "EUC_JP";
        javaname[ASCII] 
= "ASCII";
        javaname[UNKNOWN] 
= "ISO8859_1";

        
// 分配编码名称
        htmlname[GB2312] = "GB2312";
        htmlname[GBK] 
= "GBK";
        htmlname[BIG5] 
= "BIG5";
        htmlname[UTF8] 
= "UTF-8";
        htmlname[UNICODE] 
= "UTF-16";
        htmlname[EUC_KR] 
= "EUC-KR";
        htmlname[SJIS] 
= "Shift_JIS";
        htmlname[EUC_JP] 
= "EUC-JP";
        htmlname[ASCII] 
= "ASCII";
        htmlname[UNKNOWN] 
= "ISO8859-1";

        
// 分配可读名称
        nicename[GB2312] = "GB-2312";
        nicename[GBK] 
= "GBK";
        nicename[BIG5] 
= "Big5";
        nicename[UTF8] 
= "UTF-8";
        nicename[UNICODE] 
= "Unicode";
        nicename[EUC_KR] 
= "EUC-KR";
        nicename[SJIS] 
= "Shift-JIS";
        nicename[EUC_JP] 
= "EUC-JP";
        nicename[ASCII] 
= "ASCII";
        nicename[UNKNOWN] 
= "UNKNOWN";

    }


    
public String toEncoding(final int type) {
        
return (javaname[type] + "," + nicename[type] + "," + htmlname[type])
                .intern();
    }



}


Encode,java(省略,见源码)

ParseEncoding.java
package org.loon.test.encoding;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * <p>
 * Title: LoonFramework
 * </p>
 * <p>
 * Description:
 * </p>
 * <p>
 * Copyright: Copyright (c) 2008
 * </p>
 * <p>
 * Company: LoonFramework
 * </p>
 * <p>
 * License: 
http://www.apache.org/licenses/LICENSE-2.0
 * </p>
 * 
 * 
@author chenpeng
 * @email:ceponline@yahoo.com.cn
 * 
@version 0.1
 
*/

public class ParseEncoding extends Encode {

    
public ParseEncoding() {
        
super();
        GB2312format 
= new int[94][94];
        GBKformat 
= new int[126][191];
        Big5format 
= new int[94][158];
        EUC_KRformat 
= new int[94][94];
        JPformat 
= new int[94][94];

        
// 初始化编码格式
        init();
    }


    
public String getEncoding(final String path) {
        
return check(getEncodeValue(path));
    }


    
public String getEncoding(final InputStream in) {
        
return check(getEncodeValue(in));
    }


    
public String getEncoding(final byte[] buffer) {
        
return check(getEncodeValue(buffer));
    }


    
public String getEncoding(final URL url) {
        
return check(getEncodeValue(url));
    }


    
private String check(final int result) {
        
if (result == -1{
            
return nicename[UNKNOWN];
        }

        
return nicename[result];
    }


    
/**
     * 解析指定字符串路径编码所用格式
     * 
     * 
@param path
     * 
@return
     
*/

    
private int getEncodeValue(String path) {
        
int express = UNKNOWN;
        
if (path.startsWith("http://")) {
            
try {
                express 
= getEncodeValue(new URL(path));
            }
 catch (MalformedURLException e) {
                express 
= -1;
            }

        }
 else {
            express 
= getEncodeValue(new File(path));
        }

        
return express;
    }


    
/**
     * 
     * 解析指定InputStream所用编码,返回或然率最高的编码类型数值
     * 
     * 
@param in
     * 
@return
     
*/

    
public int getEncodeValue(InputStream in) {
        
byte[] rawtext = new byte[8192];
        
int bytesread = 0, byteoffset = 0;
        
int express = UNKNOWN;
        InputStream stream 
= in;
        
try {
            
while ((bytesread = stream.read(rawtext, byteoffset, rawtext.length
                    
- byteoffset)) > 0{
                byteoffset 
+= bytesread;
            }

            ;
            stream.close();
            express 
= getEncodeValue(rawtext);
        }
 catch (Exception e) {
            express 
= -1;
        }

        
return express;
    }


    
/**
     * 解析指定url下数据所用编码,返回或然率最高的编码类型数值
     * 
     * 
@param url
     * 
@return
     
*/

    
public int getEncodeValue(URL url) {

        InputStream stream;
        
try {
            stream 
= url.openStream();
        }
 catch (IOException e) {
            stream 
= null;
        }


        
return getEncodeValue(stream);
    }


    
/**
     * 解析指定file所用编码,返回或然率最高的编码类型数值
     * 
     * 
@param file
     * 
@return
     
*/

    
public int getEncodeValue(File file) {
        
byte[] buffer;
        
try {
            buffer 
= read(new FileInputStream(file));
        }
 catch (FileNotFoundException e) {
            buffer 
= null;
        }

        
return getEncodeValue(buffer);
    }


    
/**
     * 将inputstream转为byte[]
     * 
     * 
@param inputStream
     * 
@return
     
*/

    
private final byte[] read(final InputStream inputStream) {
        
byte[] arrayByte = null;
        ByteArrayOutputStream byteArrayOutputStream 
= new ByteArrayOutputStream();
        
byte[] bytes = new byte[8192];
        
try {
            bytes 
= new byte[inputStream.available()];
            
int read;
            
while ((read = inputStream.read(bytes)) >= 0{
                byteArrayOutputStream.write(bytes, 
0, read);
            }

            arrayByte 
= byteArrayOutputStream.toByteArray();
        }
 catch (IOException e) {
            
return null;
        }

        
return arrayByte;
    }


    
/**
     * 解析指定byte[]所用编码,返回或然率最高的数值类型
     * 
     * 
@param content
     * 
@return
     
*/

    
public int getEncodeValue(byte[] content) {
        
if (content == null)
            
return -1;
        
int[] scores;
        
int index, maxscore = 0;
        
int encoding = UNKNOWN;
        scores 
= new int[TOTALT];
        
// 分配或然率
        scores[GB2312] = gb2312probability(content);
        scores[GBK] 
= gbkprobability(content);
        scores[BIG5] 
= big5probability(content);
        scores[UTF8] 
= utf8probability(content);
        scores[UNICODE] 
= utf16probability(content);
        scores[EUC_KR] 
= euc_krprobability(content);
        scores[ASCII] 
= asciiprobability(content);
        scores[SJIS] 
= sjisprobability(content);
        scores[EUC_JP] 
= euc_jpprobability(content);
        scores[UNKNOWN] 
= 0;

        
// 概率比较
        for (index = 0; index < TOTALT; index++{
            
if (scores[index] > maxscore) {
                
// 索引
                encoding = index;
                
// 最大几率
                maxscore = scores[index];
            }

        }

        
// 返回或然率大于50%的数据
        if (maxscore <= 50{
            encoding 
= UNKNOWN;
        }

        
return encoding;
    }


    
/**
     * gb2312数据或然率计算
     * 
     * 
@param content
     * 
@return
     
*/

    
private int gb2312probability(byte[] content) {
        
int i, rawtextlen = 0;

        
int dbchars = 1, gbchars = 1;
        
long gbformat = 0, totalformat = 1;
        
float rangeval = 0, formatval = 0;
        
int row, column;

        
// 检查是否在亚洲汉字范围内
        rawtextlen = content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
// 汉字GB码由两个字节组成,每个字节的范围是0xA1 ~ 0xFE
                if ((byte0xA1 <= content[i] && content[i] <= (byte0xF7
                        
&& (byte0xA1 <= content[i + 1]
                        
&& content[i + 1<= (byte0xFE{
                    gbchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0xA1;
                    column 
= content[i + 1+ 256 - 0xA1;
                    
if (GB2312format[row][column] != 0{
                        gbformat 
+= GB2312format[row][column];
                    }
 else if (15 <= row && row < 55{
                        
// 在gb编码范围
                        gbformat += 200;
                    }


                }

                i
++;
            }

        }

        rangeval 
= 50 * ((float) gbchars / (float) dbchars);
        formatval 
= 50 * ((float) gbformat / (float) totalformat);

        
return (int) (rangeval + formatval);
    }


    
/**
     * gb2312或然率计算
     * 
     * 
@param content
     * 
@return
     
*/

    
private int gbkprobability(byte[] content) {
        
int i, rawtextlen = 0;

        
int dbchars = 1, gbchars = 1;
        
long gbformat = 0, totalformat = 1;
        
float rangeval = 0, formatval = 0;
        
int row, column;
        rawtextlen 
= content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
if ((byte0xA1 <= content[i] && content[i] <= (byte0xF7
                        
&& // gb范围
                        (byte0xA1 <= content[i + 1]
                        
&& content[i + 1<= (byte0xFE{
                    gbchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0xA1;
                    column 
= content[i + 1+ 256 - 0xA1;
                    
if (GB2312format[row][column] != 0{
                        gbformat 
+= GB2312format[row][column];
                    }
 else if (15 <= row && row < 55{
                        gbformat 
+= 200;
                    }


                }
 else if ((byte0x81 <= content[i]
                        
&& content[i] <= (byte0xFE && // gb扩展区域
                        (((byte0x80 <= content[i + 1&& content[i + 1<= (byte0xFE|| ((byte0x40 <= content[i + 1&& content[i + 1<= (byte0x7E))) {
                    gbchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0x81;
                    
if (0x40 <= content[i + 1&& content[i + 1<= 0x7E{
                        column 
= content[i + 1- 0x40;
                    }
 else {
                        column 
= content[i + 1+ 256 - 0x40;
                    }

                    
if (GBKformat[row][column] != 0{
                        gbformat 
+= GBKformat[row][column];
                    }

                }

                i
++;
            }

        }

        rangeval 
= 50 * ((float) gbchars / (float) dbchars);
        formatval 
= 50 * ((float) gbformat / (float) totalformat);
        
return (int) (rangeval + formatval) - 1;
    }


    
/**
     * 解析为big5的或然率
     * 
     * 
@param content
     * 
@return
     
*/

    
private int big5probability(byte[] content) {
        
int i, rawtextlen = 0;
        
int dbchars = 1, bfchars = 1;
        
float rangeval = 0, formatval = 0;
        
long bfformat = 0, totalformat = 1;
        
int row, column;
        rawtextlen 
= content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
if ((byte0xA1 <= content[i]
                        
&& content[i] <= (byte0xF9
                        
&& (((byte0x40 <= content[i + 1&& content[i + 1<= (byte0x7E|| ((byte0xA1 <= content[i + 1&& content[i + 1<= (byte0xFE))) {
                    bfchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0xA1;
                    
if (0x40 <= content[i + 1&& content[i + 1<= 0x7E{
                        column 
= content[i + 1- 0x40;
                    }
 else {
                        column 
= content[i + 1+ 256 - 0x61;
                    }

                    
if (Big5format[row][column] != 0{
                        bfformat 
+= Big5format[row][column];
                    }
 else if (3 <= row && row <= 37{
                        bfformat 
+= 200;
                    }

                }

                i
++;
            }

        }

        rangeval 
= 50 * ((float) bfchars / (float) dbchars);
        formatval 
= 50 * ((float) bfformat / (float) totalformat);

        
return (int) (rangeval + formatval);
    }


    
/**
     * 在utf-8中的或然率
     * 
     * 
@param content
     * 
@return
     
*/

    
private int utf8probability(byte[] content) {
        
int score = 0;
        
int i, rawtextlen = 0;
        
int goodbytes = 0, asciibytes = 0;
        
// 检查是否为汉字可接受范围
        rawtextlen = content.length;
        
for (i = 0; i < rawtextlen; i++{
            
if ((content[i] & (byte0x7F== content[i]) {
                asciibytes
++;
            }
 else if (-64 <= content[i] && content[i] <= -33
                    
&& i + 1 < rawtextlen && -128 <= content[i + 1]
                    
&& content[i + 1<= -65{
                goodbytes 
+= 2;
                i
++;
            }
 else if (-32 <= content[i] && content[i] <= -17
                    
&& i + 2 < rawtextlen && -128 <= content[i + 1]
                    
&& content[i + 1<= -65 && -128 <= content[i + 2]
                    
&& content[i + 2<= -65{
                goodbytes 
+= 3;
                i 
+= 2;
            }

        }


        
if (asciibytes == rawtextlen) {
            
return 0;
        }


        score 
= (int) (100 * ((float) goodbytes / (float) (rawtextlen - asciibytes)));
        
// 如果不高于98则减少到零
        if (score > 98{
            
return score;
        }
 else if (score > 95 && goodbytes > 30{
            
return score;
        }
 else {
            
return 0;
        }


    }


    
/**
     * 检查为utf-16的或然率
     * 
     * 
@param content
     * 
@return
     
*/

    
private int utf16probability(byte[] content) {

        
if (content.length > 1
                
&& ((byte0xFE == content[0&& (byte0xFF == content[1])
                
|| ((byte0xFF == content[0&& (byte0xFE == content[1])) {
            
return 100;
        }

        
return 0;
    }


    
/**
     * 检查为ascii的或然率
     * 
     * 
@param content
     * 
@return
     
*/

    
private int asciiprobability(byte[] content) {
        
int score = 75;
        
int i, rawtextlen;

        rawtextlen 
= content.length;

        
for (i = 0; i < rawtextlen; i++{
            
if (content[i] < 0{
                score 
= score - 5;
            }
 else if (content[i] == (byte0x1B// ESC (used by ISO 2022)
                score = score - 5;
            }

            
if (score <= 0{
                
return 0;
            }

        }

        
return score;
    }


    
/**
     * 检查为euc_kr的或然率
     * 
     * 
@param content
     * 
@return
     
*/

    
private int euc_krprobability(byte[] content) {
        
int i, rawtextlen = 0;

        
int dbchars = 1, krchars = 1;
        
long krformat = 0, totalformat = 1;
        
float rangeval = 0, formatval = 0;
        
int row, column;
        rawtextlen 
= content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
if ((byte0xA1 <= content[i] && content[i] <= (byte0xFE
                        
&& (byte0xA1 <= content[i + 1]
                        
&& content[i + 1<= (byte0xFE{
                    krchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0xA1;
                    column 
= content[i + 1+ 256 - 0xA1;
                    
if (EUC_KRformat[row][column] != 0{
                        krformat 
+= EUC_KRformat[row][column];
                    }
 else if (15 <= row && row < 55{
                        krformat 
+= 0;
                    }


                }

                i
++;
            }

        }

        rangeval 
= 50 * ((float) krchars / (float) dbchars);
        formatval 
= 50 * ((float) krformat / (float) totalformat);

        
return (int) (rangeval + formatval);
    }


    
private int euc_jpprobability(byte[] content) {
        
int i, rawtextlen = 0;

        
int dbchars = 1, jpchars = 1;
        
long jpformat = 0, totalformat = 1;
        
float rangeval = 0, formatval = 0;
        
int row, column;

        rawtextlen 
= content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
if ((byte0xA1 <= content[i] && content[i] <= (byte0xFE
                        
&& (byte0xA1 <= content[i + 1]
                        
&& content[i + 1<= (byte0xFE{
                    jpchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256 - 0xA1;
                    column 
= content[i + 1+ 256 - 0xA1;
                    
if (JPformat[row][column] != 0{
                        jpformat 
+= JPformat[row][column];
                    }
 else if (15 <= row && row < 55{
                        jpformat 
+= 0;
                    }


                }

                i
++;
            }

        }

        rangeval 
= 50 * ((float) jpchars / (float) dbchars);
        formatval 
= 50 * ((float) jpformat / (float) totalformat);

        
return (int) (rangeval + formatval);
    }


    
private int sjisprobability(byte[] content) {
        
int i, rawtextlen = 0;

        
int dbchars = 1, jpchars = 1;
        
long jpformat = 0, totalformat = 1;
        
float rangeval = 0, formatval = 0;
        
int row, column, adjust;

        rawtextlen 
= content.length;
        
for (i = 0; i < rawtextlen - 1; i++{
            
if (content[i] >= 0{
            }
 else {
                dbchars
++;
                
if (i + 1 < content.length
                        
&& (((byte0x81 <= content[i] && content[i] <= (byte0x9F|| ((byte0xE0 <= content[i] && content[i] <= (byte0xEF))
                        
&& (((byte0x40 <= content[i + 1&& content[i + 1<= (byte0x7E|| ((byte0x80 <= content[i + 1&& content[i + 1<= (byte0xFC))) {
                    jpchars
++;
                    totalformat 
+= 500;
                    row 
= content[i] + 256;
                    column 
= content[i + 1+ 256;
                    
if (column < 0x9f{
                        adjust 
= 1;
                        
if (column > 0x7f{
                            column 
-= 0x20;
                        }
 else {
                            column 
-= 0x19;
                        }

                    }
 else {
                        adjust 
= 0;
                        column 
-= 0x7e;
                    }

                    
if (row < 0xa0{
                        row 
= ((row - 0x70<< 1- adjust;
                    }
 else {
                        row 
= ((row - 0xb0<< 1- adjust;
                    }


                    row 
-= 0x20;
                    column 
= 0x20;
                    
if (row < JPformat.length && column < JPformat[row].length
                            
&& JPformat[row][column] != 0{
                        jpformat 
+= JPformat[row][column];
                    }

                    i
++;
                }
 else if ((byte0xA1 <= content[i]
                        
&& content[i] <= (byte0xDF{
                }


            }

        }

        rangeval 
= 50 * ((float) jpchars / (float) dbchars);
        formatval 
= 50 * ((float) jpformat / (float) totalformat);

        
return (int) (rangeval + formatval) - 1;
    }


}



EncodingTest.java
package org.loon.test.encoding;
/**
 * <p>Title: LoonFramework</p>
 * <p>Description:</p>
 * <p>Copyright: Copyright (c) 2008</p>
 * <p>Company: LoonFramework</p>
 * <p>License: 
http://www.apache.org/licenses/LICENSE-2.0</p>
 * 
@author chenpeng  
 * @email:ceponline@yahoo.com.cn 
 * 
@version 0.1
 
*/

public class EncodingTest {
    
public static void main(String argc[]) {
        ParseEncoding parse;

        parse 
= new ParseEncoding();
        
         System.out.println(
"中国大陆:");
         System.out.println(
"测试字符串,编码格式="+parse.getEncoding("百度".getBytes()));
         System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.baidu.com"));
         System.out.println();
         System.out.println(
"中国台湾:");
         System.out.println(
"测试字符串,编码格式="+parse.getEncoding("い地チ瓣".getBytes()));
         System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://tw.yahoo.com/"));
         System.out.println(
"测试站点(繁体字,UTF编码),编码格式="+parse.getEncoding("http://www.javaworld.com.tw/jute"));
         System.out.println();
         System.out.println(
"日本:");
         System.out.println(
"测试字符串,编码格式="+parse.getEncoding("その機能".getBytes()));
         System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.4gamer.net"));
         System.out.println();
         System.out.println(
"自称蚩尤后代那群……:");
         System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.easyjava.co.kr/"));
        
    }

}



输出结果:



posted on 2008-04-15 01:07  cping  阅读(2237)  评论(0编辑  收藏  举报