利用Levenshtein Distance (编辑距离)实现文档相似度计算

1.首先将Word文档(.docx)的后缀名改为.zip,再解压缩得到word/document.xml

    /**
     * Renames the file at {@code path} so that its extension becomes {@code .zip}.
     *
     * <p>Only the last path component is searched for an extension, so a dot in a
     * parent directory name never truncates the path. The rename is attempted only
     * when the source file exists and is not already the target. A failed rename is
     * reported on {@code System.err} and the method still returns normally, so
     * calling it again for a path that was already renamed is harmless.
     *
     * @param path path of the file to rename
     * @return the absolute path of the file with its extension removed; the
     *         {@code .zip} suffix is not part of the returned value
     */
    public static String reName(String path){
        File file=new File(path);
        String filename=file.getAbsolutePath();
        // Search for the extension in the file name only, never in the directories.
        int nameStart=filename.lastIndexOf(File.separatorChar)+1;
        int dot=filename.lastIndexOf('.');
        if(dot>=nameStart){
            filename=filename.substring(0,dot);
        }
        File target=new File(filename+".zip");
        // Skip the rename when there is nothing to move or the file already has the target name.
        if(file.exists()&&!file.getAbsoluteFile().equals(target)){
            if(!file.renameTo(target)){
                System.err.println("Failed to rename "+file+" to "+target);
            }
        }
        return filename;
    }
    
    /**
     * Unpacks the archive derived from {@code path} into a directory next to it.
     *
     * <p>The file is first renamed to {@code <base>.zip} via {@link #reName(String)},
     * then every entry is extracted below the directory {@code <base>}. Directory
     * entries are created and skipped rather than ending the extraction, and entries
     * whose resolved location falls outside the target directory are not written.
     * Failures while reading or writing are printed with their stack trace and are
     * not propagated. The elapsed time is printed to standard output.
     *
     * @param path path of the document to unpack
     * @return a {@code File} pointing at {@code <base>/word/document.xml}; its
     *         existence is not checked
     */
    public static File zipDeCompressing(String path){
        long startTime=System.currentTimeMillis();
        // reName renames the file as a side effect, so resolve the base path once
        // and reuse it instead of invoking reName repeatedly.
        String parent=reName(path);
        File targetDir=new File(parent);
        try(ZipInputStream zin=new ZipInputStream(
                new BufferedInputStream(new FileInputStream(parent+".zip")))){
            String targetRoot=targetDir.getCanonicalPath();
            byte[] buffer=new byte[8192];
            ZipEntry entry;
            while((entry=zin.getNextEntry())!=null){
                File outFile=new File(targetDir,entry.getName());
                // Reject entry names such as "../x" that would be written outside the target directory.
                String outPath=outFile.getCanonicalPath();
                if(!outPath.equals(targetRoot)&&!outPath.startsWith(targetRoot+File.separator)){
                    System.err.println("Skipping zip entry outside target directory: "+entry.getName());
                    continue;
                }
                if(entry.isDirectory()){
                    outFile.mkdirs();
                    continue;
                }
                File outDir=outFile.getParentFile();
                if(outDir!=null){
                    outDir.mkdirs();
                }
                // ZipInputStream.read reports -1 at the end of the current entry.
                try(BufferedOutputStream out=new BufferedOutputStream(new FileOutputStream(outFile))){
                    int read;
                    while((read=zin.read(buffer))!=-1){
                        out.write(buffer,0,read);
                    }
                }
                System.out.println(outFile+"解压成功");
            }
        }catch(Exception e){
            // Extraction is best-effort: report the failure and still return the expected location.
            e.printStackTrace();
        }
        long endTime=System.currentTimeMillis();
        System.out.println("耗费时间:"+(endTime-startTime)+"ms");
        return new File(parent+"/word/document.xml");
    }

2.利用Levenshtein Distance (编辑距离)计算文本相似度

/**
 * Returns the smallest of the three given values.
 *
 * @param one first candidate
 * @param two second candidate
 * @param three third candidate
 * @return the minimum of {@code one}, {@code two} and {@code three}
 */
private static int min(int one, int two, int three) {
    return Math.min(one, Math.min(two, three));
}
 
    /**
     * Computes the Levenshtein (edit) distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions needed
     * to turn one string into the other. Characters are compared as
     * {@code char} values.
     *
     * <p>Only two rows of the dynamic-programming table are kept, each sized by
     * the shorter input, instead of the full (n+1) x (m+1) matrix.
     *
     * @param str1 first string, must not be {@code null}
     * @param str2 second string, must not be {@code null}
     * @return the edit distance between {@code str1} and {@code str2}
     */
    public static int ld(String str1, String str2) {
        // The distance is symmetric, so put the shorter string along the row
        // to keep both row buffers as small as possible.
        String outer = str1;
        String inner = str2;
        if (inner.length() > outer.length()) {
            outer = str2;
            inner = str1;
        }
        int n = outer.length();
        int m = inner.length();
        if (m == 0) {
            // Turning a string into the empty string costs one deletion per character.
            return n;
        }
        int[] previous = new int[m + 1];
        int[] current = new int[m + 1];
        for (int j = 0; j <= m; j++) { // row 0: distance from the empty prefix
            previous[j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char ch1 = outer.charAt(i - 1);
            current[0] = i; // column 0: distance to the empty prefix
            for (int j = 1; j <= m; j++) {
                // Substitution is free when the characters match, otherwise it costs 1.
                int cost = (ch1 == inner.charAt(j - 1)) ? 0 : 1;
                // Cheapest of: cell above + 1, cell to the left + 1, diagonal cell + cost.
                current[j] = Math.min(
                        Math.min(previous[j] + 1, current[j - 1] + 1),
                        previous[j - 1] + cost);
            }
            // The row just filled becomes the "previous" row of the next iteration.
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[m];
    }
    /**
     * Computes a similarity score for two strings from their edit distance:
     * {@code 1 - ld(str1, str2) / max(len1, len2)}. Identical strings score 1.0
     * and strings with no characters in common at any position score 0.0.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the similarity score; 1.0 when both strings are empty, and the
     *         fixed fallback value 0.1 when either argument is {@code null}
     */
    public static double sim(String str1, String str2) {
        // Null arguments keep yielding the fallback 0.1 that callers already
        // receive for them, but through an explicit check rather than a catch-all.
        if (str1 == null || str2 == null) {
            return 0.1;
        }
        int longest = Math.max(str1.length(), str2.length());
        if (longest == 0) {
            // Two empty strings are identical; dividing 0 by 0 would give NaN.
            return 1.0;
        }
        double distance = (double) ld(str1, str2);
        return 1 - distance / (double) longest;
    }

 源码下载地址:http://download.csdn.net/detail/xiangrikuigt/9696149

posted @ 2016-11-28 10:01  eunicer  阅读(505)  评论(0)  编辑  收藏  举报