利用Levenshtein Distance (编辑距离)实现文档相似度计算
1.首先将word文档(.docx)的后缀名改为.zip,然后将其解压缩,得到其中的word/document.xml
/**
 * Renames the file at {@code path} so that its extension becomes {@code .zip}.
 *
 * <p>Only a dot inside the last path component is treated as the start of the
 * extension, so a dot in a parent directory name is never mistaken for one.
 * The rename is best-effort: if it fails (for example because the file has
 * already been renamed) the method still returns normally.
 *
 * @param path path of the file to rename
 * @return the absolute path of the file with its extension removed
 *         (i.e. the new file name without the trailing {@code .zip})
 */
public static String reName(String path) {
    File file = new File(path);
    String filename = file.getAbsolutePath();
    int lastSeparator = filename.lastIndexOf(File.separatorChar);
    int lastDot = filename.lastIndexOf('.');
    if (lastDot > lastSeparator) {
        filename = filename.substring(0, lastDot);
    }
    // The result is deliberately ignored: callers only need the base name, and
    // the rename legitimately fails when the file is already named *.zip.
    file.renameTo(new File(filename + ".zip"));
    return filename;
}

/**
 * Renames the given Word document to {@code .zip} and extracts it into a
 * directory that has the same name as the document without its extension.
 *
 * <p>Extraction is best-effort: any failure is printed to standard error and
 * the method still returns. Directory entries are created and skipped instead
 * of terminating the extraction, and entries whose names would resolve outside
 * the target directory are rejected.
 *
 * @param path path of the document to extract
 * @return the expected location of {@code word/document.xml} inside the
 *         extraction directory; the file may not exist if extraction failed
 */
public static File zipDeCompressing(String path) {
    long startTime = System.currentTimeMillis();
    // Rename exactly once and reuse the resulting base path everywhere.
    String basePath = reName(path);
    File targetDir = new File(basePath);
    try (ZipInputStream zipIn = new ZipInputStream(
            new BufferedInputStream(new FileInputStream(basePath + ".zip")))) {
        String rootPath = targetDir.getCanonicalPath();
        byte[] buffer = new byte[8192];
        ZipEntry entry;
        while ((entry = zipIn.getNextEntry()) != null) {
            File outFile = new File(targetDir, entry.getName());
            // Guard against "zip slip": an entry name containing ../ must not
            // be allowed to write outside the extraction directory.
            String outPath = outFile.getCanonicalPath();
            if (!outPath.equals(rootPath) && !outPath.startsWith(rootPath + File.separator)) {
                throw new java.io.IOException(
                        "Zip entry escapes target directory: " + entry.getName());
            }
            if (entry.isDirectory()) {
                outFile.mkdirs();
                continue;
            }
            File parentDir = outFile.getParentFile();
            if (parentDir != null) {
                parentDir.mkdirs();
            }
            try (BufferedOutputStream out =
                    new BufferedOutputStream(new FileOutputStream(outFile))) {
                int read;
                while ((read = zipIn.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            }
            System.out.println(outFile + "解压成功");
        }
    } catch (java.io.IOException | RuntimeException e) {
        // Keep the original best-effort contract: report and carry on.
        e.printStackTrace();
    }
    long endTime = System.currentTimeMillis();
    System.out.println("耗费时间:" + (endTime - startTime) + "ms");
    return new File(basePath + "/word/document.xml");
}
2.利用Levenshtein Distance (编辑距离)计算文本相似度
private static int min(int one, int two, int three) { int min = one; if (two < min) { min = two; } if (three < min) { min = three; } return min; } public static int ld(String str1, String str2) { int d[][]; // 矩阵 int n = str1.length(); int m = str2.length(); int i; // 遍历str1的 int j; // 遍历str2的 char ch1; // str1的 char ch2; // str2的 int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1 if (n == 0) { return m; } if (m == 0) { return n; } d = new int[n + 1][m + 1]; for (i = 0; i <= n; i++) { // 初始化第一列 d[i][0] = i; } for (j = 0; j <= m; j++) { // 初始化第一行 d[0][j] = j; } for (i = 1; i <= n; i++) { // 遍历str1 ch1 = str1.charAt(i - 1); // 去匹配str2 for (j = 1; j <= m; j++) { ch2 = str2.charAt(j - 1); if (ch1 == ch2) { temp = 0; } else { temp = 1; } // 左边+1,上边+1, 左上角+temp取最小 d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]+ temp); } } return d[n][m]; } public static double sim(String str1, String str2) { try { double ld = (double)ld(str1, str2); return (1-ld/(double)Math.max(str1.length(), str2.length())); } catch (Exception e) { return 0.1; } }
源码下载地址:http://download.csdn.net/detail/xiangrikuigt/9696149