Paper Plagiarism Check

Code

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.io.*;
import java.text.DecimalFormat;
import java.util.*;

public class Reader {
    // Cosine similarity of two word-frequency vectors
    public static double calculateCosSimilarity(Map<String, Integer> vec1, Map<String, Integer> vec2) {
        double dotProduct = 0.0;
        double norm1 = 0.0;
        double norm2 = 0.0;

        // Dot product over shared words, plus the squared norm of the first vector
        for (Map.Entry<String, Integer> entry : vec1.entrySet()) {
            String word = entry.getKey();
            int count = entry.getValue();
            dotProduct += count * vec2.getOrDefault(word, 0);
            norm1 += Math.pow(count, 2);
        }

        // Squared norm of the second vector
        for (Map.Entry<String, Integer> entry : vec2.entrySet()) {
            int count = entry.getValue();
            norm2 += Math.pow(count, 2);
        }

        // Avoid division by zero when either text produced no words
        if (norm1 == 0.0 || norm2 == 0.0) {
            return 0.0;
        }
        return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
    }

    public static void main(String[] args) throws IOException {
        Scanner sc = new Scanner(System.in);
        String filePath1 = sc.next();   // path of the original paper
        String filePath2 = sc.next();   // path of the paper to be checked
        String filePath3 = sc.next();   // path of the file the result is written to
        sc.close();

        // Read both papers into memory
        StringBuilder content1 = new StringBuilder();
        StringBuilder content2 = new StringBuilder();
        String line;
        BufferedReader reader = new BufferedReader(new FileReader(filePath1));
        while ((line = reader.readLine()) != null) {
            content1.append(line);
            content1.append(System.lineSeparator());
        }
        reader.close();
        reader = new BufferedReader(new FileReader(filePath2));
        while ((line = reader.readLine()) != null) {
            content2.append(line);
            content2.append(System.lineSeparator());
        }
        reader.close();
        String paper1 = content1.toString();
        String paper2 = content2.toString();

        // Segment both papers with the HanLP word segmentation toolkit
        List<Term> segList1 = HanLP.segment(paper1);
        List<Term> segList2 = HanLP.segment(paper2);

        // Count how often each token occurs in the first paper
        ArrayList<String> list1 = new ArrayList<>();
        Map<String, Integer> wordCountMap1 = new HashMap<>();
        for (int i = 0; i < segList1.size(); i++) {
            list1.add(segList1.get(i).word);
        }
        for (String word : list1) {
            wordCountMap1.put(word, wordCountMap1.getOrDefault(word, 0) + 1);
        }

        // Count how often each token occurs in the second paper
        ArrayList<String> list2 = new ArrayList<>();
        Map<String, Integer> wordCountMap2 = new HashMap<>();
        for (int i = 0; i < segList2.size(); i++) {
            list2.add(segList2.get(i).word);
        }
        for (String word : list2) {
            wordCountMap2.put(word, wordCountMap2.getOrDefault(word, 0) + 1);
        }

        // Compute the cosine similarity and write it to the output file, rounded to two decimals
        double rate = calculateCosSimilarity(wordCountMap1, wordCountMap2);
        System.out.println(new DecimalFormat("0.00").format(rate));
        BufferedWriter out = new BufferedWriter(new FileWriter(filePath3));
        out.write(new DecimalFormat("0.00").format(rate));
        out.close();
    }

}
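
As a quick sanity check on the cosine metric, here is a minimal, self-contained sketch. The CosSimilarityDemo class and its toy word counts are made up for illustration and are not part of the submitted program; it applies the same formula as calculateCosSimilarity, dot(v1, v2) / (|v1| * |v2|), to two tiny hand-built frequency maps:

import java.util.HashMap;
import java.util.Map;

public class CosSimilarityDemo {
    public static void main(String[] args) {
        // Two toy word-frequency vectors; in the real program these come from HanLP segmentation
        Map<String, Integer> vec1 = new HashMap<>();
        vec1.put("今天", 1);
        vec1.put("天气", 2);
        vec1.put("晴朗", 1);

        Map<String, Integer> vec2 = new HashMap<>();
        vec2.put("今天", 1);
        vec2.put("天气", 1);
        vec2.put("下雨", 1);

        // Same computation as Reader.calculateCosSimilarity:
        // dot product over shared words divided by the product of the vector norms
        double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
        for (Map.Entry<String, Integer> e : vec1.entrySet()) {
            dot += e.getValue() * vec2.getOrDefault(e.getKey(), 0);
            norm1 += e.getValue() * e.getValue();
        }
        for (int count : vec2.values()) {
            norm2 += count * count;
        }
        System.out.println(dot / (Math.sqrt(norm1) * Math.sqrt(norm2)));
    }
}

With these counts the dot product is 3 and the norms are sqrt(6) and sqrt(3), so the printed similarity is about 0.71.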

Word Segmentation Toolkit

I used the HanLP word segmentation toolkit. A detailed tutorial can be found here.
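
For reference, here is a minimal sketch of how the toolkit is wired in and called. The SegmentDemo class is illustrative, and the Maven coordinates and version below are my assumption for the portable build, so please check the HanLP releases for the current version; as far as I know the portable jar bundles the core dictionaries, which is enough for basic segmentation without a separate hanlp.properties.

// Assumed Maven dependency (verify the current version on the HanLP release page):
// <dependency>
//     <groupId>com.hankcs</groupId>
//     <artifactId>hanlp</artifactId>
//     <version>portable-1.8.4</version>
// </dependency>
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class SegmentDemo {
    public static void main(String[] args) {
        // Segment a short sentence; each Term carries the token text in its `word` field
        List<Term> terms = HanLP.segment("商品和服务");
        for (Term term : terms) {
            System.out.println(term.word);
        }
    }
}

Each Term in the returned list exposes the token text through its word field, which is exactly what the Reader program feeds into the word-count maps.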

PSP Table

PSP2.1 | Personal Software Process Stages | Estimated Time (minutes) | Actual Time (minutes)
Planning | Planning | 20 | 20
Estimate | Estimate how much time the task will take | 10 | 10
Development | Development | 10 | 10
Analysis | Requirements analysis (including learning new technologies) | 10 | 10
Design Spec | Generate design documents | 5 | 5
Design Review | Design review | 10 | 10
Coding Standard | Coding standard (establish appropriate standards for the current development) | 10 | 10
Design | Detailed design | 10 | 10
Coding | Coding | 30 | 30
Code Review | Code review | 20 | 20
Test | Testing (self-testing, modifying code, submitting changes) | 20 | 20
Reporting | Reporting | 10 | 10
Test Report | Test report | 5 | 5
Size Measurement | Size measurement | 10 | 10
Postmortem & Process Improvement Plan | Postmortem and propose a process improvement plan | 10 | 10
Total | | 190 | 190

GitHub

Link

For personal reasons, I was unable to bundle the segmentation toolkit's configuration file into the jar, so the program has not been packaged as a jar. I hope someone who knows how to do this will leave a comment and share.