Paper Plagiarism Check

Code

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.io.*;
import java.text.DecimalFormat;
import java.util.*;

public class Reader {
    // Cosine similarity of two word-frequency vectors
    public static double calculateCosSimilarity(Map<String, Integer> vec1, Map<String, Integer> vec2) {
        double dotProduct = 0.0;
        double norm1 = 0.0;
        double norm2 = 0.0;

        // Dot product over shared words, plus the squared norm of the first vector
        for (Map.Entry<String, Integer> entry : vec1.entrySet()) {
            String word = entry.getKey();
            int count = entry.getValue();
            dotProduct += count * vec2.getOrDefault(word, 0);
            norm1 += Math.pow(count, 2);
        }

        // Squared norm of the second vector
        for (Map.Entry<String, Integer> entry : vec2.entrySet()) {
            int count = entry.getValue();
            norm2 += Math.pow(count, 2);
        }

        // Avoid division by zero when either text produced no words
        if (norm1 == 0.0 || norm2 == 0.0) {
            return 0.0;
        }
        return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
    }

    public static void main(String[] args) throws IOException {
        Scanner sc = new Scanner(System.in);
        String filePath1 = sc.next();   // path of the original paper
        String filePath2 = sc.next();   // path of the paper to be checked
        String filePath3 = sc.next();   // path of the file the result is written to
        sc.close();

        // Read both papers into memory
        StringBuilder content1 = new StringBuilder();
        StringBuilder content2 = new StringBuilder();
        String line;
        BufferedReader reader = new BufferedReader(new FileReader(filePath1));
        while ((line = reader.readLine()) != null) {
            content1.append(line);
            content1.append(System.lineSeparator());
        }
        reader.close();
        reader = new BufferedReader(new FileReader(filePath2));
        while ((line = reader.readLine()) != null) {
            content2.append(line);
            content2.append(System.lineSeparator());
        }
        reader.close();
        String paper1 = content1.toString();
        String paper2 = content2.toString();

        // Segment both papers with the HanLP word segmentation toolkit
        List<Term> segList1 = HanLP.segment(paper1);
        List<Term> segList2 = HanLP.segment(paper2);

        // Count how often each token occurs in the first paper
        ArrayList<String> list1 = new ArrayList<>();
        Map<String, Integer> wordCountMap1 = new HashMap<>();
        for (int i = 0; i < segList1.size(); i++) {
            list1.add(segList1.get(i).word);
        }
        for (String word : list1) {
            wordCountMap1.put(word, wordCountMap1.getOrDefault(word, 0) + 1);
        }

        // Count how often each token occurs in the second paper
        ArrayList<String> list2 = new ArrayList<>();
        Map<String, Integer> wordCountMap2 = new HashMap<>();
        for (int i = 0; i < segList2.size(); i++) {
            list2.add(segList2.get(i).word);
        }
        for (String word : list2) {
            wordCountMap2.put(word, wordCountMap2.getOrDefault(word, 0) + 1);
        }

        // Compute the cosine similarity and write it to the output file, rounded to two decimals
        double rate = calculateCosSimilarity(wordCountMap1, wordCountMap2);
        System.out.println(new DecimalFormat("0.00").format(rate));
        BufferedWriter out = new BufferedWriter(new FileWriter(filePath3));
        out.write(new DecimalFormat("0.00").format(rate));
        out.close();
    }

}
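
As a quick sanity check on the cosine metric, here is a minimal, self-contained sketch. The CosSimilarityDemo class and its toy word counts are made up for illustration and are not part of the submitted program; it applies the same formula as calculateCosSimilarity, dot(v1, v2) / (|v1| * |v2|), to two tiny hand-built frequency maps:

import java.util.HashMap;
import java.util.Map;

public class CosSimilarityDemo {
    public static void main(String[] args) {
        // Two toy word-frequency vectors; in the real program these come from HanLP segmentation
        Map<String, Integer> vec1 = new HashMap<>();
        vec1.put("今天", 1);
        vec1.put("天气", 2);
        vec1.put("晴朗", 1);

        Map<String, Integer> vec2 = new HashMap<>();
        vec2.put("今天", 1);
        vec2.put("天气", 1);
        vec2.put("下雨", 1);

        // Same computation as Reader.calculateCosSimilarity:
        // dot product over shared words divided by the product of the vector norms
        double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
        for (Map.Entry<String, Integer> e : vec1.entrySet()) {
            dot += e.getValue() * vec2.getOrDefault(e.getKey(), 0);
            norm1 += e.getValue() * e.getValue();
        }
        for (int count : vec2.values()) {
            norm2 += count * count;
        }
        System.out.println(dot / (Math.sqrt(norm1) * Math.sqrt(norm2)));
    }
}

With these counts the dot product is 3 and the norms are sqrt(6) and sqrt(3), so the printed similarity is about 0.71.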

Word Segmentation Toolkit

I used the HanLP word segmentation toolkit. A detailed tutorial can be found here.
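
For reference, here is a minimal sketch of how the toolkit is wired in and called. The SegmentDemo class is illustrative, and the Maven coordinates and version below are my assumption for the portable build, so please check the HanLP releases for the current version; as far as I know the portable jar bundles the core dictionaries, which is enough for basic segmentation without a separate hanlp.properties.

// Assumed Maven dependency (verify the current version on the HanLP release page):
// <dependency>
//     <groupId>com.hankcs</groupId>
//     <artifactId>hanlp</artifactId>
//     <version>portable-1.8.4</version>
// </dependency>
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class SegmentDemo {
    public static void main(String[] args) {
        // Segment a short sentence; each Term carries the token text in its `word` field
        List<Term> terms = HanLP.segment("商品和服务");
        for (Term term : terms) {
            System.out.println(term.word);
        }
    }
}

Each Term in the returned list exposes the token text through its word field, which is exactly what the Reader program feeds into the word-count maps.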

PSP Table

PSP2.1 | Personal Software Process Stages | Estimated Time (minutes) | Actual Time (minutes)
Planning | Planning | 20 | 20
Estimate | Estimate how much time the task will take | 10 | 10
Development | Development | 10 | 10
Analysis | Requirements analysis (including learning new technologies) | 10 | 10
Design Spec | Generate design documents | 5 | 5
Design Review | Design review | 10 | 10
Coding Standard | Coding standard (establish appropriate standards for the current development) | 10 | 10
Design | Detailed design | 10 | 10
Coding | Coding | 30 | 30
Code Review | Code review | 20 | 20
Test | Testing (self-testing, modifying code, submitting changes) | 20 | 20
Reporting | Reporting | 10 | 10
Test Report | Test report | 5 | 5
Size Measurement | Size measurement | 10 | 10
Postmortem & Process Improvement Plan | Postmortem and propose a process improvement plan | 10 | 10
Total | | 190 | 190

GitHub

Link

For personal reasons, I was unable to bundle the segmentation toolkit's configuration file into the jar, so the program has not been packaged as a jar. I hope someone who knows how to do this will leave a comment and share.