论文查重
代码
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
/**
 * Command-line plagiarism checker: segments two text files into words with HanLP,
 * builds a word-frequency vector for each, and reports their cosine similarity.
 */
public class Reader {

    /**
     * Computes the cosine similarity of two word-frequency vectors.
     *
     * @param vec1 word-to-count map of the first text
     * @param vec2 word-to-count map of the second text
     * @return {@code dot(vec1, vec2) / (|vec1| * |vec2|)}, or {@code 0.0} when either
     *         vector has zero norm (for example, an empty map), since the ratio is
     *         undefined there and would otherwise evaluate to NaN
     */
    public static double calculateCosSimilarity(Map<String, Integer> vec1, Map<String, Integer> vec2) {
        double dotProduct = 0.0;
        double norm1 = 0.0;
        double norm2 = 0.0;
        for (Map.Entry<String, Integer> entry : vec1.entrySet()) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double count = entry.getValue();
            dotProduct += count * vec2.getOrDefault(entry.getKey(), 0);
            norm1 += count * count;
        }
        for (int value : vec2.values()) {
            double count = value;
            norm2 += count * count;
        }
        if (norm1 == 0.0 || norm2 == 0.0) {
            return 0.0;
        }
        return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
    }

    /**
     * Reads three whitespace-separated paths from standard input (original article,
     * article to check, result file), prints the similarity formatted as {@code 0.00}
     * and writes the same text to the result file.
     *
     * @param args unused
     * @throws IOException if either input file cannot be read or the result file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // The scanner wraps System.in, so it is intentionally left open.
        Scanner sc = new Scanner(System.in);
        String filePath1 = sc.next(); // path of the original article
        String filePath2 = sc.next(); // path of the article to be checked
        String filePath3 = sc.next(); // path of the result file

        String paper1 = readFile(filePath1);
        String paper2 = readFile(filePath2);

        // Segment both articles with HanLP and count how often each word occurs.
        Map<String, Integer> wordCountMap1 = countWords(HanLP.segment(paper1));
        Map<String, Integer> wordCountMap2 = countWords(HanLP.segment(paper2));

        double rate = calculateCosSimilarity(wordCountMap1, wordCountMap2);
        String formattedRate = new DecimalFormat("0.00").format(rate);
        System.out.println(formattedRate);

        try (BufferedWriter out = new BufferedWriter(new FileWriter(filePath3))) {
            out.write(formattedRate);
        }
    }

    /**
     * Reads a whole text file, terminating every line with the system line separator.
     * The file is decoded by {@link FileReader} with its default charset.
     */
    private static String readFile(String path) throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
                content.append(System.lineSeparator());
            }
        }
        return content.toString();
    }

    /** Builds a word-to-occurrence-count map from a list of segmented terms. */
    private static Map<String, Integer> countWords(List<Term> terms) {
        Map<String, Integer> counts = new HashMap<>();
        for (Term term : terms) {
            counts.merge(term.word, 1, Integer::sum);
        }
        return counts;
    }
}
分词工具包
我使用的是 HanLP 分词工具
详细教程看这里
PSP表格
PSP2.1 | Personal Software Process Stages | 预估耗时(分钟) | 实际耗时(分钟) |
---|---|---|---|
Planning | 计划 | 20 | 20 |
Estimate | 估计这个任务需要多少时间 | 10 | 10 |
Development | 开发 | 10 | 10 |
Analysis | 需求分析 (包括学习新技术) | 10 | 10 |
Design Spec | 生成设计文档 | 5 | 5 |
Design Review | 设计复审 | 10 | 10 |
Coding Standard | 代码规范 (为目前的开发制定合适的规范) | 10 | 10 |
Design | 具体设计 | 10 | 10 |
Coding | 具体编码 | 30 | 30 |
Code Review | 代码复审 | 20 | 20 |
Test | 测试(自我测试,修改代码,提交修改) | 20 | 20 |
Reporting | 报告 | 10 | 10 |
Test Report | 测试报告 | 5 | 5 |
Size Measurement | 计算工作量 | 10 | 10 |
Postmortem & Process Improvement Plan | 事后总结, 并提出过程改进计划 | 10 | 10 |
合计 | | 190 | 190 |