第一次个人编程作业

| 这个作业属于哪个课程 | 计科22级12班 |
| ----------------- | --------------- |
| 这个作业要求在哪里 | 个人项目 |
| 这个作业的目标 | 设计一个论文查重算法，完成软件设计的个人开发流程 |

我的GitHub：GitHub

1.PSP表格

PSP是卡耐基梅隆大学（CMU）的专家们针对软件工程师所提出的一套模型：Personal Software Process (PSP，个人开发流程，或称个体软件过程)。

PSP2.1	Personal Software Process Stages	预估耗时（分钟）	实际耗时（分钟）
Planning	计划	25	20
Estimate	估计这个任务需要多少时间	25	20
Development	开发	725	705
Analysis	需求分析 (包括学习新技术)	240	350
Design Spec	生成设计文档	15	20
Design Review	设计复审	30	25
Coding Standard	代码规范 (为目前的开发制定合适的规范)	20	30
Design	具体设计	60	50
Coding	具体编码	240	120
Code Review	代码复审	60	40
Test	测试（自我测试，修改代码，提交修改）	60	70
Code Review	代码复审	180	229
Reporting	报告	30	45
Test Report	测试报告	30	45
Size Measurement	计算工作量	30	35
Postmortem & Process Improvement Plan	事后总结, 并提出过程改进计划	120	150
	合计	930	955

2.整体流程

3.开发环境

编程语言：Java
IDE：IntelliJ IDEA 2024.1

4.算法代码

MainClass：

点击查看代码

package com.my.check;

import java.text.DecimalFormat;
import java.util.Scanner;

/**
 * Command-line entry point of the plagiarism checker.
 *
 * <p>Prompts for the path of the suspected copy and the path of the original
 * paper, computes a SimHash fingerprint for each text, prints the similarity
 * ratio and writes a short report to {@link #OUTPUT_PATH}.
 */
public class MainClass {

	/** Number of bits in each SimHash fingerprint; also the divisor of the ratio. */
	private static final int HASH_BITS = 64;

	/** Location of the report file written after every comparison. */
	private static final String OUTPUT_PATH = "D:\\develop\\dxadd-main\\dxadd-main\\Check\\test\\output.txt";

	/**
	 * Runs one interactive comparison.
	 *
	 * @param args ignored; both paths are read from standard input
	 */
	public static void main(String[] args) {

		try (Scanner scn = new Scanner(System.in)) {
			System.out.print("请输入抄袭版论文的文件的绝对路径：");
			String path = escapeBackslashes(scn.next());
			System.out.println(path);
			StringBuffer sb1 = ReadTxt.readTxt(path);
			// readTxt signals "missing or unreadable" with null; check it explicitly
			// instead of waiting for a NullPointerException further down.
			if (sb1 == null) {
				System.out.println("输入路径错误或者文件不存在！");
				return;
			}

			System.out.print("请输入论文原文的绝对路径：");
			String txtPath = escapeBackslashes(scn.next());
			System.out.println(txtPath);
			StringBuffer sb2 = ReadTxt.readTxt(txtPath);
			if (sb2 == null) {
				System.out.println("输入路径错误或者文件不存在！");
				return;
			}

			SimHash hash1 = new SimHash(sb1.toString(), HASH_BITS);
			SimHash hash2 = new SimHash(sb2.toString(), HASH_BITS);

			// getDistance counts the positions where both bit strings agree,
			// so dividing by the fingerprint width yields a ratio in [0, 1].
			int dis = hash1.getDistance(hash1.getStrSimHash(), hash2.getStrSimHash());
			// "0.00" keeps the leading zero (0.98 rather than .98).
			DecimalFormat decimalFormat = new DecimalFormat("0.00");
			String ratio = decimalFormat.format(dis / (double) HASH_BITS);
			System.out.println(hash1.hammingDistance(hash2) + " " + dis);
			System.out.println("hash1和hash2的相似率：" + ratio);

			String content = "\r\n抄袭版论文的文件的绝对路径：" + path + "\r\n论文原文的绝对路径：" + txtPath + "\r\n论文重复率为：" + ratio;
			ReadTxt.writeTxt(OUTPUT_PATH, content);
		} catch (Exception e) {
			System.out.println("输入错误导致程序出错！");
			e.printStackTrace();
		}
	}

	/**
	 * Doubles every backslash of a path that contains single backslashes only.
	 *
	 * <p>The path is returned unchanged when it has no backslash after its first
	 * character or when it already contains a doubled backslash. Uses the literal
	 * {@link String#replace(CharSequence, CharSequence)}: a lone backslash is not a
	 * valid regular expression, so {@code replaceAll} would throw here.
	 *
	 * @param rawPath path exactly as typed by the user
	 * @return the path with its backslashes doubled, or {@code rawPath} itself
	 */
	private static String escapeBackslashes(String rawPath) {
		if (rawPath.indexOf("\\") > 0 && rawPath.indexOf("\\\\") < 0) {
			return rawPath.replace("\\", "\\\\");
		}
		return rawPath;
	}
}

ReadTxt：

点击查看代码

package com.my.check;

import java.io.BufferedReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import java.io.InputStreamReader;

/**
 * Small helpers for reading and writing whole text files.
 *
 * <p>Both directions use UTF-8 explicitly so the result does not depend on the
 * platform default charset of the JVM.
 */
public class ReadTxt {

    /**
     * Reads a text file and returns its content with the lines concatenated.
     *
     * <p>Line terminators are dropped and nothing is inserted between lines.
     *
     * @param txtPath path of the file to read
     * @return the file content, or {@code null} when {@code txtPath} is
     *         {@code null}, is not a regular file, or cannot be read
     */
    public static StringBuffer readTxt(String txtPath) {
        if (txtPath == null) {
            return null;
        }
        File file = new File(txtPath);
        // isFile() is already false for a path that does not exist.
        if (!file.isFile()) {
            return null;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return sb;
        } catch (java.io.IOException e) {
            System.err.println("Failed to read " + txtPath + ": " + e.getMessage());
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Writes {@code content} to a text file, replacing any previous content.
     *
     * <p>The file is created when it does not exist, and so are missing parent
     * directories. Failures are reported on standard error and not rethrown.
     *
     * @param txtPath path of the file to write
     * @param content text to store in the file
     */
    public static void writeTxt(String txtPath, String content) {
        if (txtPath == null || content == null) {
            System.err.println("writeTxt: path and content must not be null");
            return;
        }
        File file = new File(txtPath);
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            System.err.println("Failed to create directory " + parent);
            return;
        }
        // FileOutputStream creates the file itself, so no createNewFile() is needed.
        try (FileOutputStream out = new FileOutputStream(file)) {
            out.write(content.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            out.flush();
        } catch (java.io.IOException e) {
            System.err.println("Failed to write " + txtPath + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Manual smoke test: prints the content of a fixed sample file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String samplePath = "D:\\develop\\dxadd-main\\dxadd-main\\Check\\test\\orig.txt";
        StringBuffer sb = ReadTxt.readTxt(samplePath);
        if (sb == null) {
            System.err.println("Cannot read " + samplePath);
            return;
        }
        System.out.println(sb.toString());
    }
}

SimHash：

点击查看代码

package com.my.check;


import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class SimHash {

		 
	     private String tokens;
	 
	     private BigInteger intSimHash;
	 
	     private String strSimHash;
	 
	     private int  hashbits;
	 
	     public SimHash(String tokens) {
	         this .tokens = tokens;
	         this .intSimHash = this .simHash();
	     }
	 
	     public SimHash(String tokens,  int hashbits) {
	         this .tokens = tokens;
	         this .hashbits = hashbits;
	         this .intSimHash = this .simHash();
	     }
	 
//	     HashMap<String, integer=" "> wordMap = new  HashMap<String, integer=" ">();
	 
	     public BigInteger simHash() {
	         // 定义特征向量/数组
	         int [] v = new  int [ this .hashbits];
	         // 1、将文本去掉格式后, 分词.
	         StringTokenizer stringTokens = new  StringTokenizer( this .tokens,"，。！、：“”");
	         while (stringTokens.hasMoreTokens()) {
	             String temp = stringTokens.nextToken();
//	             System.out.println(temp);
	             // 2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数.
	             BigInteger t = this .hash(temp);
	             for ( int  i =  0 ; i <  this .hashbits; i++) {
	                 BigInteger bitmask = new  BigInteger( "1" ).shiftLeft(i);
	                 // 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字),
	                 // 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
	                 // 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
	                 if (t.and(bitmask).signum() !=  0 ) {
	                     // 这里是计算整个文档的所有特征的向量和
	                     // 这里实际使用中需要 +- 权重，而不是简单的 +1/-1，
	                     v[i] += 1 ;
	                 } else  {
	                     v[i] -= 1 ;
	                 }
	             }
	         }
	         BigInteger fingerprint = new  BigInteger( "0" );
	         StringBuffer simHashBuffer = new  StringBuffer();
	         for ( int  i =  0 ; i <  this .hashbits; i++) {
	             // 4、最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
	             if (v[i] >=  0 ) {
	                 fingerprint = fingerprint.add( new BigInteger( "1" ).shiftLeft(i));
	                 simHashBuffer.append( "1" );
	             } else  {
	                 simHashBuffer.append( "0" );
	             }
	         }
	         this .strSimHash = simHashBuffer.toString();
//	         System.out.println( this .strSimHash + " length "  +  this .strSimHash.length());
	         return fingerprint;
	     }
	     public BigInteger simHash2() {
	         // 定义特征向量/数组
	         int [] v = new  int [ this .hashbits];
	         // 1、将文本去掉格式后, 分词.
	         StringTokenizer stringTokens = new  StringTokenizer( this .tokens,"，。！、：“”");
	         while (stringTokens.hasMoreTokens()) {
	             String temp = stringTokens.nextToken();
	             
	             // 2、将每一个分词hash为一组固定长度的数列.比如 64bit 的一个整数.
	             BigInteger t = this .hash(temp);
	             for ( int  i =  0 ; i <  this .hashbits; i++) {
	                 BigInteger bitmask = new  BigInteger( "1" ).shiftLeft(i);
	                 // 3、建立一个长度为64的整数数组(假设要生成64位的数字指纹,也可以是其它数字),
	                 // 对每一个分词hash后的数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
	                 // 中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
	                 if (t.and(bitmask).signum() !=  0 ) {
	                     // 这里是计算整个文档的所有特征的向量和
	                     // 这里实际使用中需要 +- 权重，而不是简单的 +1/-1，
	                     v[i] += 1 ;
	                 } else  {
	                     v[i] -= 1 ;
	                 }
	             }
	         }
	         BigInteger fingerprint = new  BigInteger( "0" );
	         StringBuffer simHashBuffer = new  StringBuffer();
	         for ( int  i =  0 ; i <  this .hashbits; i++) {
	             // 4、最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
	             if (v[i] >=  0 ) {
	                 fingerprint = fingerprint.add( new BigInteger( "1" ).shiftLeft(i));
	                 simHashBuffer.append( "1" );
	             } else  {
	                 simHashBuffer.append( "0" );
	             }
	         }
	         this .strSimHash = simHashBuffer.toString();
	         System.out.println( this .strSimHash + " length "  +  this .strSimHash.length());
	         return fingerprint;
	     }
	     private BigInteger hash(String source) {
	         if (source ==  null || source.length() ==  0 ) {
	             return new  BigInteger( "0" );
	         } else  {
	             char [] sourceArray = source.toCharArray();
	             BigInteger x = BigInteger.valueOf((( long ) sourceArray[ 0 ]) <<  7 );
	             BigInteger m = new  BigInteger( "1000003" );
	             BigInteger mask = new  BigInteger( "2" ).pow( this .hashbits).subtract( new BigInteger( "1" ));
	             for ( char  item : sourceArray) {
	                 BigInteger temp = BigInteger.valueOf(( long ) item);
	                 x = x.multiply(m).xor(temp).and(mask);
	             }
	             x = x.xor( new BigInteger(String.valueOf(source.length())));
	             if (x.equals( new  BigInteger( "-1" ))) {
	                 x = new  BigInteger( "-2" );
	             }
	             return x;
	         }
	     }
	 
	     public int  hammingDistance(SimHash other) {
	 
	         BigInteger x = this .intSimHash.xor(other.intSimHash);
	         int tot =  0 ;
	 
	         // 统计x中二进制位数为1的个数
	         // 我们想想，一个二进制数减去1，那么，从最后那个1（包括那个1）后面的数字全都反了，对吧，然后，n&(n-1)就相当于把后面的数字清0，
	         // 我们看n能做多少次这样的操作就OK了。
	 
	         while (x.signum() !=  0 ) {
	             tot += 1 ;
	             x = x.and(x.subtract( new BigInteger( "1" )));
	         }
	         return tot;
	     }
	 
	     public int  getDistance(String str1, String str2) {
	         int distance;
	         if (str1.length() != str2.length()) {
	             distance = - 1 ;
	         } else  {
	             distance = 0 ;
	             for ( int  i =  0 ; i < str1.length(); i++) {
	                 if (str1.charAt(i) == str2.charAt(i)) {
	                     distance++;
	                 }
	             }
	         }
	         return distance;
	     }
	 
//	     public List subByDistance(SimHashDemo simHash,  int distance) {
//	         // 分成几组来检查
//	         int numEach =  this .hashbits / (distance + 1 );
//	         List characters = new  ArrayList();
//	 
//	         StringBuffer buffer = new  StringBuffer();
//	 
//	         int k =  0 ;
//	         for ( int  i =  0 ; i <  this .intSimHash.bitLength(); i++) {
//	             // 当且仅当设置了指定的位时，返回 true
//	             boolean sr = simHash.intSimHash.testBit(i);
//	 
//	             if (sr) {
//	                 buffer.append( "1" );
//	             } else  {
//	                 buffer.append( "0" );
//	             }
//	 
//	             if ((i +  1 ) % numEach == 0 ) {
//	                 // 将二进制转为BigInteger
//	                 BigInteger eachValue = new  BigInteger(buffer.toString(), 2 );
//	                 System.out.println( "----" + eachValue);
//	                 buffer.delete( 0 , buffer.length());
//	                 characters.add(eachValue);
//	             }
//	         }
//	 
//	         return characters;
//	     }
	 
	 
	     public String getTokens() {
			return tokens;
		}

		public void setTokens(String tokens) {
			this.tokens = tokens;
		}

		public BigInteger getIntSimHash() {
			return intSimHash;
		}

		public void setIntSimHash(BigInteger intSimHash) {
			this.intSimHash = intSimHash;
		}

		public String getStrSimHash() {
			return strSimHash;
		}

		public void setStrSimHash(String strSimHash) {
			this.strSimHash = strSimHash;
		}

		public static  void  main(String[] args) {
	         String s = "This is a test string for testing" ;
	         SimHash hash1 = new  SimHash(s,  64 );
	         System.out.println(hash1.intSimHash + "  "  + hash1.intSimHash.bitLength());
//	         hash1.subByDistance(hash1, 3 );
	 
	         s = "This is a test string for testing, This is a test string for testing abcdef" ;
	         SimHash hash2 = new  SimHash(s,  64 );
	         System.out.println(hash2.intSimHash + "  "  + hash2.intSimHash.bitCount());
//	         hash1.subByDistance(hash2, 3 );
	         
	         s = "This is a test string for testing als" ;
	         SimHash hash3 = new  SimHash(s,  64 );
	         System.out.println(hash3.intSimHash + "  "  + hash3.intSimHash.bitCount());
//	         hash1.subByDistance(hash3, 4 );
	         
	         System.out.println( "============================" );
	         
	         int dis = hash1.getDistance(hash1.strSimHash, hash2.strSimHash);
	         System.out.println(hash1.hammingDistance(hash2) + " "  + dis);
	         System.out.println("hash1和hash2的相似率："+ (dis/64.0));
	 
	         int dis2 = hash1.getDistance(hash1.strSimHash, hash3.strSimHash);
	         System.out.println(hash1.hammingDistance(hash3) + " "  + dis2);
	         System.out.println("hash1和hash3的相似率："+ (dis/64));
	         
	         //通过Unicode编码来判断中文
//	         String str = "中国chinese" ;
//	         for ( int  i =  0 ; i < str.length(); i++) {
//	             System.out.println(str.substring(i, i + 1 ).matches( "[\\u4e00-\\u9fbb]+" ));
//	         }
	 
	     }

}

5.JProfiler测试

6.结果

输出结果：orig2.txt为我个人创建和原文一模一样的文本
可知输出成功，与原文一致
orig.txt依次与orig_0.8_add.txt，orig_0.8_del.txt，orig_0.8_dis_1.txt，orig_0.8_dis_10.txt，orig_0.8_dis_15.txt比较，并将结果输出到output.txt