余弦相似度判断
1.两个文件相似度比较
文本文件
1)先分词,再对词进行编码,把文本转换为向量
2)使用余弦相似度公式计算两个向量夹角的余弦值
二进制文件
读取文件的字节数值作为向量(长度不一致时,较短的向量在末尾补0),直接计算余弦相似度
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""Compare two binary files (e.g. raw PCM dumps) by cosine similarity.

Usage:
    python3 SCRIPT FILE1 FILE2

Each file is read as a sequence of unsigned byte values (0-255); multi-byte
sample formats are NOT decoded, the raw bytes themselves form the vector.
When the files differ in length the shorter vector is zero-padded at the end
so both have the same dimension, then the cosine of the angle between the two
vectors is printed.
"""
import sys

import numpy as np


def load_bytes(path):
    """Return the content of *path* as a 1-D float64 array of byte values."""
    with open(path, 'rb') as f:
        data = f.read()
    # frombuffer converts the whole buffer in native code instead of a
    # per-byte Python loop; the float64 cast keeps the later dot product
    # from wrapping around in uint8 arithmetic.
    return np.frombuffer(data, dtype=np.uint8).astype(np.float64)


def pad_to_same_length(a, b):
    """Zero-pad the shorter of two 1-D arrays at its end; return both."""
    diff = a.size - b.size
    if diff > 0:
        b = np.pad(b, (0, diff), mode='constant')
    elif diff < 0:
        a = np.pad(a, (0, -diff), mode='constant')
    return a, b


def vector_cosine(a, b):
    """Return the cosine similarity of two equal-length 1-D arrays.

    If either vector has zero norm (all zeros, or empty) the cosine is
    undefined; 0.0 is returned in that case instead of dividing by zero.
    """
    # Computed directly with NumPy: a single dot product and two norms do
    # not warrant a dependency on a machine-learning library.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)


def compare_files(path1, path2):
    """Return the cosine similarity between the bytes of two files."""
    a, b = pad_to_same_length(load_bytes(path1), load_bytes(path2))
    return vector_cosine(a, b)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. The first two entries are the file paths,
            any further entries are ignored.

    Returns:
        Process exit status: 0 on success, 2 when fewer than two paths
        were supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print(f'usage: {sys.argv[0]} FILE1 FILE2', file=sys.stderr)
        return 2
    print('两个pcm相似度:', compare_files(args[0], args[1]))
    return 0


if __name__ == '__main__':
    sys.exit(main())