# Python用哈希算法查找相似图片(包括不同分辨率,不同大小,不同格式的图片)
# -*- coding: utf-8 -*-
"""Find similar images with hash algorithms and move them to "<dir>_df".

Images count as duplicates when they look alike, even if they differ in
resolution, file size or file format.

Requires OpenCV:
    pip install opencv-python
"""
import os
import random
import shutil

import cv2
import numpy as np


class DuplicateFiles(object):
    """Detect visually similar images below a directory and move them aside."""

    # Root directory to scan; replaced per instance in __init__.
    dir = ''

    # Lower-case file extensions (without the dot) that are treated as images.
    IMAGE_EXTENSIONS = frozenset(
        {'jpg', 'bmp', 'png', 'jpeg', 'gif', 'jfif'}
    )

    # Two images are duplicates when any similarity score exceeds this value.
    SIMILARITY_THRESHOLD = 0.90

    def __init__(self, dir):
        """Remember the root directory that mvPhoto() will scan."""
        self.dir = dir

    def aHash(self, img, shape=(10, 10)):
        """Return the average hash of a BGR image as a string of '0'/'1'.

        The image is shrunk to ``shape`` (width, height), converted to
        grayscale, and every pixel brighter than the mean becomes '1'.
        """
        gray = cv2.cvtColor(cv2.resize(img, shape), cv2.COLOR_BGR2GRAY)
        # mean() accumulates in floating point; adding the 8-bit pixels one by
        # one can wrap around, and a fixed divisor is only right for 10x10.
        bits = gray > gray.mean()
        return ''.join('1' if bit else '0' for bit in bits.ravel())

    def dHash(self, img, shape=(10, 10)):
        """Return the difference hash of a BGR image as a string of '0'/'1'.

        The image is shrunk to (width + 1, height); a bit is '1' when a pixel
        is brighter than its right-hand neighbour.
        """
        width, height = shape
        resized = cv2.resize(img, (width + 1, height))
        gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        # Compare each column with the next one, row by row.
        bits = gray[:, :-1] > gray[:, 1:]
        return ''.join('1' if bit else '0' for bit in bits.ravel())

    def pHash(self, img, shape=(10, 10)):
        """Return the perceptual hash of a BGR image as a list of 0/1 ints.

        The image is shrunk to 32x32, converted to grayscale and run through
        a DCT; the top-left ``shape`` (width, height) block of coefficients
        is thresholded against its own mean. ``shape`` is capped by the
        32x32 transform size.
        """
        gray = cv2.cvtColor(cv2.resize(img, (32, 32)), cv2.COLOR_BGR2GRAY)
        # cv2.dct needs floating-point input.
        dct = cv2.dct(np.float32(gray))
        width, height = shape
        dct_roi = dct[0:height, 0:width]
        average = np.mean(dct_roi)
        return (dct_roi > average).astype(int).ravel().tolist()

    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Return the mean per-channel histogram similarity of two images.

        Both images are resized to ``size`` and split into channels; the
        result is the average of calculate() over the channel pairs.
        """
        channels1 = cv2.split(cv2.resize(image1, size))
        channels2 = cv2.split(cv2.resize(image2, size))
        scores = [
            self.calculate(ch1, ch2)
            for ch1, ch2 in zip(channels1, channels2)
        ]
        # Divide by the real channel count instead of assuming three.
        return sum(scores) / len(scores)

    def calculate(self, image1, image2):
        """Return the histogram overlap of channel 0 of two images as a float.

        Each of the 256 bins scores ``1 - |h1 - h2| / max(h1, h2)`` (1 when
        both are equal) and the scores are averaged. Bin counts are not
        normalised, so the images should contain the same number of pixels.
        """
        # The upper bound of the range is exclusive, so 256 is required for
        # pixel value 255 to be counted.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256]).ravel()
        hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256]).ravel()
        largest = np.maximum(hist1, hist2)
        # Bins empty in both histograms match perfectly; avoid 0 / 0 there.
        divisor = np.where(largest > 0, largest, 1)
        overlap = 1 - np.abs(hist1 - hist2) / divisor
        return float(overlap.mean())

    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Return the fraction of positions at which two hashes agree.

        Returns -1 when the hashes differ in length (invalid arguments) and
        0.0 for two empty hashes. ``shape`` is kept for backward
        compatibility; the ratio is taken over the actual hash length.
        """
        total = len(hash1)
        if total != len(hash2):
            return -1
        if total == 0:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        return matches / total

    def mymovefile(self, srcfile, dstpath, ffname):
        """Move ``srcfile`` into directory ``dstpath``, creating it if needed.

        ``ffname`` is the new file name; when it is empty the original file
        name is kept. A missing source file is reported and skipped.
        """
        if not os.path.isfile(srcfile):
            print("%s not exist!" % (srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        # join() works whether or not dstpath ends with a separator.
        shutil.move(srcfile, os.path.join(dstpath, fname))

    def list_all_files(self, rootdir):
        """Return the paths of all files below ``rootdir``, recursively."""
        files = []
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                files.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                files.append(path)
        return files

    def _is_image(self, path):
        """Return True when the extension of ``path`` is a known image type."""
        ext = os.path.splitext(path)[1].lstrip('.').lower()
        return ext in self.IMAGE_EXTENSIONS

    def _load_image(self, path):
        """Decode ``path`` into a BGR array; return None when unreadable."""
        # Reading the bytes with NumPy and decoding in memory replaces a plain
        # cv2.imread call -- presumably to cope with non-ASCII paths.
        try:
            data = np.fromfile(path, dtype=np.uint8)
        except OSError:
            return None
        if data.size == 0:
            return None
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    def _is_similar(self, img1, hashes1, img2):
        """Return True when any similarity score exceeds the threshold.

        ``hashes1`` holds the precomputed (aHash, dHash, pHash) of ``img1``.
        """
        threshold = self.SIMILARITY_THRESHOLD
        hashes2 = (self.aHash(img2), self.dHash(img2), self.pHash(img2))
        for hash1, hash2 in zip(hashes1, hashes2):
            if self.cmpHash(hash1, hash2) > threshold:
                return True
        if self.classify_hist_with_split(img1, img2) > threshold:
            return True
        return self.calculate(img1, img2) > threshold

    def mvPhoto(self):
        """Move every group of similar images below self.dir to "<dir>_df".

        Each image is compared with all later images in the listing. Matches
        are moved with the master's base name as prefix; the master itself is
        moved last, if it had at least one match.
        """
        photoList = self.list_all_files(self.dir)
        dstpath = self.dir + '_df'
        for i, photo in enumerate(photoList):
            # An earlier pass may already have moved this file away.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:' + fname)
            if not self._is_image(photo):
                continue
            img1 = self._load_image(photo)
            if img1 is None:
                continue
            stem = os.path.splitext(fname)[0]
            # The master's hashes do not change inside the inner loop.
            hashes1 = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            move_master = False
            for candidate in photoList[i + 1:]:
                if not os.path.isfile(candidate):
                    continue
                if not self._is_image(candidate):
                    continue
                img2 = self._load_image(candidate)
                if img2 is None:
                    continue
                if not self._is_similar(img1, hashes1, img2):
                    continue
                move_master = True
                sname = os.path.basename(candidate)
                print(' move file:' + candidate)
                print('ffname[0]:' + stem)
                self.mymovefile(candidate, dstpath, stem + '_' + sname)
            # Move the master last so it stays readable during comparisons.
            if move_master:
                self.mymovefile(photo, dstpath, fname)


if __name__ == "__main__":
    # Scan the current working directory. To scan another folder, pass its
    # path instead, e.g. r'E:\python\photoCompare\328'.
    duplicateFiles = DuplicateFiles(os.getcwd())
    duplicateFiles.mvPhoto()