Python用哈希算法查找相似图片(包括不同分辨率,不同大小,不同格式的图片)
# -*- coding: utf-8 -*-
"""Find visually similar images with hash algorithms and move them aside.

Images count as duplicates when they look alike, regardless of resolution,
file size, or file format.  Every image that is similar to another one under
the scanned directory is moved into a sibling directory named ``<dir>_df``.

Requires OpenCV:  pip install opencv-python
"""
import os
import random
import shutil

import cv2
import numpy as np


class DuplicateFiles(object):
    """Detect similar images below a directory and move them to ``<dir>_df``.

    Similarity is decided by three perceptual hashes (average, difference and
    DCT based) plus two histogram comparisons; a pair is treated as duplicate
    as soon as any single score exceeds ``SIMILARITY_THRESHOLD``.
    """

    # Class-level default; overwritten per instance in __init__.  The name
    # shadows the builtin but is kept because callers read ``obj.dir``.
    dir = ''

    # Lower-case extensions (without the dot) that are treated as images.
    IMAGE_EXTENSIONS = frozenset(
        ('jpg', 'bmp', 'png', 'jpeg', 'gif', 'jfif'))

    # A score strictly above this value marks two images as similar.
    SIMILARITY_THRESHOLD = 0.90

    def __init__(self, dir):
        """Remember the root directory that ``mvPhoto`` will scan."""
        self.dir = dir

    # ------------------------------------------------------------------
    # Hash algorithms
    # ------------------------------------------------------------------
    def aHash(self, img, shape=(10, 10)):
        """Return the average hash of a BGR image as a string of '0'/'1'.

        The image is shrunk to ``shape`` (passed to ``cv2.resize`` as
        ``(width, height)``), converted to grayscale, and every pixel
        brighter than the mean gray level contributes a '1'.
        """
        img = cv2.resize(img, shape)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Take the mean of the whole array rather than summing uint8 scalars
        # one by one: a Python-level running sum can wrap around under
        # NumPy 2 promotion rules, and dividing by a literal 100 was only
        # right for the default 10x10 shape.
        bits = (gray > gray.mean()).ravel()
        return ''.join('1' if bit else '0' for bit in bits)

    def dHash(self, img, shape=(10, 10)):
        """Return the difference hash of a BGR image as a string of '0'/'1'.

        The image is shrunk to one column more than ``shape`` so that each
        row yields ``shape[0]`` left/right neighbour comparisons; a '1' means
        the left pixel is brighter than its right neighbour.
        """
        width, height = shape
        img = cv2.resize(img, (width + 1, height))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        bits = (gray[:, :-1] > gray[:, 1:]).ravel()
        return ''.join('1' if bit else '0' for bit in bits)

    def pHash(self, img, shape=(10, 10)):
        """Return the perceptual (DCT) hash of a BGR image as a list of 0/1.

        The image is shrunk to 32x32, converted to grayscale and transformed
        with a DCT.  The top-left ``shape`` block of coefficients (the low
        frequencies) is compared against its own mean.
        """
        img = cv2.resize(img, (32, 32))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # cv2.dct needs floating point input.
        dct = cv2.dct(np.float32(gray))
        dct_roi = dct[:shape[0], :shape[1]]
        return (dct_roi > dct_roi.mean()).astype(int).ravel().tolist()

    # ------------------------------------------------------------------
    # Histogram comparison
    # ------------------------------------------------------------------
    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Return the mean per-channel histogram similarity of two images.

        Both images are resized to ``size`` and split into channels; the
        result is the average of ``calculate`` over the channel pairs.
        """
        image1 = cv2.resize(image1, size)
        image2 = cv2.resize(image2, size)
        scores = [
            self.calculate(channel1, channel2)
            for channel1, channel2 in zip(cv2.split(image1),
                                          cv2.split(image2))
        ]
        if not scores:
            return 0.0
        # Average over the channels actually present instead of assuming 3.
        return sum(scores) / len(scores)

    def calculate(self, image1, image2):
        """Return the histogram overlap (0.0 - 1.0) of channel 0 of two images.

        For each of the 256 bins the score is ``1 - |h1 - h2| / max(h1, h2)``
        (1 when both bins are equal, including both empty); the bin scores
        are averaged.
        """
        # The upper bound of a calcHist range is exclusive, so it must be 256
        # for pixels with value 255 to be counted.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256])
        hist1 = hist1.ravel().astype(np.float64)
        hist2 = hist2.ravel().astype(np.float64)

        larger = np.maximum(hist1, hist2)
        degree = np.ones_like(larger)
        # Bins where the larger count is 0 are empty in both histograms and
        # keep the score 1; everywhere else the division is safe.
        filled = larger > 0
        degree[filled] = (
            1 - np.abs(hist1[filled] - hist2[filled]) / larger[filled])
        return float(degree.mean())

    # ------------------------------------------------------------------
    # Hash comparison
    # ------------------------------------------------------------------
    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Return the fraction of positions at which two hashes agree.

        Returns -1 when the hashes differ in length (invalid arguments).
        ``shape`` is accepted for backward compatibility only; the ratio is
        computed over the real hash length so that hashes produced with a
        non-default shape are still scored correctly.
        """
        length = len(hash1)
        if length != len(hash2):
            return -1
        if length == 0:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        return matches / length

    # ------------------------------------------------------------------
    # File helpers
    # ------------------------------------------------------------------
    def _unique_destination(self, dstpath, fname):
        """Return a path inside ``dstpath`` for ``fname`` that does not exist yet."""
        candidate = os.path.join(dstpath, fname)
        stem, ext = os.path.splitext(fname)
        counter = 1
        while os.path.exists(candidate):
            candidate = os.path.join(
                dstpath, '%s_%d%s' % (stem, counter, ext))
            counter += 1
        return candidate

    def mymovefile(self, srcfile, dstpath, ffname):
        """Move ``srcfile`` into directory ``dstpath``.

        ``ffname`` is the file name to use at the destination; when it is
        empty the source file name is kept.  The destination directory is
        created on demand.  A missing source is reported and skipped.
        """
        if not os.path.isfile(srcfile):
            print("%s not exist!" % (srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        # Never overwrite a file that is already in the destination: two
        # different images may map to the same target name.
        shutil.move(srcfile, self._unique_destination(dstpath, fname))

    def list_all_files(self, rootdir):
        """Return the paths of all files below ``rootdir``, recursively.

        Entries are visited in sorted order so the result is deterministic.
        """
        _files = []
        for entry in sorted(os.listdir(rootdir)):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                # Must go through self: the bare name is not defined at
                # module level.
                _files.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                _files.append(path)
        return _files

    def _is_image(self, filename):
        """Return True when ``filename`` has one of ``IMAGE_EXTENSIONS``."""
        extension = os.path.splitext(filename)[1].lstrip('.').lower()
        return extension in self.IMAGE_EXTENSIONS

    def _read_image(self, path):
        """Decode ``path`` as a 3-channel BGR image, or return None on failure.

        The file is read with ``np.fromfile`` and decoded with
        ``cv2.imdecode`` (presumably chosen over ``cv2.imread`` so that
        non-ASCII paths work on Windows).
        """
        try:
            data = np.fromfile(path, dtype=np.uint8)
        except OSError:
            return None
        if data.size == 0:
            return None
        # imdecode itself returns None when the data is not a decodable image.
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    def _is_similar(self, img1, hashes1, img2):
        """Return True when any similarity score of the pair beats the threshold.

        ``hashes1`` holds the precomputed (aHash, dHash, pHash) of ``img1``.
        Scores are evaluated lazily, cheapest first.
        """
        threshold = self.SIMILARITY_THRESHOLD
        hashers = (self.aHash, self.dHash, self.pHash)
        for hasher, hash1 in zip(hashers, hashes1):
            if self.cmpHash(hash1, hasher(img2)) > threshold:
                return True
        if self.classify_hist_with_split(img1, img2) > threshold:
            return True
        return self.calculate(img1, img2) > threshold

    # ------------------------------------------------------------------
    # Main procedure
    # ------------------------------------------------------------------
    def mvPhoto(self):
        """Move every group of similar images below ``self.dir`` to ``<dir>_df``.

        Each image in turn acts as the "master" and is compared with all
        files that follow it.  Similar followers are moved immediately,
        prefixed with the master's base name; the master itself is moved
        last, and only if at least one follower matched.
        """
        dst_dir = self.dir + '_df'
        photoList = self.list_all_files(self.dir)

        for index, photo in enumerate(photoList):
            # The file may already have been moved as somebody's duplicate.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:' + fname)
            if not self._is_image(fname):
                continue
            img1 = self._read_image(photo)
            if img1 is None:
                continue

            master_stem = os.path.splitext(fname)[0]
            # The master's hashes do not change while scanning followers.
            hashes1 = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            move_master = False

            for other in photoList[index + 1:]:
                if not os.path.isfile(other):
                    continue
                sname = os.path.basename(other)
                if not self._is_image(sname):
                    continue
                img2 = self._read_image(other)
                if img2 is None:
                    continue
                if not self._is_similar(img1, hashes1, img2):
                    continue

                move_master = True
                print(' move file:' + other)
                print('ffname[0]:' + master_stem)
                self.mymovefile(other, dst_dir, master_stem + '_' + sname)

            # Move the master last so it stays readable during the scan.
            if move_master:
                self.mymovefile(photo, dst_dir, fname)


if __name__ == "__main__":
    # Scan the current working directory.
    root_dir = os.getcwd()
    duplicateFiles = DuplicateFiles(root_dir)
    duplicateFiles.mvPhoto()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
2014-04-21 cocos2d-x场景间参数传递