# 相似图片去重 (deduplicate similar pictures)
import os,cv2
import numpy as np
from pathlib import Path
import json
import sys
import time
# 请用斜杠,因为后面写到html文件的图片url路径需要用斜杠分隔
ds = ['C:/dir1']
# Image extensions that will be processed (with leading dot, lower case).
# build()/update() look this list up under the name `picSuffixs`, so define
# that name here and keep the original `picSuffix` as a backward-compatible alias.
picSuffixs = ['.jpg', '.jpeg', '.png', '.webp']
picSuffix = picSuffixs
def aHash(img):
    """Average hash (aHash).

    Downscale to 8x8, convert to grayscale, and emit a 64-character bit
    string: '1' where a pixel is brighter than the mean, else '0'
    (row-major order, matching the original nested loops).

    Uses a float mean over the whole array instead of a scalar running sum:
    accumulating uint8 pixels into a NumPy scalar can stay uint8 and wrap
    around at 255, corrupting the average.
    """
    img = cv2.resize(img, (8, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    avg = gray.mean()  # exact float mean of the 64 pixels, no overflow
    return ''.join('1' if px > avg else '0' for px in gray.flatten())
def cmpHash(hash1, hash2):
    """Hamming distance between two equal-length hash bit strings.

    Returns -1 when the lengths differ (sentinel kept for compatibility
    with existing callers), otherwise the count of differing positions.
    """
    if len(hash1) != len(hash2):
        return -1
    # zip pairs the strings position-by-position; True counts as 1 in sum().
    return sum(a != b for a, b in zip(hash1, hash2))
def dHash(img):
    """Difference hash (dHash).

    Downscale to width 9 x height 8, convert to grayscale, then compare
    each pixel to its right-hand neighbour: '1' when the left pixel is
    brighter. Yields 8 comparisons per row -> a 64-character bit string.
    """
    img = cv2.resize(img, (9, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Vectorized left>right comparison; 8x8 boolean array in row-major order.
    bits = gray[:, :-1] > gray[:, 1:]
    return ''.join('1' if b else '0' for b in bits.flatten())
def pHash(img):
    """Perceptual hash (pHash).

    Downscale to 32x32, convert to grayscale, take the 2-D DCT and keep
    the top-left 8x8 block of low-frequency coefficients; each output bit
    is '1' when the coefficient exceeds the block mean (row-major order).
    """
    img = cv2.resize(img, (32, 32))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    dct = cv2.dct(np.float32(gray))  # cv2.dct requires float input
    roi = dct[:8, :8]
    avg = np.mean(roi)
    return ''.join('1' if c > avg else '0' for c in roi.flatten())
def strDuration(duration):
    """Format a duration in seconds as a Chinese 'X分钟Y秒' string.

    `duration` is typically a float from time.time() differences; both
    parts are truncated to whole numbers. The original left the minutes
    as a float, printing e.g. '1.0分钟30秒'.
    """
    return f'{int(duration // 60)}分钟{int(duration % 60)}秒'
# The directory has never been deduplicated: compute every picture's feature
# hashes from scratch, persist them, and record candidate duplicate pairs.
def build():
    """For each directory in `ds`: hash every image, write the hashes to
    picId.json inside that directory, then compare all pairs; a pair whose
    aHash/dHash/pHash Hamming distances are all <= 10 is appended to
    need2delete.txt (created next to this script). Prints a summary.

    Fixes: the original read an undefined name `picSuffixs` (the constant
    is `picSuffix`); the always-empty `excluded` tuple was dead code.
    """
    summary = {}
    for d in ds:
        start = time.time()
        pics = []
        for name in os.listdir(d):
            if os.path.splitext(name)[1] in picSuffix:
                pics.append(name)
            else:
                print(f'{name} will be abandoned')
        is_dup = [0] * len(pics)  # 1 = already marked as someone's duplicate
        os.chdir(d)
        di = {}
        for pic in pics:
            print(f'processing {pic}')
            content = cv2.imread(pic)
            di[pic] = {'ahash': aHash(content), 'dhash': dHash(content), 'phash': pHash(content)}
        with open('picId.json', 'w', encoding='utf8') as f:
            json.dump(di, f, ensure_ascii=False)
        # need2delete.txt lives next to the script, not inside the image dir.
        os.chdir(os.path.dirname(__file__))
        if os.path.exists('need2delete.txt'):
            os.remove('need2delete.txt')
        for i in range(len(pics)):
            if is_dup[i]:
                continue
            for j in range(i + 1, len(pics)):
                if is_dup[j]:
                    continue
                print(f'processing {pics[i]} and {pics[j]}')
                n1 = cmpHash(di[pics[i]]['ahash'], di[pics[j]]['ahash'])
                n2 = cmpHash(di[pics[i]]['dhash'], di[pics[j]]['dhash'])
                n3 = cmpHash(di[pics[i]]['phash'], di[pics[j]]['phash'])
                # All three hash distances must agree before calling it a duplicate.
                if n1 <= 10 and n2 <= 10 and n3 <= 10:
                    is_dup[j] = 1
                    with open('need2delete.txt', 'a', encoding='utf8') as f:
                        f.write(f'{os.path.join(d, pics[i])}\n{os.path.join(d, pics[j])}\n')
        summary[d] = {'duration': time.time() - start, 'picNum': len(pics)}
    for d in summary:
        print(f'在{d}目录下,处理了{summary[d]["picNum"]}张图片,花费{strDuration(summary[d]["duration"])}时间')
def show(imgn):
    """Open a window titled with the file name, displaying the image
    scaled so its height is 480 pixels (width scaled proportionally)."""
    picture = cv2.imread(imgn)
    height = picture.shape[0]
    width = picture.shape[1]
    scaled = cv2.resize(picture, (width * 480 // height, 480))
    cv2.imshow(imgn, scaled)
def observe():
    """Display the candidate duplicate pairs recorded in need2delete.txt
    two at a time so a human can verify them; press any key to advance.
    False-positive pairs can be deleted from the file before delete().

    Fix: the original split the raw text on '\\n', so an empty file gave
    [''] and the "no duplicates" message was unreachable; blank lines
    (including the trailing newline) are now filtered out.
    """
    with open('need2delete.txt', encoding='utf8') as f:
        contents = f.read()
    pics = [line for line in contents.split('\n') if line]
    if len(pics) == 0:
        print('没有重复的图片')
        return
    for i in range(len(pics) // 2):
        show(pics[2 * i])
        show(pics[2 * i + 1])
        print(f'showing {pics[2 * i]} and {pics[2 * i + 1]}')
        cv2.waitKey(0)  # block until a key press, then close both windows
        cv2.destroyAllWindows()
def delete():
    """Delete the second picture of every pair listed in need2delete.txt
    and drop its entry from the picId.json of its directory. Run observe()
    first and hand-prune false-positive pairs from the file.

    Fixes: blank lines (e.g. the trailing newline) are skipped, and a file
    named as the victim of several pairs is only removed once — the
    original crashed on the second os.remove / dict pop.
    """
    with open('need2delete.txt', encoding='utf8') as f:
        lines = [line for line in f.read().split('\n') if line]
    cache = {}    # directory -> its picId.json contents, mutated in memory
    removed = set()  # victims already deleted, to survive repeated listings
    for i in range(len(lines) // 2):
        victim = lines[2 * i + 1]  # second name of each pair is the one deleted
        if victim in removed:
            continue
        removed.add(victim)
        os.remove(victim)
        folder = os.path.dirname(victim)
        if folder not in cache:
            with open(os.path.join(folder, 'picId.json'), 'r', encoding='utf8') as f:
                cache[folder] = json.load(f)
        cache[folder].pop(os.path.basename(victim), None)
    # Write every touched picId.json back once, after all deletions.
    for folder in cache:
        with open(os.path.join(folder, 'picId.json'), 'w', encoding='utf8') as f:
            json.dump(cache[folder], f, ensure_ascii=False)
def update():
    """Incremental version of build(): hash only the pictures not yet in a
    directory's picId.json, compare each new picture against every
    previously hashed one, append matching pairs to need2delete.txt, then
    merge the new hashes back into picId.json.

    Fix: the original read an undefined name `picSuffixs` (the constant
    is `picSuffix`).
    """
    summary = {}
    for d in ds:
        start = time.time()
        pics = []
        for name in os.listdir(d):
            if os.path.splitext(name)[1] in picSuffix:
                pics.append(name)
            else:
                print(f'{name} will be abandoned')
        os.chdir(d)
        with open('picId.json', 'r', encoding='utf8') as f:
            od = json.load(f)  # hashes of previously processed pictures
        nd = {}  # hashes of newly discovered pictures
        for pic in pics:
            if pic not in od:
                print(f'processing {pic}')
                content = cv2.imread(pic)
                nd[pic] = {'ahash': aHash(content), 'dhash': dHash(content), 'phash': pHash(content)}
        # need2delete.txt lives next to the script, not inside the image dir.
        os.chdir(os.path.dirname(__file__))
        if os.path.exists('need2delete.txt'):
            os.remove('need2delete.txt')
        for odk in od:
            for ndk in nd:
                print(f'processing {odk} and {ndk}')
                n1 = cmpHash(od[odk]['ahash'], nd[ndk]['ahash'])
                n2 = cmpHash(od[odk]['dhash'], nd[ndk]['dhash'])
                n3 = cmpHash(od[odk]['phash'], nd[ndk]['phash'])
                if n1 <= 10 and n2 <= 10 and n3 <= 10:
                    with open('need2delete.txt', 'a', encoding='utf8') as f:
                        f.write(f'{os.path.join(d, odk)}\n{os.path.join(d, ndk)}\n')
        od.update(nd)
        os.chdir(d)
        with open('picId.json', 'w', encoding='utf8') as f:
            json.dump(od, f, ensure_ascii=False)
        # NOTE: 'opn' is counted after the merge, as in the original.
        summary[d] = {'duration': time.time() - start, 'opn': len(od), 'npn': len(nd)}
    for d in summary:
        print(f'在{d}目录下,比对{summary[d]["opn"]}张老图片和{summary[d]["npn"]}张新图片的重复情况,用时{strDuration(summary[d]["duration"])}')
def test():
    """Scratch hook for ad-hoc experiments; currently a no-op."""
    return None
if __name__ == '__main__':
    # Dispatch table replaces the if/elif chain; unknown or missing mode
    # falls through to the usage text (the original raised IndexError when
    # run without arguments, and its help string contained a stray
    # '{basename}' fragment).
    actions = {'build': build, 'observe': observe, 'delete': delete, 'update': update, 'test': test}
    mode = sys.argv[1] if len(sys.argv) > 1 else ''
    if mode in actions:
        actions[mode]()
    else:
        prog = os.path.basename(__file__)
        print(f'{prog} build,计算目录里的图片的特征值并存储为json格式,以及发现重复的图片\n'
              f'{prog} observe,查看程序认为重复的图片\n'
              f'{prog} delete,删除重复的图片,请先人眼再次核实是否重复\n'
              f'{prog} update,计算之前没计算的图片的特征值并发现和旧图片重复的新图片')
# 用法:运行 `python deduplicateSimilarPictures.py help` 命令可查看四种操作模式的说明。
# build:计算目录中所有图片的特征值,检查相似度以发现重复的图片,并把特征值存储起来。
# observe:查看 build 或 update 找出的相似图片;查看时可打开 need2delete.txt 文本文档,删去其实并不重复的两张图片的名字。
# delete:删去 need2delete.txt 里面以两个图片名为一组的第 2 张图片,并更新存储的图片特征值。
# update:找出目录里新来的图片,跟以前已计算好特征值的图片比较,找出重复的图片,并更新存储的图片特征值。
# 存储的图片特征值示例:"0b5c55b7149d7635ad4069825c5a81b6.jpeg": {"ahash": "0000111110111100010111100001110100001001000101110100011100000101", "dhash": "1010010100101011010010010100101000101100101000100101011010110010", "phash": "1000000011010100011100100010111100101001000111000000010001000001"},
# 创建于 2023.1.6/18.16