import os
import cv2
import numpy as np
import json
import sys
import time

# Use forward slashes here: the image URL paths written into the HTML file
# later must be slash-separated.
ds = ['C:/dir1']
picSuffixes = ['.jpg', '.jpeg', '.png', '.webp']
def aHash(img):  # average hash: threshold each pixel against the 8x8 mean
    img = cv2.resize(img, (8, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    s = 0
    hashStr = ''
    for i in range(8):
        for j in range(8):
            s += int(gray[i, j])  # cast so the running sum cannot wrap around uint8
    avg = s / 64
    for i in range(8):
        for j in range(8):
            if gray[i, j] > avg:
                hashStr += '1'
            else:
                hashStr += '0'
    return hashStr
def cmpHash(hash1, hash2):  # Hamming distance between two hash strings
    if len(hash1) != len(hash2):
        return -1
    n = 0
    for i in range(len(hash1)):
        if hash1[i] != hash2[i]:
            n += 1
    return n
def dHash(img):  # difference hash: compare each pixel with its right neighbour
    img = cv2.resize(img, (9, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hashStr = ''
    for i in range(8):
        for j in range(8):
            if gray[i, j] > gray[i, j + 1]:
                hashStr += '1'
            else:
                hashStr += '0'
    return hashStr
def pHash(img):  # perceptual hash: threshold the low-frequency DCT coefficients
    img = cv2.resize(img, (32, 32))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    dct = cv2.dct(np.float32(gray))
    dct_roi = dct[0:8, 0:8]
    hashStr = ''
    average = np.mean(dct_roi)
    for i in range(dct_roi.shape[0]):
        for j in range(dct_roi.shape[1]):
            if dct_roi[i, j] > average:
                hashStr += '1'
            else:
                hashStr += '0'
    return hashStr
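
# A minimal sanity check of the hash pipeline (a sketch: 'a.jpg' and 'b.jpg'
# are hypothetical file names, not files shipped with this script). Two
# pictures are treated as duplicates when all three Hamming distances are small.
def demoCompare(p1='a.jpg', p2='b.jpg'):
    img1, img2 = cv2.imread(p1), cv2.imread(p2)
    for fn in (aHash, dHash, pHash):
        print(fn.__name__, cmpHash(fn(img1), fn(img2)))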
def strDuration(duration):
    return f'{int(duration // 60)} min {int(duration % 60)} s'

# The directory has not been deduplicated yet: compute every picture's
# fingerprints from scratch, store them, and record suspected duplicates.
def build():
    summary = {}
    for d in ds:
        start = time.time()
        pics = []
        for f in os.listdir(d):
            if os.path.splitext(f)[1] in picSuffixes:
                pics.append(f)
            else:
                print(f'{f} will be skipped')
        v = [0] * len(pics)  # v[j] == 1 marks a picture already judged a duplicate
        os.chdir(d)
        di = {}
        for pic in pics:
            print(f'processing {pic}')
            content = cv2.imread(pic)
            di[pic] = {'ahash': aHash(content), 'dhash': dHash(content), 'phash': pHash(content)}
        with open('picId.json', 'w', encoding='utf8') as f:
            json.dump(di, f, ensure_ascii=False)
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
        if os.path.exists('need2delete.txt'):
            os.remove('need2delete.txt')
        excluded = set()  # (i, j) pairs already known to be false positives
        for i in range(len(pics)):
            if v[i]:
                continue
            for j in range(i + 1, len(pics)):
                if v[j] or (i, j) in excluded:
                    continue
                print(f'processing {pics[i]} and {pics[j]}')
                n1 = cmpHash(di[pics[i]]['ahash'], di[pics[j]]['ahash'])
                n2 = cmpHash(di[pics[i]]['dhash'], di[pics[j]]['dhash'])
                n3 = cmpHash(di[pics[i]]['phash'], di[pics[j]]['phash'])
                if n1 <= 10 and n2 <= 10 and n3 <= 10:
                    v[j] = 1
                    # os.remove(os.path.join(d, pics[j]))
                    with open('need2delete.txt', 'a', encoding='utf8') as f:
                        f.write(f'{os.path.join(d, pics[i])}\n{os.path.join(d, pics[j])}\n')
        summary[d] = {'duration': time.time() - start, 'picNum': len(pics)}
    for d in summary:
        print(f'Processed {summary[d]["picNum"]} pictures under {d} in {strDuration(summary[d]["duration"])}')
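
# build() compares every pair of pictures, which is O(n^2). A sketch of a
# coarse speed-up, under the assumption that near-duplicates usually agree on
# the first 16 bits of their aHash: bucket by that prefix and only compare
# within a bucket. This trades recall for speed, since pairs that differ
# inside the prefix are missed.
def bucketByPrefix(di, bits=16):
    buckets = {}
    for name, hashes in di.items():
        buckets.setdefault(hashes['ahash'][:bits], []).append(name)
    return buckets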
def show(imgn):
    img = cv2.imread(imgn)
    h, w = img.shape[0], img.shape[1]
    img = cv2.resize(img, (w * 480 // h, 480))  # scale to a fixed height of 480
    cv2.imshow(imgn, img)
def observe():
    with open('need2delete.txt', encoding='utf8') as f:
        contents = f.read()
    if not contents.strip():
        print('No duplicate pictures found')
        return
    pics = contents.split('\n')
    for i in range(len(pics) // 2):
        show(pics[2 * i])
        show(pics[2 * i + 1])
        print(f'showing {pics[2 * i]} and {pics[2 * i + 1]}')
        cv2.waitKey(0)
        cv2.destroyAllWindows()
def delete():
    with open('need2delete.txt', encoding='utf8') as f:
        contents = f.read()
    pics = contents.split('\n')
    gd = {}  # per-directory picId.json contents, keyed by directory
    for i in range(len(pics) // 2):
        dup = pics[2 * i + 1]  # the second picture of each pair is the duplicate
        os.remove(dup)
        if os.path.dirname(dup) not in gd:
            with open(os.path.join(os.path.dirname(dup), 'picId.json'), 'r', encoding='utf8') as f:
                gd[os.path.dirname(dup)] = json.load(f)
        gd[os.path.dirname(dup)].pop(os.path.basename(dup))
    for k in gd.keys():
        with open(os.path.join(k, 'picId.json'), 'w', encoding='utf8') as f:
            json.dump(gd[k], f, ensure_ascii=False)
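
# After update(), the same new picture can be written as the duplicate of
# several old ones, so delete() may call os.remove() on it twice and raise
# FileNotFoundError. A sketch of a tolerant removal loop (assumption:
# silently skipping already-handled paths is acceptable here):
def safeDeleteDuplicates(pics):
    seen = set()
    for i in range(len(pics) // 2):
        dup = pics[2 * i + 1]
        if dup not in seen and os.path.exists(dup):
            os.remove(dup)
        seen.add(dup)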
def update():
    summary = {}
    for d in ds:
        start = time.time()
        pics = []
        for f in os.listdir(d):
            if os.path.splitext(f)[1] in picSuffixes:
                pics.append(f)
            else:
                print(f'{f} will be skipped')
        os.chdir(d)
        with open('picId.json', 'r', encoding='utf8') as f:
            od = json.load(f)  # old, already-deduplicated pictures
        nd = {}  # pictures added since the last run
        for pic in pics:
            if pic not in od.keys():
                print(f'processing {pic}')
                content = cv2.imread(pic)
                nd[pic] = {'ahash': aHash(content), 'dhash': dHash(content), 'phash': pHash(content)}
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
        if os.path.exists('need2delete.txt'):
            os.remove('need2delete.txt')
        for odk in od.keys():
            for ndk in nd.keys():
                print(f'processing {odk} and {ndk}')
                n1 = cmpHash(od[odk]['ahash'], nd[ndk]['ahash'])
                n2 = cmpHash(od[odk]['dhash'], nd[ndk]['dhash'])
                n3 = cmpHash(od[odk]['phash'], nd[ndk]['phash'])
                if n1 <= 10 and n2 <= 10 and n3 <= 10:
                    with open('need2delete.txt', 'a', encoding='utf8') as f:
                        f.write(f'{os.path.join(d, odk)}\n{os.path.join(d, ndk)}\n')
        opn, npn = len(od), len(nd)  # capture the counts before merging
        od.update(nd)
        os.chdir(d)
        with open('picId.json', 'w', encoding='utf8') as f:
            json.dump(od, f, ensure_ascii=False)
        summary[d] = {'duration': time.time() - start, 'opn': opn, 'npn': npn}
    for d in summary:
        print(f'Under {d}, compared {summary[d]["opn"]} old pictures against {summary[d]["npn"]} new ones in {strDuration(summary[d]["duration"])}')
def test():
    pass

if __name__ == '__main__':
    name = os.path.basename(__file__)
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    if cmd == 'build':
        build()
    elif cmd == 'observe':
        observe()
    elif cmd == 'delete':
        delete()
    elif cmd == 'update':
        update()
    elif cmd == 'test':
        test()
    else:
        print(f'{name} build: fingerprint every picture in the configured directories, store the fingerprints as JSON, and detect duplicates\n'
              f'{name} observe: review the pictures the program considers duplicates\n'
              f'{name} delete: delete the duplicate pictures; verify each pair by eye first\n'
              f'{name} update: fingerprint pictures added since the last run and find new pictures that duplicate old ones')
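
# Example session (assuming this file is saved as dedupe.py; the name is
# hypothetical):
#   python dedupe.py build     # first pass over the directories in ds
#   python dedupe.py observe   # eyeball each suspected pair
#   python dedupe.py delete    # remove the second picture of each pair
#   python dedupe.py update    # after new pictures have been added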