相似图片去重

import os,cv2
import numpy as np
from pathlib import Path
import json
import sys
import time

# Use forward slashes in these paths: they are later written as image URL paths
# into an html file, which requires '/' as the separator.
ds=['C:/dir1']
# Recognised picture file extensions (lowercase; matching elsewhere is exact,
# so '.JPG' would currently be skipped).
picSuffix=['.jpg','.jpeg','.png','.webp']

def aHash(img):
	"""Average hash: shrink to 8x8 grayscale, then emit a 64-char '0'/'1' string
	marking which pixels are brighter than the image's mean brightness."""
	img=cv2.resize(img,(8,8))
	gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
	# numpy mean replaces the original hand-rolled sum-of-64-pixels / 64
	avg=gray.mean()
	# row-major scan (i outer, j inner) matches the original nested loops
	return ''.join('1' if gray[i,j]>avg else '0' for i in range(8) for j in range(8))

def cmpHash(hash1,hash2):
	"""Return the Hamming distance between two equal-length hash strings.

	Returns -1 when the lengths differ (callers treat -1 as 'not comparable';
	note -1 also passes the `<=10` duplicate threshold, so equal lengths are
	expected in practice)."""
	if len(hash1)!=len(hash2):
		return -1
	# count of positions where the two bit strings disagree
	return sum(c1!=c2 for c1,c2 in zip(hash1,hash2))

def dHash(img):
	"""Difference hash: shrink to 9x8 grayscale, then emit a 64-char '0'/'1'
	string marking which pixels are brighter than their right-hand neighbour."""
	img=cv2.resize(img,(9,8))   # (width=9, height=8) -> gray.shape == (8, 9)
	gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
	# vectorised equivalent of comparing gray[i,j] > gray[i,j+1] for j in 0..7
	diff=gray[:,:-1]>gray[:,1:]
	# flatten() is row-major, matching the original i-outer / j-inner order
	return ''.join('1' if b else '0' for b in diff.flatten())

def pHash(img):
	"""Perceptual hash: DCT of a 32x32 grayscale image; the 8x8 low-frequency
	corner is thresholded against its own mean into a 64-char '0'/'1' string."""
	img=cv2.resize(img,(32,32))
	gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
	dct=cv2.dct(np.float32(gray))
	dct_roi=dct[0:8,0:8]   # low-frequency block carries the perceptual content
	average=np.mean(dct_roi)
	# flatten() is row-major, matching the original nested-loop order
	return ''.join('1' if v>average else '0' for v in dct_roi.flatten())

def strDuration(duration):
	"""Format a duration in seconds as a Chinese 'X分钟Y秒' string.

	`duration` is a float (a time.time() delta), so both parts are cast to int;
	the original left the minutes as a float and printed e.g. '2.0分钟'."""
	return f'{int(duration//60)}分钟{int(duration%60)}秒'

# First-time run: compute feature hashes for every picture in each directory of
# `ds`, store them in <dir>/picId.json, and log suspected duplicate pairs (two
# lines per pair) into need2delete.txt next to this script.
def build():
	summary={}
	# abspath keeps the chdir below working even when the script was launched
	# via a relative path (os.path.dirname('script.py') is '').
	scriptDir=os.path.dirname(os.path.abspath(__file__))
	# Bug fix: remove the stale result file ONCE, before the directory loop.
	# The original removed it inside the loop, discarding the pairs found for
	# every directory except the last one.
	if os.path.exists(os.path.join(scriptDir,'need2delete.txt')):
		os.remove(os.path.join(scriptDir,'need2delete.txt'))
	for d in ds:
		start=time.time()
		pics=[]
		for f in os.listdir(d):
			# bug fix: the original referenced the undefined name `picSuffixs`
			if os.path.splitext(f)[1] in picSuffix:
				pics.append(f)
			else:
				print(f'{f} will be abandoned')
		v=[0]*len(pics)  # v[j]==1 marks pics[j] as an already-found duplicate
		os.chdir(d)

		di={}
		for pic in pics:
			print(f'processing {pic}')
			content=cv2.imread(pic)
			di[pic]={'ahash':aHash(content),'dhash':dHash(content),'phash':pHash(content)}
		with open('picId.json','w',encoding='utf8') as f:
			json.dump(di,f,ensure_ascii=False)
		os.chdir(scriptDir)
		for i in range(len(pics)):
			if v[i]:
				continue
			for j in range(i+1,len(pics)):
				if v[j]:
					continue
				print(f'processing {pics[i]} and {pics[j]}')
				n1=cmpHash(di[pics[i]]['ahash'],di[pics[j]]['ahash'])
				n2=cmpHash(di[pics[i]]['dhash'],di[pics[j]]['dhash'])
				n3=cmpHash(di[pics[i]]['phash'],di[pics[j]]['phash'])
				# all three hash distances must be small to call it a duplicate
				if n1<=10 and n2<=10 and n3<=10:
					v[j]=1
					# write the pair: keeper first, duplicate second (delete()
					# removes the second of each pair)
					with open('need2delete.txt','a',encoding='utf8') as f:
						f.write(f'{os.path.join(d,pics[i])}\n{os.path.join(d,pics[j])}\n')
		summary[d]={'duration':time.time()-start,'picNum':len(pics)}
	for d in summary:
		print(f'在{d}目录下,处理了{summary[d]["picNum"]}张图片,花费{strDuration(summary[d]["duration"])}时间')

def show(imgn):
	"""Display the image file *imgn* in a window, scaled to a height of 480px
	with its aspect ratio preserved."""
	frame=cv2.imread(imgn)
	height,width=frame.shape[:2]
	scaled=cv2.resize(frame,(width*480//height,480))
	cv2.imshow(imgn,scaled)

def observe():
	"""Show each suspected-duplicate pair from need2delete.txt side by side;
	press any key to advance to the next pair."""
	with open('need2delete.txt',encoding='utf8') as f:
		# Bug fix: str.split('\n') always returns at least [''], so the
		# original's `len(pics)==0` branch could never fire; dropping blank
		# lines (including the trailing newline) restores it.
		pics=[line for line in f.read().split('\n') if line]
	if not pics:
		print('没有重复的图片')
		return
	# consume the list two file names at a time
	for i in range(0,len(pics)-1,2):
		show(pics[i])
		show(pics[i+1])
		print(f'showing {pics[i]} and {pics[i+1]}')
		cv2.waitKey(0)
		cv2.destroyAllWindows()

def delete():
	"""Delete the second picture of every pair listed in need2delete.txt and
	drop its entry from that directory's picId.json so the stored hashes stay
	in sync with the files on disk."""
	with open('need2delete.txt',encoding='utf8') as f:
		# ignore blank lines (the file ends with a trailing newline)
		pics=[line for line in f.read().split('\n') if line]
	gd={}  # directory path -> its (mutated) picId.json contents
	for i in range(0,len(pics)-1,2):
		target=pics[i+1]  # by convention the second name of each pair is the duplicate
		# Bug fix: a picture can appear as the duplicate in several pairs
		# (e.g. one new picture matching several old ones in update()); the
		# original crashed on the second occurrence. Skip already-removed files.
		if not os.path.exists(target):
			continue
		os.remove(target)
		dname=os.path.dirname(target)
		if dname not in gd:
			with open(os.path.join(dname,'picId.json'),'r',encoding='utf8') as f:
				gd[dname]=json.load(f)
		gd[dname].pop(os.path.basename(target),None)
	for k in gd:
		with open(os.path.join(k,'picId.json'),'w',encoding='utf8') as f:
			json.dump(gd[k],f,ensure_ascii=False)

# Incremental version of build(): hash only the pictures missing from
# <dir>/picId.json, compare each new picture against the previously stored
# ones, append suspected duplicates to need2delete.txt, then merge the new
# hashes into picId.json.
def update():
	summary={}
	# abspath keeps the chdir below working even for a relative script path
	scriptDir=os.path.dirname(os.path.abspath(__file__))
	# Bug fix: remove the stale result file once, before the directory loop,
	# so every directory's pairs accumulate (the original wiped it per
	# directory, keeping only the last directory's results).
	if os.path.exists(os.path.join(scriptDir,'need2delete.txt')):
		os.remove(os.path.join(scriptDir,'need2delete.txt'))
	for d in ds:
		start=time.time()
		pics=[]
		for f in os.listdir(d):
			# bug fix: the original referenced the undefined name `picSuffixs`
			if os.path.splitext(f)[1] in picSuffix:
				pics.append(f)
			else:
				print(f'{f} will be abandoned')
		os.chdir(d)
		with open('picId.json','r',encoding='utf8') as f:
			od=json.load(f)  # hashes stored by a previous build()/update()
		nd={}  # hashes of pictures that are new since the last run
		for pic in pics:
			if pic not in od:
				print(f'processing {pic}')
				content=cv2.imread(pic)
				nd[pic]={'ahash':aHash(content),'dhash':dHash(content),'phash':pHash(content)}
		os.chdir(scriptDir)
		# NOTE(review): only old-vs-new pairs are compared, exactly as in the
		# original; duplicates among the new pictures themselves go undetected.
		for odk in od:
			for ndk in nd:
				print(f'processing {odk} and {ndk}')
				n1=cmpHash(od[odk]['ahash'],nd[ndk]['ahash'])
				n2=cmpHash(od[odk]['dhash'],nd[ndk]['dhash'])
				n3=cmpHash(od[odk]['phash'],nd[ndk]['phash'])
				if n1<=10 and n2<=10 and n3<=10:
					# old picture first, new one second: delete() removes the second
					with open('need2delete.txt','a',encoding='utf8') as f:
						f.write(f'{os.path.join(d,odk)}\n{os.path.join(d,ndk)}\n')
		od.update(nd)
		os.chdir(d)
		with open('picId.json','w',encoding='utf8') as f:
			json.dump(od,f,ensure_ascii=False)
		# opn is counted after the merge, matching the original's ordering
		summary[d]={'duration':time.time()-start,'opn':len(od.keys()),'npn':len(nd.keys())}
	for d in summary:
		print(f'在{d}目录下,比对{summary[d]["opn"]}张老图片和{summary[d]["npn"]}张新图片的重复情况,用时{strDuration(summary[d]["duration"])}')

def test():
	"""Placeholder for ad-hoc experiments; reachable via the `test` CLI mode."""
	pass

if __name__=='__main__':
	# Dispatch table: mode name -> handler. An unknown or missing mode falls
	# through to the usage text. Bug fix: the original indexed sys.argv[1]
	# unconditionally and crashed with IndexError when run without arguments;
	# its help text also contained a stray bare program-name line.
	actions={'build':build,'observe':observe,'delete':delete,'update':update,'test':test}
	mode=sys.argv[1] if len(sys.argv)>1 else ''
	if mode in actions:
		actions[mode]()
	else:
		prog=os.path.basename(__file__)
		print(f'{prog} build,计算目录里的图片的特征值并存储为json格式,以及发现重复的图片\n'
			f'{prog} observe,查看程序认为重复的图片\n'
			f'{prog} delete,删除重复的图片,请先人眼再次核实是否重复\n'
			f'{prog} update,计算之前没计算的图片的特征值并发现和旧图片重复的新图片')

四种操作模式的用法说明,可以通过运行 python deduplicateSimilarPictures.py help(或任意未识别的参数)来查看。

build计算目录中的图片的特征值,检查相似度,以发现重复的图片。存储图片的特征值

observe查看build或update找出的相似的图片,查看的时候可以打开need2delete.txt文本文档,删去其实并不重复的两张图片的名字。

delete删去need2delete.txt里面两个图片名为一组的第2张图片,更新存储的图片特征值。

update找出目录里面新来的图片,跟以前计算好特征值的图片比较,找出重复的图片。 更新存储的图片的特征值。

计算出的图片特征值示例(picId.json 中的一条记录):"0b5c55b7149d7635ad4069825c5a81b6.jpeg": {"ahash": "0000111110111100010111100001110100001001000101110100011100000101", "dhash": "1010010100101011010010010100101000101100101000100101011010110010", "phash": "1000000011010100011100100010111100101001000111000000010001000001"},

参考链接:python OpenCV 图片相似度 5种算法

创建于2023.1.6/18.16

posted @ 2023-01-06 18:18  园糯  阅读(131)  评论(0编辑  收藏  举报