Python用哈希算法查找相似图片(包括不同分辨率,不同大小,不同格式的图片)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# -*- coding: utf-8 -*-
'''
Python用哈希算法查找相似图片并放入[_df]的文件夹中
 
相似图片包括不同分辨率,不同大小,不同格式,只要图片相似就会算重复文件
 
 
安装cv2
pip install opencv-python
 
'''
import os
import cv2
import numpy as np
import shutil
import random
 
class DuplicateFiles(object):
    """Find visually similar images below a directory and move them aside.

    Every image is compared with every later image using three perceptual
    hashes (average, difference, DCT based) and two histogram overlap
    scores.  When any score exceeds the threshold, both files are moved to
    the folder named ``<dir>_df``.
    """

    # Root directory that is scanned (overwritten per instance in __init__).
    dir = ''

    # Lower-cased extensions (without the dot) that are treated as images.
    IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'jfif', 'bmp', 'png', 'gif'})

    def __init__(self, dir):
        """Remember the directory whose images will be compared."""
        self.dir = dir

    @staticmethod
    def _to_gray(img):
        """Return *img* as a single-channel image (2-D input is passed through)."""
        if img.ndim == 2:
            return img
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    @staticmethod
    def _bits_to_str(bits):
        """Flatten a boolean array row by row into a string of '1'/'0'."""
        return ''.join('1' if bit else '0' for bit in np.asarray(bits).ravel())

    @staticmethod
    def _split_name(fname):
        """Split a file name into (stem, lower-cased extension without dot)."""
        stem, ext = os.path.splitext(fname)
        return stem, ext[1:].lower()

    @staticmethod
    def _read_image(path):
        """Decode the file at *path* as a BGR image, or return None.

        The bytes are read with NumPy and decoded in memory instead of using
        cv2.imread (presumably so that non-ASCII paths work on Windows).
        None is returned for empty or undecodable files.
        """
        data = np.fromfile(path, dtype=np.uint8)
        if data.size == 0:
            return None
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    @staticmethod
    def _hist_overlap(hist1, hist2):
        """Return the mean per-bin overlap of two equally sized histograms.

        A bin scores ``1 - |a - b| / max(a, b)``; two identical bins
        (including two empty ones) score 1.
        """
        h1 = np.asarray(hist1, dtype=np.float64).ravel()
        h2 = np.asarray(hist2, dtype=np.float64).ravel()
        if h1.size == 0:
            return 0.0
        peak = np.maximum(h1, h2)
        overlap = np.ones_like(h1)
        filled = peak > 0  # bins where both counts are 0 keep the score 1
        overlap[filled] = 1.0 - np.abs(h1 - h2)[filled] / peak[filled]
        return float(overlap.mean())

    # Average hash
    def aHash(self, img, shape=(10, 10)):
        """Return the average hash of *img* as a string of '1'/'0'.

        The image is resized to ``shape`` (width, height) and converted to
        gray; each pixel brighter than the mean gray level yields '1'.
        """
        gray = self._to_gray(cv2.resize(img, shape))
        # Mean over the real pixel count; summing uint8 scalars by hand can
        # wrap around and the divisor must follow ``shape``.
        return self._bits_to_str(gray > gray.mean())

    # Difference hash
    def dHash(self, img, shape=(10, 10)):
        """Return the difference hash of *img* as a string of '1'/'0'.

        The image is resized to one extra column so that every row provides
        ``shape[0]`` left/right neighbour pairs; a pair yields '1' when the
        left pixel is brighter than the right one.
        """
        gray = self._to_gray(cv2.resize(img, (shape[0] + 1, shape[1])))
        return self._bits_to_str(gray[:, :-1] > gray[:, 1:])

    # Perceptual hash (pHash)
    def pHash(self, img, shape=(10, 10)):
        """Return the DCT based hash of *img* as a list of 0/1 integers.

        The image is resized to 32x32, converted to gray and transformed
        with a DCT; the top-left ``shape`` (width, height) block of
        coefficients is thresholded at its own mean.
        """
        gray = self._to_gray(cv2.resize(img, (32, 32)))
        # cv2.dct needs floating point input.
        dct = cv2.dct(np.float32(gray))
        dct_roi = dct[0:shape[1], 0:shape[0]]
        average = np.mean(dct_roi)
        return (dct_roi > average).astype(int).ravel().tolist()

    # Similarity from the histogram of every colour channel
    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Return the mean histogram overlap over the channels of two images.

        Both images are resized to ``size`` and split into channels; the
        per-channel scores of ``calculate`` are averaged.
        """
        image1 = cv2.resize(image1, size)
        image2 = cv2.resize(image2, size)
        scores = [self.calculate(ch1, ch2)
                  for ch1, ch2 in zip(cv2.split(image1), cv2.split(image2))]
        if not scores:
            return 0.0
        # Divide by the real channel count instead of assuming three.
        return sum(scores) / len(scores)

    # Histogram similarity of a single channel
    def calculate(self, image1, image2):
        """Return the histogram overlap (0..1) of channel 0 of two images."""
        # The upper range bound of calcHist is exclusive, so 256 is needed
        # to count pixels with the value 255.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0.0, 256.0])
        hist2 = cv2.calcHist([image2], [0], None, [256], [0.0, 256.0])
        return self._hist_overlap(hist1, hist2)

    # Hash comparison
    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Return the fraction of equal positions in two hashes.

        Returns -1 when the hashes differ in length.  ``shape`` is accepted
        for backward compatibility only; the ratio is taken over the real
        hash length.
        """
        if len(hash1) != len(hash2):
            return -1
        if not hash1:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        return matches / len(hash1)

    def mymovefile(self, srcfile, dstpath, ffname):
        """Move *srcfile* into the directory *dstpath*.

        The file is stored as *ffname* when that is non-empty, otherwise
        under its current name.  The directory is created when missing.  A
        message is printed and nothing happens when *srcfile* is not a file.
        """
        if not os.path.isfile(srcfile):
            print ("%s not exist!"%(srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        # join() works whether or not dstpath ends with a separator.
        shutil.move(srcfile, os.path.join(dstpath, fname))

    def list_all_files(self, rootdir):
        """Return the paths of all files below *rootdir*, recursively."""
        found = []
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                # Recurse through self; the bare function name is undefined.
                found.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                found.append(path)
        return found

    def _is_similar(self, img1, hashes1, img2, threshold):
        """Return True when any similarity score of the pair exceeds *threshold*.

        *hashes1* holds the precomputed (aHash, dHash, pHash) of *img1*.
        """
        a_hash, d_hash, p_hash = hashes1
        return bool(
            self.cmpHash(a_hash, self.aHash(img2)) > threshold
            or self.cmpHash(d_hash, self.dHash(img2)) > threshold
            or self.cmpHash(p_hash, self.pHash(img2)) > threshold
            or self.classify_hist_with_split(img1, img2) > threshold
            or self.calculate(img1, img2) > threshold
        )

    # Process the files
    def mvPhoto(self, threshold=0.90):
        """Move every group of similar images into ``self.dir + '_df'``.

        Each image (the master) is compared with all later files.  A match
        is moved and renamed to ``<master stem>_<own name>``; a master that
        had at least one match is moved last under its own name.  Files
        that are not images, or that cannot be decoded, are skipped.
        """
        dst_dir = self.dir + '_df'
        photoList = self.list_all_files(self.dir)

        for i, photo in enumerate(photoList):
            # Already moved as the duplicate of an earlier master.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:'+fname)
            stem, ext = self._split_name(fname)
            if ext not in self.IMAGE_EXTENSIONS:
                continue
            img1 = self._read_image(photo)
            if img1 is None:
                continue
            # Hash the master once instead of once per candidate.
            master_hashes = (self.aHash(img1), self.dHash(img1), self.pHash(img1))

            moved_any = False
            for candidate in photoList[i + 1:]:
                if not os.path.isfile(candidate):
                    continue
                sname = os.path.basename(candidate)
                if self._split_name(sname)[1] not in self.IMAGE_EXTENSIONS:
                    continue
                img2 = self._read_image(candidate)
                if img2 is None:
                    continue
                if self._is_similar(img1, master_hashes, img2, threshold):
                    moved_any = True
                    print('    move file:'+candidate)
                    print('ffname[0]:'+stem)
                    self.mymovefile(candidate, dst_dir, stem+'_'+sname)

            # Finally move the master itself.
            if moved_any:
                self.mymovefile(photo, dst_dir, fname)
 
if __name__ == "__main__":
    # Scan the current working directory; assign another path to `dir`
    # here to process a fixed folder instead.
    dir = os.getcwd()
    DuplicateFiles(dir).mvPhoto()

  

posted @   卡卡之海  阅读(228)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
历史上的今天:
2014-04-21 cocos2d-x场景间参数传递
点击右上角即可分享
微信分享提示