SimHash去重

今天学习了网页去重的一些方法，其中的SimHash算法我尤为关注，下面详细介绍一下这个算法。

首先，介绍一下SimHash:如果两个相似文档的语义指纹只相差几个位或更少，这样的语义指纹叫做SimHash

计算海明距离的两种方法：

1，逐位比较两个指纹，统计取值不同的位数

2，两个长整型异或后，计算结果中1的个数

取得每个特征的64位hash值

/**
 * Computes the Hamming distance between two 64-bit fingerprints.
 *
 * <p>The XOR of the two values has a 1 in every position where they differ,
 * so the distance is the number of set bits in the XOR. The JDK's
 * {@link Long#bitCount(long)} is used for the population count, so no
 * external bit-utility class is required.
 *
 * @param l1 first fingerprint
 * @param l2 second fingerprint
 * @return number of bit positions (0..64) in which {@code l1} and {@code l2} differ
 */
public static int hamming(long l1, long l2) {
    return Long.bitCount(l1 ^ l2);
}

SimHash计算过程：

初始化长度为64位的向量，该向量的每个维度都是0

循环处理：取每个特征的64位hash值，如果这个hash值的第i位是1，则将向量的第i个数加上特征权重，反之，如果为0，则减去相应的权重

完成所有特征的处理，向量中某些数为正，某些数为负，正数对应的位为1，负数为0，得到最终64位的SimHash

在写入文件过程中，可以把SimHash值使用差分编码进行压缩后保存，下面是一个简单实现代码

View Code

 1 package com.test;
 2 
 3 import java.io.BufferedInputStream;
 4 import java.io.BufferedOutputStream;
 5 import java.io.DataInputStream;
 6 import java.io.FileInputStream;
 7 import java.io.FileNotFoundException;
 8 import java.io.FileOutputStream;
 9 import java.io.IOException;
10 
/**
 * Delta-compresses an array of 64-bit SimHash values into a file and reads it
 * back.
 *
 * <p>File layout: the first value is stored as 8 big-endian bytes; every
 * following value is stored as the difference to its predecessor, encoded as
 * a variable-length integer (7 payload bits per byte, least-significant group
 * first, high bit set on every byte except the last).
 */
public class DetaCompress {

    /**
     * Converts a long into its 8-byte big-endian representation.
     *
     * @param n value to convert
     * @return a new 8-element array, most significant byte first
     */
    public static byte[] longToBytes(long n) {
        byte[] buf = new byte[8];
        for (int i = buf.length - 1; i >= 0; i--) {
            buf[i] = (byte) (n & 0xFF); // lowest 8 bits go to the current slot
            n >>>= 8;
        }
        return buf;
    }

    /**
     * Writes a long in variable-length form.
     *
     * <p>Negative values are treated as unsigned 64-bit quantities (the shift
     * is unsigned), so they always occupy 10 bytes.
     *
     * @param i   value to write
     * @param dos destination stream
     * @throws IOException if the underlying stream fails
     */
    public static void writeVLong(long i, BufferedOutputStream dos)
            throws IOException {
        while ((i & ~0x7FL) != 0) {
            // 7 payload bits plus the continuation flag
            dos.write((byte) ((i & 0x7F) | 0x80));
            i >>>= 7;
        }
        dos.write((byte) i); // final group, continuation flag clear
    }

    /**
     * Reads a long previously written by
     * {@link #writeVLong(long, BufferedOutputStream)}.
     *
     * <p>The value is accumulated in a {@code long}; accumulating in an
     * {@code int} would silently corrupt every value wider than 31 bits.
     * Truncated input surfaces as the {@code EOFException} thrown by
     * {@code readByte()} rather than as an endless loop.
     *
     * @param dis source stream
     * @return the decoded value
     * @throws IOException if the stream ends prematurely, fails, or the
     *                     encoding is longer than the 10 bytes a 64-bit value
     *                     can need
     */
    static long readVLong(DataInputStream dis) throws IOException {
        byte b = dis.readByte();
        long value = b & 0x7FL;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            if (shift > 63) {
                throw new IOException(
                        "Malformed variable-length long: more than 10 bytes");
            }
            b = dis.readByte();
            value |= (b & 0x7FL) << shift; // each later group is 7 bits higher
        }
        return value;
    }

    /**
     * Writes {@code simHashSet} to {@code fileName} using delta encoding.
     *
     * <p>An empty array produces an empty file. I/O errors are reported on
     * standard error and signalled by a return value of 0.
     *
     * @param simHashSet values to store
     * @param fileName   destination file (overwritten)
     * @return number of values written, or 0 if an I/O error occurred
     */
    static int write(long[] simHashSet, String fileName) {
        int written = 0;
        try {
            BufferedOutputStream dos = new BufferedOutputStream(
                    new FileOutputStream(fileName));
            try {
                if (simHashSet.length > 0) {
                    // first value verbatim, as 8 big-endian bytes
                    dos.write(longToBytes(simHashSet[0]));
                    for (int i = 1; i < simHashSet.length; i++) {
                        // remaining values as difference to the predecessor
                        writeVLong(simHashSet[i] - simHashSet[i - 1], dos);
                    }
                }
            } finally {
                dos.close(); // always release the file handle
            }
            written = simHashSet.length;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return written;
    }

    /**
     * Reads {@code len} delta-encoded values from {@code fileName}.
     *
     * @param len      number of values stored in the file; 0 yields an empty
     *                 array without touching the file
     * @param fileName source file
     * @return the decoded values, or {@code null} if an I/O error occurred
     * @throws NegativeArraySizeException if {@code len} is negative
     */
    static long[] read(int len, String fileName) {
        if (len == 0) {
            return new long[0];
        }
        long[] simHashSet = new long[len];
        try {
            DataInputStream dis = new DataInputStream(new BufferedInputStream(
                    new FileInputStream(fileName)));
            try {
                simHashSet[0] = dis.readLong(); // first value is stored verbatim
                for (int i = 1; i < len; i++) {
                    // stored delta plus predecessor restores the original value
                    simHashSet[i] = readVLong(dis) + simHashSet[i - 1];
                }
            } finally {
                dis.close(); // always release the file handle
            }
            return simHashSet;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}

排重的总体思想是：

先把要检索的f 位指纹集合缩小，将集合f位划分几块，

精确匹配高d'位，候选集合的容量由|S|缩小为约|S|/2^d'

然后在小集合中检索f-d'位的海明距离

下面是实现的例子：

View Code

  1 package com.lietu.simhash;
  2 
  3 import java.io.BufferedReader;
  4 import java.io.BufferedWriter;
  5 import java.io.File;
  6 import java.io.FileInputStream;
  7 import java.io.FileNotFoundException;
  8 import java.io.FileOutputStream;
  9 import java.io.FileWriter;
 10 import java.io.IOException;
 11 import java.io.InputStream;
 12 import java.io.InputStreamReader;
 13 import java.io.OutputStream;
 14 import java.io.OutputStreamWriter;
 15 import java.io.UnsupportedEncodingException;
 16 import java.util.ArrayList;
 17 import java.util.Collections;
 18 import java.util.Comparator;
 19 import java.util.HashMap;
 20 import java.util.HashSet;
 21 import java.util.Iterator;
 22 import java.util.StringTokenizer;
 23 import java.util.Map.Entry;
 24 
 25 /**
 26  * 64位分四块，最多找出有3位差别的simhash
 27  * 
 28  * @author lg
 29  * 
 30  */
 31 // TODO: 保存排序后的中间状态
 32 public class SimHashSet4 implements Iterable<SimHashData> {
 33     ArrayList<SimHashData> t1 = new ArrayList<SimHashData>();
 34     ArrayList<SimHashData> t2 = new ArrayList<SimHashData>();
 35     ArrayList<SimHashData> t3 = new ArrayList<SimHashData>();
 36     ArrayList<SimHashData> t4 = new ArrayList<SimHashData>();
 37 
 38     public ArrayList<SimHashData> getT1(){
 39         return t1;
 40     }
 41     static Comparator<SimHashData> comp = new Comparator<SimHashData>() {
 42         public int compare(SimHashData o1, SimHashData o2) {
 43             if (o1.q == o2.q)
 44                 return 0;
 45             return (isLessThanUnsigned(o1.q, o2.q)) ? 1 : -1;
 46         }
 47     }; // 比较无符号64位
 48     static Comparator<Long> compHigh = new Comparator<Long>() {
 49         public int compare(Long o1, Long o2) {
 50             o1 |= 0xFFFFFFFFFFFFL;
 51             o2 |= 0xFFFFFFFFFFFFL;
 52             // System.out.println(Long.toBinaryString(o1));
 53             // System.out.println(Long.toBinaryString(o2));
 54             // System.out.println((o1 == o2));
 55             if (o1.equals(o2))
 56                 return 0;
 57             return (isLessThanUnsigned(o1, o2)) ? 1 : -1;
 58         }
 59     }; // 比较无符号64位中的高16位
 60 
 61     public void load(String fileName) {
 62         String line = null;
 63 
 64         try {
 65             InputStream is = new FileInputStream(new File(fileName));
 66 
 67             BufferedReader br = new BufferedReader(new InputStreamReader(is));
 68 
 69             while ((line = br.readLine()) != null) {
 70                 addSimHash(line.trim());
 71             }
 72             br.close();
 73 
 74         } catch (FileNotFoundException e) {
 75             e.printStackTrace();
 76         } catch (UnsupportedEncodingException e) {
 77             e.printStackTrace();
 78         } catch (IOException e) {
 79             e.printStackTrace();
 80         }
 81     }
 82 
 83     public static boolean isLessThanUnsigned(long n1, long n2) {
 84         return (n1 < n2) ^ ((n1 < 0) != (n2 < 0));
 85     }
 86 
 87     public void sort() {
 88         t2.clear();
 89         t3.clear();
 90         t4.clear();
 91         for (SimHashData simHash : t1) 
 92         {
 93             long t = Long.rotateLeft(simHash.q, 16);
 94             t2.add(new SimHashData(t, simHash.no));
 95 
 96             t = Long.rotateLeft(t, 16);
 97             t3.add(new SimHashData(t, simHash.no));
 98 
 99             t = Long.rotateLeft(t, 16);
100             t4.add(new SimHashData(t, simHash.no));
101         }
102 
103         Collections.sort(t1, comp);
104         Collections.sort(t2, comp);
105         Collections.sort(t3, comp);
106         Collections.sort(t4, comp);
107     }
108 
109     public boolean contains(SimHashData key) {
110         int low = 0;
111         int high = t1.size() - 1;
112 
113         while (low <= high) {
114             int mid = (low + high) >>> 1;
115             SimHashData midVal = t1.get(mid);
116             int cmp = comp.compare(midVal, key);
117 
118             if (cmp < 0)
119                 low = mid + 1;
120             else if (cmp > 0)
121                 high = mid - 1;
122             else
123                 return true; // key found
124         }
125         return false; // key not found
126     }
127 
128     /**
129      * probe exact match
130      * 
131      * @param t
132      * @return
133      */
134     public Span probe(ArrayList<SimHashData> t, long key) {
135         // System.out.println("t:"+t.size());
136         int low = 0;
137         int high = t.size() - 1;
138 
139         while (low <= high) {
140             int mid = (low + high) >>> 1;
141             Long midVal = t.get(mid).q;
142             int cmp = compHigh.compare(midVal, key);
143 
144             if (cmp < 0)
145                 low = mid + 1;
146             else if (cmp > 0)
147                 high = mid - 1;
148             else {
149                 // key found
150                 int matchStart = mid;
151                 int matchEnd = mid;
152                 while (matchStart > 0) {
153                     midVal = t.get(matchStart - 1).q;
154                     if (compHigh.compare(midVal, key) == 0) {
155                         --matchStart;
156                     } else {
157                         break;
158                     }
159                 }
160 
161                 while (matchEnd < (t.size() - 1)) {
162                     midVal = t.get(matchEnd + 1).q;
163                     if (compHigh.compare(midVal, key) == 0) {
164                         ++matchEnd;
165                     } else {
166                         break;
167                     }
168                 }
169                 return new Span(matchStart, matchEnd);
170             }
171         }
172         return null; // key not found
173     }
174 
175     /**
176      * get most 3 bit difference.
177      * 
178      * @param fingerPrint
179      * @param k
180      * @return
181      */
182     public HashSet<SimHashData> getSimSet(long fingerPrint, int k) {
183 
184         HashSet<SimHashData> retAll = new HashSet<SimHashData>();
185         Span s1 = probe(t1, fingerPrint);
186         if (s1 != null) {
187             // System.out.println("s1:"+s1);
188             ArrayList<SimHashData> ret1 = getSim(t1, s1, fingerPrint, k);
189             retAll.addAll(ret1);
190         }
191         long q2 = Long.rotateLeft(fingerPrint, 16);
192         Span s2 = probe(t2, q2);
193         if (s2 != null) {
194             // System.out.println("s2:"+s2);
195             ArrayList<SimHashData> ret2 = getSim(t2, s2, q2, k);
196             // rotateRight(ret2, 16);
197             retAll.addAll(ret2);
198         }
199 
200         long q3 = Long.rotateLeft(q2, 16);
201         Span s3 = probe(t3, q3);
202         if (s3 != null) {
203             // System.out.println("s3:"+s3);
204             ArrayList<SimHashData> ret3 = getSim(t3, s3, q3, k);
205             // rotateRight(ret3, 32);
206             retAll.addAll(ret3);
207         }
208 
209         long q4 = Long.rotateLeft(q3, 16);
210         Span s4 = probe(t4, q4);
211         if (s4 != null) {
212         //    System.out.println("s4:" + s4);
213             ArrayList<SimHashData> ret4 = getSim(t4, s4, q4, k);
214             // rotateRight(ret4, 48);
215             retAll.addAll(ret4);
216         }
217         // System.out.println("o:"+Long.toBinaryString(fingerPrint));
218         return retAll;
219     }
220 
221     /**
222      * 从Span找出部分相等的，取出最多差k位的
223      * 
224      * @param t
225      * @param s
226      * @param fingerPrint
227      * @param k
228      * @return
229      */
230     public ArrayList<SimHashData> getSim(ArrayList<SimHashData> t, Span s,
231             long fingerPrint, int k) {
232         ArrayList<SimHashData> result = new ArrayList<SimHashData>();
233 
234         for (int i = s.getStart(); i <= s.getEnd(); ++i) {
235             SimHashData data = t.get(i);
236             long q = data.q;
237             if (BitUtil.diffIn(fingerPrint, q, k)) {
238                 result.add(data);
239             }
240         }
241 
242         return result;
243     }
244 
245     public void addSimHash(String line) {
246         StringTokenizer st = new StringTokenizer(line, ":");
247         String key = st.nextToken();
248         long t = BitUtil.decodeLong(key);
249         long no = Long.parseLong(st.nextToken());
250         // Long.parseLong(key,2);
251         // System.out.println(t);
252         t1.add(new SimHashData(t, no));
253     }
254     
255     public void addSimHash(SimHashData key) {
256         t1.add(key);
257     }
258 
259     public void addInc(String key) {
260         long t = BitUtil.decodeLong(key);
261         // Long.parseLong(key,2);
262         // System.out.println(t);
263         SimHashData element = new SimHashData(t);
264         int insertionPoint = findInsertionPoint(t1, element);
265         t1.add(insertionPoint, element);
266 
267         long q2 = Long.rotateLeft(t, 16);
268         element = new SimHashData(q2);
269         insertionPoint = findInsertionPoint(t2, element);
270         t2.add(insertionPoint, element);
271 
272         long q3 = Long.rotateLeft(q2, 16);
273         element = new SimHashData(q3);
274         insertionPoint = findInsertionPoint(t3, element);
275         t3.add(insertionPoint, element);
276 
277         long q4 = Long.rotateLeft(q3, 16);
278         element = new SimHashData(q4);
279         insertionPoint = findInsertionPoint(t4, element);
280         t4.add(insertionPoint, element);
281     }
282 
283     /**
284      * Find the insertion point for the argument in a sorted list.
285      * 
286      * @param element
287      *            find this object's insertion point in the sorted list
288      * @return the index of the insertion point
289      */
290     int findInsertionPoint(ArrayList<SimHashData> list, SimHashData element) {
291         // Find the new element's insertion point.
292         int insertionPoint = Collections.binarySearch(list, element, comp);
293         if (insertionPoint < 0) {
294             insertionPoint = -(insertionPoint + 1);
295         }
296         return insertionPoint;
297     }
298 
299     public Iterator<SimHashData> iterator() {
300         return t1.iterator();
301     }
302 
303     public void save(String fileName) {
304         BufferedWriter writer;
305         try {
306             writer = new BufferedWriter(new FileWriter(fileName));
307             for (SimHashData simhash : t1) {
308                 //String str=BitUtil.encodeLong(simhash.q).substring(8);
309                 String str=BitUtil.encodeLong(simhash.q);
310                 writer.write(str);
311 //                writer.write(simhash.q+"");
312                 writer.write(":");
313                 writer.write(String.valueOf(simhash.no));
314                 writer.write("\r\n");
315             }
316             writer.flush();
317             writer.close();
318         } catch (Exception e) {
319             e.printStackTrace();
320         }
321     }
322 
323     public void save(String fileName, String[] newStr) {
324         BufferedWriter writer;
325         try {
326             OutputStream out = new FileOutputStream(fileName, true);
327             OutputStreamWriter outWriter = new OutputStreamWriter(out);
328             writer = new BufferedWriter(outWriter);
329             for (int i = 0; i < newStr.length; i++) {
330                 if (newStr[i] != null) {
331                     writer.append(newStr[i]);
332                     writer.append("\r\n");
333                     if (i % 10000 == 0)
334                         System.out.println(i + ":" + newStr[i]);
335                 } else {
336                     break;
337                 }
338             }
339             writer.flush();
340             writer.close();
341             System.out.println("结束!");
342         } catch (Exception e) {
343             e.printStackTrace();
344         }
345     }
346 
347     // 将数据读成SimHashData对象型集合
348     public ArrayList<SimHashData> readData(String path) {
349         ArrayList<SimHashData> list = new ArrayList<SimHashData>();
350 
351         try {
352             InputStream input = new FileInputStream(new File(path));
353             BufferedReader br = new BufferedReader(new InputStreamReader(input));
354             String line = "";
355             while ((line = br.readLine()) != null) {
356                 StringTokenizer st = new StringTokenizer(line, ":");
357                 long key = BitUtil.decodeLong(st.nextToken());
358                 long no = Long.parseLong(st.nextToken());
359                 list.add(new SimHashData(key, no));
360             }
361             br.close();
362         } catch (FileNotFoundException e) {
363             e.printStackTrace();
364         } catch (IOException e) {
365             e.printStackTrace();
366         }
367         return list;
368     }
369 
370 
371 
372 }

介绍一篇论文：Google 的《Detecting Near-Duplicates for Web Crawling》，论文介绍了把SimHash用于爬虫抓取过程中的网页去重。

最后，说一下分布式文档排重：利用分布式系统框架如hadoop等，使用MapReduce进行文档排重，提高了效率和节省了时间，这已经成为了常用的大数据量的排重方式

以上，是我对SimHash的一些总结，请大家指教！大家共勉

posted on 2012-11-15 18:45 woyuchengxian 阅读(1449) 评论(1) 编辑收藏举报

刷新页面返回顶部

woyuchengxian

SimHash去重

导航

公告