SimHash去重
今天学习了网页去重的一些方法,其中我尤为关注SimHash算法。下面详细介绍一下这个算法。
首先,介绍一下SimHash:如果两个相似文档的语义指纹只相差几个位或更少,这样的语义指纹叫做SimHash
计算海明距离的两种方法:
1,逐位比较:依次取出两个数的每一位(移位后与1做按位与),统计对应位不相同的个数
2,两个长整型异或后,计算结果中1的个数
取得每个特征的64位hash值
public static int hamming(long l1, long l2) {
long lxor = l1 ^ l2;
return BitUtil.pop(lxor);
}
long lxor = l1 ^ l2;
return BitUtil.pop(lxor);
}
SimHash计算过程:
初始化长度为64位的向量,该向量的每个维度都是0
循环处理:取每个特征的64位hash值,如果这个hash值的第i位是1,则将向量的第i个数加上该特征的权重;反之,如果为0,则减去相应的权重
完成所有特征的处理,向量中某些数为正,某些数为负,正数对应的位为1,负数为0,得到最终64位的SimHash
在写入文件过程中,可以把SimHash值使用差分编码进行压缩后保存,下面是一个简单实现代码
View Code
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Stores an array of 64-bit SimHash values in a delta-encoded file.
 *
 * <p>File layout: the first value is written as 8 big-endian bytes; every
 * following value is written as the difference to its predecessor, encoded as
 * a variable-length integer (7 payload bits per byte, high bit set on every
 * byte except the last).
 */
public class DetaCompress {

    /**
     * Converts a long into its 8-byte big-endian representation.
     *
     * @param n the value to convert
     * @return a new 8-byte array, most significant byte first
     */
    public static byte[] longToBytes(long n) {
        byte[] buf = new byte[8];
        for (int i = buf.length - 1; i >= 0; i--) {
            buf[i] = (byte) (n & 0xFF); // take the lowest 8 bits
            n >>>= 8;
        }
        return buf;
    }

    /**
     * Writes a long in variable-length form, lowest 7 bits first.
     *
     * <p>The value is treated as unsigned (shifted with {@code >>>}), so a
     * negative value always takes 10 bytes.
     *
     * @param i   the value to write
     * @param dos the destination stream
     * @throws IOException if the underlying stream fails
     */
    public static void writeVLong(long i, BufferedOutputStream dos)
            throws IOException {
        while ((i & ~0x7FL) != 0) {
            dos.write((byte) ((i & 0x7F) | 0x80)); // low 7 bits plus continuation flag
            i >>>= 7;
        }
        dos.write((byte) i);
    }

    /**
     * Reads a value previously written by {@link #writeVLong}.
     *
     * <p>The result is accumulated in a {@code long}; accumulating in an
     * {@code int} would silently drop every bit above bit 31.
     *
     * @param dis the source stream
     * @return the decoded value
     * @throws IOException if the stream ends in the middle of a value
     *                     ({@link java.io.EOFException}) or the encoding is
     *                     longer than 10 bytes
     */
    static long readVLong(DataInputStream dis) throws IOException {
        byte b = dis.readByte();
        long value = b & 0x7FL;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            if (shift > 63) {
                throw new IOException("Malformed variable-length long: more than 10 bytes");
            }
            // readByte() throws EOFException on truncated input instead of looping forever.
            b = dis.readByte();
            value |= (b & 0x7FL) << shift;
        }
        return value;
    }

    /**
     * Writes the array to the given file using delta encoding.
     *
     * @param simHashSet the values to store; may be null or empty
     * @param fileName   path of the file to create or overwrite
     * @return the number of values written, or 0 if the input is null/empty
     *         (no file is touched in that case) or an I/O error occurred
     */
    static int write(long[] simHashSet, String fileName) {
        if (simHashSet == null || simHashSet.length == 0) {
            return 0;
        }
        BufferedOutputStream dos = null;
        try {
            dos = new BufferedOutputStream(new FileOutputStream(fileName));
            dos.write(longToBytes(simHashSet[0])); // first value is stored verbatim
            for (int i = 1; i < simHashSet.length; i++) {
                writeVLong(simHashSet[i] - simHashSet[i - 1], dos);
            }
            dos.close(); // flushes; a failure here must be reported, not swallowed
            return simHashSet.length;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dos);
        }
        return 0;
    }

    /**
     * Reads back an array written by {@link #write}.
     *
     * @param len      the number of values stored in the file
     * @param fileName path of the file to read
     * @return the decoded values; an empty array if {@code len <= 0}; or
     *         {@code null} if the file is missing, truncated or unreadable
     */
    static long[] read(int len, String fileName) {
        if (len <= 0) {
            return new long[0];
        }
        DataInputStream dis = null;
        try {
            dis = new DataInputStream(new BufferedInputStream(
                    new FileInputStream(fileName)));
            long[] simHashSet = new long[len];
            simHashSet[0] = dis.readLong();
            for (int i = 1; i < len; i++) {
                // Each stored delta is added to the previous value to rebuild the original.
                simHashSet[i] = simHashSet[i - 1] + readVLong(dis);
            }
            return simHashSet;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dis);
        }
        return null;
    }

    /** Closes the resource if it is non-null, ignoring any failure. */
    private static void closeQuietly(Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
}
排重的总体思想是:
先把要检索的f 位指纹集合缩小,将集合f位划分几块,
精确匹配高 d' 位,候选集合的容量缩小为 $|S'| = |S| / 2^{d'}$
然后在小集合中检索f-d'位的海明距离
下面是实现的例子:
View Code
1 package com.lietu.simhash;
2
3 import java.io.BufferedReader;
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.FileNotFoundException;
8 import java.io.FileOutputStream;
9 import java.io.FileWriter;
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.io.InputStreamReader;
13 import java.io.OutputStream;
14 import java.io.OutputStreamWriter;
15 import java.io.UnsupportedEncodingException;
16 import java.util.ArrayList;
17 import java.util.Collections;
18 import java.util.Comparator;
19 import java.util.HashMap;
20 import java.util.HashSet;
21 import java.util.Iterator;
22 import java.util.StringTokenizer;
23 import java.util.Map.Entry;
24
25 /**
26 * 64位分四块,最多找出有3位差别的simhash
27 *
28 * @author lg
29 *
30 */
31 // TODO: 保存排序后的中间状态
32 public class SimHashSet4 implements Iterable<SimHashData> {
33 ArrayList<SimHashData> t1 = new ArrayList<SimHashData>();
34 ArrayList<SimHashData> t2 = new ArrayList<SimHashData>();
35 ArrayList<SimHashData> t3 = new ArrayList<SimHashData>();
36 ArrayList<SimHashData> t4 = new ArrayList<SimHashData>();
37
38 public ArrayList<SimHashData> getT1(){
39 return t1;
40 }
41 static Comparator<SimHashData> comp = new Comparator<SimHashData>() {
42 public int compare(SimHashData o1, SimHashData o2) {
43 if (o1.q == o2.q)
44 return 0;
45 return (isLessThanUnsigned(o1.q, o2.q)) ? 1 : -1;
46 }
47 }; // 比较无符号64位
48 static Comparator<Long> compHigh = new Comparator<Long>() {
49 public int compare(Long o1, Long o2) {
50 o1 |= 0xFFFFFFFFFFFFL;
51 o2 |= 0xFFFFFFFFFFFFL;
52 // System.out.println(Long.toBinaryString(o1));
53 // System.out.println(Long.toBinaryString(o2));
54 // System.out.println((o1 == o2));
55 if (o1.equals(o2))
56 return 0;
57 return (isLessThanUnsigned(o1, o2)) ? 1 : -1;
58 }
59 }; // 比较无符号64位中的高16位
60
61 public void load(String fileName) {
62 String line = null;
63
64 try {
65 InputStream is = new FileInputStream(new File(fileName));
66
67 BufferedReader br = new BufferedReader(new InputStreamReader(is));
68
69 while ((line = br.readLine()) != null) {
70 addSimHash(line.trim());
71 }
72 br.close();
73
74 } catch (FileNotFoundException e) {
75 e.printStackTrace();
76 } catch (UnsupportedEncodingException e) {
77 e.printStackTrace();
78 } catch (IOException e) {
79 e.printStackTrace();
80 }
81 }
82
83 public static boolean isLessThanUnsigned(long n1, long n2) {
84 return (n1 < n2) ^ ((n1 < 0) != (n2 < 0));
85 }
86
87 public void sort() {
88 t2.clear();
89 t3.clear();
90 t4.clear();
91 for (SimHashData simHash : t1)
92 {
93 long t = Long.rotateLeft(simHash.q, 16);
94 t2.add(new SimHashData(t, simHash.no));
95
96 t = Long.rotateLeft(t, 16);
97 t3.add(new SimHashData(t, simHash.no));
98
99 t = Long.rotateLeft(t, 16);
100 t4.add(new SimHashData(t, simHash.no));
101 }
102
103 Collections.sort(t1, comp);
104 Collections.sort(t2, comp);
105 Collections.sort(t3, comp);
106 Collections.sort(t4, comp);
107 }
108
109 public boolean contains(SimHashData key) {
110 int low = 0;
111 int high = t1.size() - 1;
112
113 while (low <= high) {
114 int mid = (low + high) >>> 1;
115 SimHashData midVal = t1.get(mid);
116 int cmp = comp.compare(midVal, key);
117
118 if (cmp < 0)
119 low = mid + 1;
120 else if (cmp > 0)
121 high = mid - 1;
122 else
123 return true; // key found
124 }
125 return false; // key not found
126 }
127
128 /**
129 * probe exact match
130 *
131 * @param t
132 * @return
133 */
134 public Span probe(ArrayList<SimHashData> t, long key) {
135 // System.out.println("t:"+t.size());
136 int low = 0;
137 int high = t.size() - 1;
138
139 while (low <= high) {
140 int mid = (low + high) >>> 1;
141 Long midVal = t.get(mid).q;
142 int cmp = compHigh.compare(midVal, key);
143
144 if (cmp < 0)
145 low = mid + 1;
146 else if (cmp > 0)
147 high = mid - 1;
148 else {
149 // key found
150 int matchStart = mid;
151 int matchEnd = mid;
152 while (matchStart > 0) {
153 midVal = t.get(matchStart - 1).q;
154 if (compHigh.compare(midVal, key) == 0) {
155 --matchStart;
156 } else {
157 break;
158 }
159 }
160
161 while (matchEnd < (t.size() - 1)) {
162 midVal = t.get(matchEnd + 1).q;
163 if (compHigh.compare(midVal, key) == 0) {
164 ++matchEnd;
165 } else {
166 break;
167 }
168 }
169 return new Span(matchStart, matchEnd);
170 }
171 }
172 return null; // key not found
173 }
174
175 /**
176 * get most 3 bit difference.
177 *
178 * @param fingerPrint
179 * @param k
180 * @return
181 */
182 public HashSet<SimHashData> getSimSet(long fingerPrint, int k) {
183
184 HashSet<SimHashData> retAll = new HashSet<SimHashData>();
185 Span s1 = probe(t1, fingerPrint);
186 if (s1 != null) {
187 // System.out.println("s1:"+s1);
188 ArrayList<SimHashData> ret1 = getSim(t1, s1, fingerPrint, k);
189 retAll.addAll(ret1);
190 }
191 long q2 = Long.rotateLeft(fingerPrint, 16);
192 Span s2 = probe(t2, q2);
193 if (s2 != null) {
194 // System.out.println("s2:"+s2);
195 ArrayList<SimHashData> ret2 = getSim(t2, s2, q2, k);
196 // rotateRight(ret2, 16);
197 retAll.addAll(ret2);
198 }
199
200 long q3 = Long.rotateLeft(q2, 16);
201 Span s3 = probe(t3, q3);
202 if (s3 != null) {
203 // System.out.println("s3:"+s3);
204 ArrayList<SimHashData> ret3 = getSim(t3, s3, q3, k);
205 // rotateRight(ret3, 32);
206 retAll.addAll(ret3);
207 }
208
209 long q4 = Long.rotateLeft(q3, 16);
210 Span s4 = probe(t4, q4);
211 if (s4 != null) {
212 // System.out.println("s4:" + s4);
213 ArrayList<SimHashData> ret4 = getSim(t4, s4, q4, k);
214 // rotateRight(ret4, 48);
215 retAll.addAll(ret4);
216 }
217 // System.out.println("o:"+Long.toBinaryString(fingerPrint));
218 return retAll;
219 }
220
221 /**
222 * 从Span找出部分相等的,取出最多差k位的
223 *
224 * @param t
225 * @param s
226 * @param fingerPrint
227 * @param k
228 * @return
229 */
230 public ArrayList<SimHashData> getSim(ArrayList<SimHashData> t, Span s,
231 long fingerPrint, int k) {
232 ArrayList<SimHashData> result = new ArrayList<SimHashData>();
233
234 for (int i = s.getStart(); i <= s.getEnd(); ++i) {
235 SimHashData data = t.get(i);
236 long q = data.q;
237 if (BitUtil.diffIn(fingerPrint, q, k)) {
238 result.add(data);
239 }
240 }
241
242 return result;
243 }
244
245 public void addSimHash(String line) {
246 StringTokenizer st = new StringTokenizer(line, ":");
247 String key = st.nextToken();
248 long t = BitUtil.decodeLong(key);
249 long no = Long.parseLong(st.nextToken());
250 // Long.parseLong(key,2);
251 // System.out.println(t);
252 t1.add(new SimHashData(t, no));
253 }
254
255 public void addSimHash(SimHashData key) {
256 t1.add(key);
257 }
258
259 public void addInc(String key) {
260 long t = BitUtil.decodeLong(key);
261 // Long.parseLong(key,2);
262 // System.out.println(t);
263 SimHashData element = new SimHashData(t);
264 int insertionPoint = findInsertionPoint(t1, element);
265 t1.add(insertionPoint, element);
266
267 long q2 = Long.rotateLeft(t, 16);
268 element = new SimHashData(q2);
269 insertionPoint = findInsertionPoint(t2, element);
270 t2.add(insertionPoint, element);
271
272 long q3 = Long.rotateLeft(q2, 16);
273 element = new SimHashData(q3);
274 insertionPoint = findInsertionPoint(t3, element);
275 t3.add(insertionPoint, element);
276
277 long q4 = Long.rotateLeft(q3, 16);
278 element = new SimHashData(q4);
279 insertionPoint = findInsertionPoint(t4, element);
280 t4.add(insertionPoint, element);
281 }
282
283 /**
284 * Find the insertion point for the argument in a sorted list.
285 *
286 * @param element
287 * find this object's insertion point in the sorted list
288 * @return the index of the insertion point
289 */
290 int findInsertionPoint(ArrayList<SimHashData> list, SimHashData element) {
291 // Find the new element's insertion point.
292 int insertionPoint = Collections.binarySearch(list, element, comp);
293 if (insertionPoint < 0) {
294 insertionPoint = -(insertionPoint + 1);
295 }
296 return insertionPoint;
297 }
298
299 public Iterator<SimHashData> iterator() {
300 return t1.iterator();
301 }
302
303 public void save(String fileName) {
304 BufferedWriter writer;
305 try {
306 writer = new BufferedWriter(new FileWriter(fileName));
307 for (SimHashData simhash : t1) {
308 //String str=BitUtil.encodeLong(simhash.q).substring(8);
309 String str=BitUtil.encodeLong(simhash.q);
310 writer.write(str);
311 // writer.write(simhash.q+"");
312 writer.write(":");
313 writer.write(String.valueOf(simhash.no));
314 writer.write("\r\n");
315 }
316 writer.flush();
317 writer.close();
318 } catch (Exception e) {
319 e.printStackTrace();
320 }
321 }
322
323 public void save(String fileName, String[] newStr) {
324 BufferedWriter writer;
325 try {
326 OutputStream out = new FileOutputStream(fileName, true);
327 OutputStreamWriter outWriter = new OutputStreamWriter(out);
328 writer = new BufferedWriter(outWriter);
329 for (int i = 0; i < newStr.length; i++) {
330 if (newStr[i] != null) {
331 writer.append(newStr[i]);
332 writer.append("\r\n");
333 if (i % 10000 == 0)
334 System.out.println(i + ":" + newStr[i]);
335 } else {
336 break;
337 }
338 }
339 writer.flush();
340 writer.close();
341 System.out.println("结束!");
342 } catch (Exception e) {
343 e.printStackTrace();
344 }
345 }
346
347 // 将数据读成SimHashData对象型集合
348 public ArrayList<SimHashData> readData(String path) {
349 ArrayList<SimHashData> list = new ArrayList<SimHashData>();
350
351 try {
352 InputStream input = new FileInputStream(new File(path));
353 BufferedReader br = new BufferedReader(new InputStreamReader(input));
354 String line = "";
355 while ((line = br.readLine()) != null) {
356 StringTokenizer st = new StringTokenizer(line, ":");
357 long key = BitUtil.decodeLong(st.nextToken());
358 long no = Long.parseLong(st.nextToken());
359 list.add(new SimHashData(key, no));
360 }
361 br.close();
362 } catch (FileNotFoundException e) {
363 e.printStackTrace();
364 } catch (IOException e) {
365 e.printStackTrace();
366 }
367 return list;
368 }
369
370
371
372 }
介绍一篇论文:Google 的《Detecting Near-Duplicates for Web Crawling》,论文介绍了如何把SimHash用于爬虫抓取过程中的网页去重。
最后,说一下分布式文档排重:利用分布式系统框架如hadoop等,使用MapReduce进行文档排重,提高了效率和节省了时间,这已经成为了常用的大数据量的排重方式
以上,是我对SimHash的一些总结,请大家指教!大家共勉
posted on 2012-11-15 18:45 woyuchengxian 阅读(1448) 评论(1) 编辑 收藏 举报