【算法】SimHash

package com.pachira.d;

import java.math.BigInteger;
/**
 * SimHash: a locality-sensitive fingerprint used to detect near-duplicate texts.
 *
 * <h3>1. Basic idea</h3>
 * <ul>
 *   <li>LSH (locality-sensitive hashing): hash the input items so that similar items are
 *       mapped to the same buckets with high probability.</li>
 *   <li>Similarity between two fingerprints is measured with the Hamming distance.</li>
 * </ul>
 *
 * <h3>2. Steps</h3>
 * <ol type="a">
 *   <li>Tokenize the given text to obtain its features.</li>
 *   <li>Assign a weight to every feature.</li>
 *   <li>Hash every feature into an n-bit signature of 0s and 1s.</li>
 *   <li>For every feature, add its weight at each bit position holding a 1 and subtract it
 *       at each position holding a 0, accumulating over all features.</li>
 *   <li>Turn the accumulated n-dimensional vector back into bits: a positive sum becomes 1,
 *       a negative sum becomes 0 (this implementation maps a sum of exactly 0 to 1).</li>
 *   <li>The resulting bit string is the SimHash of the text.</li>
 *   <li>Compare the SimHash values of different texts by Hamming distance to judge how
 *       similar they are.</li>
 * </ol>
 *
 * <h3>3. Strengths and limits (author's notes)</h3>
 * Works well for larger texts (e.g. more than 500 characters): fingerprints whose distance
 * is below 3 are almost always similar, and the false-positive rate is low. Short sentences
 * or titles carry too little content for SimHash to be reliable; for those, edit distance,
 * Jaccard or cosine similarity may be a better choice.
 *
 * <h3>4. Worked example</h3>
 * The sentence "中国人民解放军解放中国" is segmented into five tokens; each token is hashed
 * to 5 bits and weighted, the weighted vectors are summed, and the sign of every sum gives
 * the final bit:
 * <pre>{@code
 *   token       hash          weighted vector
 *   -------     -------       ------------------
 *   中国    -->  10010   --->   3  -3  -3   3  -3
 *   人民    -->  11001   --->   2   2  -2  -2   2
 *   解放军  -->  11100   --->   5   5   5  -5  -5
 *   解放    -->  11011   --->   2   2  -2   2   2
 *   中国    -->  10001   --->   1  -1  -1  -1   1
 *   -------     -------       ------------------
 *                        sum:  13   5  -3  -3  -3
 *                       sign:   1   1   0   0   0
 * }</pre>
 *
 * <h3>5. Implementation</h3>
 * See the static methods below; {@link #main(String[])} shows them used together.
 */
public class SimHash {

    /** Multiplier applied at every step of the rolling string hash. */
    private static final BigInteger MULTIPLIER = BigInteger.valueOf(1000003L);

    /**
     * Computes the hash of a single feature string.
     *
     * <p>The hash starts from the first character shifted left by 7 bits, then for every
     * character multiplies by {@code 1000003}, XORs the character in and truncates to
     * {@code hashbits} bits. Finally the string length is XORed in and the value is
     * truncated again, so the result always fits in {@code hashbits} bits.
     *
     * @param source   the feature string to hash; may be {@code null} or empty
     * @param hashbits the number of bits of the resulting hash
     * @return a non-negative hash of at most {@code hashbits} bits, or zero when
     *         {@code source} is {@code null} or empty
     * @throws ArithmeticException if {@code hashbits} is negative and {@code source} is
     *         non-empty
     */
    public static BigInteger hash(String source, int hashbits) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        // mask = 2^hashbits - 1, i.e. the low hashbits bits all set.
        BigInteger mask = BigInteger.valueOf(2).pow(hashbits).subtract(BigInteger.ONE);
        BigInteger hashcode = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (int i = 0; i < source.length(); i++) {
            BigInteger ch = BigInteger.valueOf(source.charAt(i));
            hashcode = hashcode.multiply(MULTIPLIER).xor(ch).and(mask);
        }
        // Mask again after mixing in the length: without it, a length >= 2^hashbits
        // would set bits above the requested width.
        return hashcode.xor(BigInteger.valueOf(source.length())).and(mask);
    }

    /**
     * Folds one feature hash into the accumulated feature vector.
     *
     * <p>For every dimension {@code i} of {@code features}, {@code weight} is added when
     * bit {@code i} of {@code hash} is 1 and subtracted when it is 0. The array is
     * modified in place.
     *
     * @param hash     the hash of one feature
     * @param features the accumulated vector, one entry per bit position; updated in place
     * @param weight   the weight of the feature
     */
    public static void updatefeatures(BigInteger hash, int[] features, int weight) {
        for (int i = 0; i < features.length; i++) {
            if (hash.testBit(i)) {
                features[i] += weight;
            } else {
                features[i] -= weight;
            }
        }
    }

    /**
     * Converts an accumulated feature vector into its fingerprint string.
     *
     * <p>The result has one character per entry of {@code features}: character {@code i}
     * is {@code '1'} when {@code features[i] >= 0} and {@code '0'} otherwise. Character 0
     * therefore corresponds to bit 0 of the feature hashes.
     *
     * @param features the accumulated feature vector
     * @return a string of {@code '0'}/{@code '1'} characters of length
     *         {@code features.length}
     */
    public static String fingerprint(int[] features) {
        StringBuilder bits = new StringBuilder(features.length);
        for (int value : features) {
            bits.append(value >= 0 ? '1' : '0');
        }
        return bits.toString();
    }

    /**
     * Computes the Hamming distance between two fingerprint strings, i.e. the number of
     * positions at which their characters differ.
     *
     * @param str1 the first fingerprint
     * @param str2 the second fingerprint
     * @return the number of differing positions, or {@code -1} when the two strings have
     *         different lengths
     */
    public static int hammingDistance(String str1, String str2) {
        if (str1.length() != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < str1.length(); i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Demo: fingerprints two nearly identical sentences and prints both fingerprints and
     * their Hamming distance.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String s = "My thesis work focuses on large scale copy detection of digital objects such as textual documents";
        String t = "My thesis job focuses on large scale copy detection of digital objects such as textual documents";

        int hashbits = 64;
        // One accumulator per document, one dimension per hash bit.
        int[] features = new int[hashbits];
        int[] featuret = new int[hashbits];

        // Hash of the feature.
        BigInteger bs = hash(s, hashbits);
        BigInteger bt = hash(t, hashbits);

        // For simplicity each sentence is treated as one single feature with weight 1;
        // a real document would normally contribute many features.
        updatefeatures(bs, features, 1);
        updatefeatures(bt, featuret, 1);

        // Fingerprint of each document.
        String fingers = fingerprint(features);
        String fingert = fingerprint(featuret);

        // Hamming distance between the two fingerprints.
        int dis = hammingDistance(fingers, fingert);

        System.out.println(fingers + "\t" + s);
        System.out.println(fingert + "\t" + t);
        System.out.println(dis);
    }
}

 

posted on 2014-12-12 14:07  有个姑娘叫小芳  阅读(357)  评论(0)  编辑  收藏  举报