附注:用BigInteger类型来存储64位的hash码
http://doc.java.sun.com/DocWeb/api/all/java.math.BigInteger
很好用,xor()异或、BigInteger.bitLength()取位长、.bitCount()取位为1的个数。
simHash算法流程:
1): 计算simHash码
a). 字符串String分词得到tokens;
b). 计算每个tokens的64位Hash码;
c). 按Hash码的位进行标记,1则标记为1、否则标记为-1;
d). 把每个tokens的Hash码按位进行统计求和;
e). 进行签名,大于0则为1,否则为0,得到64位simHash指纹。
2): 把64位simHash码均分为汉明距离n+1块,方便后续查找的所有近邻simHash码;
3): 计算两个simHash码的汉明距离,
方法一:给出simHash的64位二进制码字符串:str1.charAt(i) != str2.charAt(i);
方法二:给出simHash的int值:先做异或,然后统计异或后二进制位数为1的个数
问题!?:simHash在短文本的可行性!
测试相似文本的相似度与汉明距离
测试文本:20个城市名作为词串:北京,上海,香港,深圳,广州,台北,南京,大连,苏州,青岛,无锡,佛山,重庆,宁波,杭州,成都,武汉,澳门,天津,沈阳
相似度矩阵:
simHash码:
勘误:0.667, Hm:13 是对比的msg 1与2。
可见:相似度在0.8左右的Hamming距离为7,只有相似度高到0.9412,Hamming距离才近到4,此时,反观Google对此算法的应用场景:网页近重复。
镜像网站、内容复制、嵌入广告、计数改变、少量修改。
以上原因对于长文本来说造成的相似度都会比较高,而对于短文本来说,如何处理海量数据的相似度文本更为合适的??
附:simHash生成代码 参考:Simhash java实现
- import java.math.BigInteger;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.StringTokenizer;
- public class SimHash {
- private String tokens;
- private BigInteger intSimHash;
- private String strSimHash;
- private int hashbits = 64;
- public SimHash(String tokens) {
- this.tokens = tokens;
- this.intSimHash = this.simHash();
- }
- public SimHash(String tokens, int hashbits) {
- this.tokens = tokens;
- this.hashbits = hashbits;
- this.intSimHash = this.simHash();
- }
- public BigInteger simHash() {
- int[] v = new int[this.hashbits];
- StringTokenizer stringTokens = new StringTokenizer(this.tokens);
- while (stringTokens.hasMoreTokens()) {
- String temp = stringTokens.nextToken();
- BigInteger t = this.hash(temp);
- for (int i = 0; i < this.hashbits; i++) {
- BigInteger bitmask = new BigInteger(“1”).shiftLeft(i);
- if (t.and(bitmask).signum() != 0) {
- v[i] += 1;
- } else {
- v[i] -= 1;
- }
- }
- }
- BigInteger fingerprint = new BigInteger(“0”);
- StringBuffer simHashBuffer = new StringBuffer();
- for (int i = 0; i < this.hashbits; i++) {
- if (v[i] >= 0) {
- fingerprint = fingerprint.add(new BigInteger(“1”).shiftLeft(i));
- simHashBuffer.append(“1”);
- }else{
- simHashBuffer.append(“0”);
- }
- }
- this.strSimHash = simHashBuffer.toString();
- System.out.println(this.strSimHash + ” length “ + this.strSimHash.length());
- return fingerprint;
- }
- private BigInteger hash(String source) {
- if (source == null || source.length() == 0) {
- return new BigInteger(“0”);
- } else {
- char[] sourceArray = source.toCharArray();
- BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
- BigInteger m = new BigInteger(“1000003”);
- BigInteger mask = new BigInteger(“2”).pow(this.hashbits).subtract(
- new BigInteger(“1”));
- for (char item : sourceArray) {
- BigInteger temp = BigInteger.valueOf((long) item);
- x = x.multiply(m).xor(temp).and(mask);
- }
- x = x.xor(new BigInteger(String.valueOf(source.length())));
- if (x.equals(new BigInteger(“-1”))) {
- x = new BigInteger(“-2”);
- }
- return x;
- }
- }
- public int hammingDistance(SimHash other) {
- BigInteger x = this.intSimHash.xor(other.intSimHash);
- int tot = 0;
- //统计x中二进制位数为1的个数
- //我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0,
- //我们看n能做多少次这样的操作就OK了。
- while (x.signum() != 0) {
- tot += 1;
- x = x.and(x.subtract(new BigInteger(“1”)));
- }
- return tot;
- }
- public int getDistance(String str1, String str2) {
- int distance;
- if (str1.length() != str2.length()) {
- distance = –1;
- } else {
- distance = 0;
- for (int i = 0; i < str1.length(); i++) {
- if (str1.charAt(i) != str2.charAt(i)) {
- distance++;
- }
- }
- }
- return distance;
- }
- public List subByDistance(SimHash simHash, int distance){
- int numEach = this.hashbits/(distance+1);
- List characters = new ArrayList();
- StringBuffer buffer = new StringBuffer();
- int k = 0;
- for( int i = 0; i < this.intSimHash.bitLength(); i++){
- boolean sr = simHash.intSimHash.testBit(i);
- if(sr){
- buffer.append(“1”);
- }
- else{
- buffer.append(“0”);
- }
- if( (i+1)%numEach == 0 ){
- BigInteger eachValue = new BigInteger(buffer.toString(),2);
- System.out.println(“—-“ +eachValue );
- buffer.delete(0, buffer.length());
- characters.add(eachValue);
- }
- }
- return characters;
- }
- public static void main(String[] args) {
- String s = “This is a test string for testing”;
- SimHash hash1 = new SimHash(s, 64);
- System.out.println(hash1.intSimHash + ” “ + hash1.intSimHash.bitLength());
- hash1.subByDistance(hash1, 3);
- System.out.println(“\n”);
- s = “This is a test string for testing, This is a test string for testing abcdef”;
- SimHash hash2 = new SimHash(s, 64);
- System.out.println(hash2.intSimHash+ ” “ + hash2.intSimHash.bitCount());
- hash1.subByDistance(hash2, 3);
- s = “This is a test string for testing als”;
- SimHash hash3 = new SimHash(s, 64);
- System.out.println(hash3.intSimHash+ ” “ + hash3.intSimHash.bitCount());
- hash1.subByDistance(hash3, 3);
- System.out.println(“============================”);
- int dis = hash1.getDistance(hash1.strSimHash,hash2.strSimHash);
- System.out.println(hash1.hammingDistance(hash2) + ” “+ dis);
- int dis2 = hash1.getDistance(hash1.strSimHash,hash3.strSimHash);
- System.out.println(hash1.hammingDistance(hash3) + ” “ + dis2);
- }
- }