用simhash比较几句话的相似性

现在能做到的是把每个字的hash值求出并存放在string[]中,接下来就不太会了,求大神指导

//words [0]为属性 [1]为权重

//hashbits 生成的SimHash位数(即返回串的长度,示例中为64)

//return SimHash串

getSimHash(String[][] words,int hashbits)

------------------------------------------------------------------

//计算汉明距离

//str1 simHash生成的code

//str2

//return 整型距离,越小越相似(两个串长度不同时返回-1)

getDistance(str1,str2)

----------------------------------------------------------------------

下面是代码


package com.yeahmobi.ymconv.util;

/**
 * SimHash fingerprinting for weighted feature lists, plus a Hamming-distance
 * comparison of the resulting fingerprints.
 *
 * <p>A fingerprint is a string of {@code '0'}/{@code '1'} characters, most
 * significant bit first.
 */
public class MySimHash {

    /**
     * Builds the SimHash fingerprint of a list of weighted features.
     *
     * <p>Each feature is hashed to a {@code long}. For every output position the
     * feature's weight is added when the corresponding hash bit is 1 and
     * subtracted when it is 0; the output bit is {@code '1'} when the resulting
     * sum is strictly positive and {@code '0'} otherwise.
     *
     * <p>Output position {@code j} (counted from the left) corresponds to hash bit
     * {@code hashbits - 1 - j}. When {@code hashbits} exceeds 64 the extra leading
     * positions are treated as 0 bits; when it is below 64 only the low
     * {@code hashbits} bits of each hash are used.
     *
     * @param words    features; {@code words[i][0]} is the feature text and
     *                 {@code words[i][1]} its integer weight as a decimal string.
     *                 Weights that are zero or negative are counted as 1.
     * @param hashbits length of the fingerprint to produce; must not be negative
     * @return the fingerprint, exactly {@code hashbits} characters long
     * @throws IllegalArgumentException if {@code hashbits} is negative
     * @throws NumberFormatException    if a weight is not a valid integer
     */
    public static String getSimHash(String[][] words, int hashbits) {
        if (hashbits < 0) {
            throw new IllegalArgumentException("hashbits must not be negative: " + hashbits);
        }

        // Signed vote per output bit; weights are ints, so long sums stay exact.
        long[] votes = new long[hashbits];

        for (String[] word : words) {
            long featureHash = MurmurHash.hash64(word[0]);

            // Parsed once per feature rather than once per bit.
            int parsed = Integer.parseInt(word[1]);
            int weight = parsed <= 0 ? 1 : parsed;

            for (int j = 0; j < hashbits; j++) {
                int bitIndex = hashbits - 1 - j;
                // The explicit range check matters: Java masks a long shift
                // distance to its low 6 bits, so shifting by >= 64 would wrap.
                boolean bitSet = bitIndex < Long.SIZE
                        && ((featureHash >>> bitIndex) & 1L) == 1L;
                votes[j] += bitSet ? weight : -weight;
            }
        }

        StringBuilder fingerprint = new StringBuilder(hashbits);
        for (long vote : votes) {
            fingerprint.append(vote > 0 ? '1' : '0');
        }
        return fingerprint.toString();
    }

    /**
     * Left-pads {@code str} with {@code '0'} characters up to {@code hashbits}
     * characters.
     *
     * <p>The input is never truncated: if it is already {@code hashbits}
     * characters or longer (including when {@code hashbits} is zero or negative)
     * it is returned unchanged. Characters inside {@code str} are left as they are.
     *
     * @param str      the text to pad; must not be {@code null}
     * @param hashbits the minimum length of the result
     * @return {@code str} preceded by as many zeros as needed to reach
     *         {@code hashbits} characters
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String getZero(String str, int hashbits) {
        int length = str.length();
        // Compare instead of subtracting so an extreme hashbits cannot overflow.
        if (length >= hashbits) {
            return str;
        }
        StringBuilder padded = new StringBuilder(hashbits);
        for (int i = length; i < hashbits; i++) {
            padded.append('0');
        }
        return padded.append(str).toString();
    }

    /**
     * Computes the Hamming distance between two fingerprints.
     *
     * @param str1 first fingerprint
     * @param str2 second fingerprint
     * @return the number of positions at which the two strings differ (smaller
     *         means more similar), or {@code -1} if their lengths differ
     */
    public static int getDistance(String str1, String str2) {
        int length = str1.length();
        if (length != str2.length()) {
            return -1;
        }

        int distance = 0;
        for (int i = 0; i < length; i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Entry point. Currently does nothing; the commented-out lines below are a
     * sample comparison of several feature lists that can be re-enabled for
     * manual experiments.
     *
     * @param args unused
     */
    public static void main(String[] args) {
//      String s1 = MySimHash.getSimHash(new String[][] { { "187.237.239.16", "3" }, { "mx", "3" }, { "775", "3" }, { "60541", "3" }, { "2342256", "3" }, { "alcatel", "3" }, { "onetouch5020", "3" }, { "android", "3" }, { "4.1.1", "3" }, { "hh", "3" } }, 64);
//      String s2 = MySimHash.getSimHash(new String[][] { { "177.224.174.214", "1" }, { "mx", "1" }, { "775", "1" }, { "6177", "1" }, { "2478822", "1" }, { "generic", "1" }, { "storm", "1" }, { "android", "1" }, { "4.2.2", "1" } }, 64);
//      String s3 = MySimHash.getSimHash(new String[][] { { "5.246.82.36", "1" }, { "sdf", "1" }, { "663", "1" }, { "333", "1" }, { "55", "0" }, { "sd", "1" }, { "er", "1" }, { "34", "1" }, { "sdfasdf", "1" }, { "hh", "1" } }, 64);
//      String s4 = MySimHash.getSimHash(new String[][] { { "189.132.168.157", "1" }, { "mx", "1" }, { "390", "1" }, { "3203", "1" }, { "2342277", "1" }, { "samsung", "1" }, { "gt-i8190l", "1" }, { "android", "1" }, { "4.1.2", "1" } }, 64);
//      String s5 = MySimHash.getSimHash(new String[][] { { "187.237.239.16", "1" }, { "mx", "1" }, { "775", "3" }, { "60541", "1" }, { "2342256", "1" }, { "alcatel", "1" }, { "onetouch5020", "1" }, { "android", "1" }, { "4.1.1", "1" }, { "hh", "1" } }, 64);
//      String s6 = MySimHash.getSimHash(new String[][] { { "187.237.239.25", "3" }, { "mx", "3" }, { "775", "3" }, { "60541", "3" }, { "2342256", "3" }, { "alcatel", "3" }, { "onetouch5020", "3" }, { "android", "3" }, { "4.1.1", "3" }, { "hh", "3" } }, 64);
//      String s7 = MySimHash.getSimHash(new String[][] { { "187.237.239.16", "1" }, { "mx", "3" }, { "775", "3" }, { "60541", "3" }, { "2342256", "3" }, { "alcatel", "3" }, { "onetouch5020", "3" }, { "android", "3" }, { "4.1.1", "3" }, { "hh", "3" } }, 64);
//      System.out.println("----------");
//      System.out.println(MySimHash.getDistance(s1, s2));
//      System.out.println(MySimHash.getDistance(s1, s3));
//      System.out.println(MySimHash.getDistance(s1, s4));
//      System.out.println(MySimHash.getDistance(s1, s5));
//      System.out.println(MySimHash.getDistance(s1, s6));
//      System.out.println(MySimHash.getDistance(s1, s7));
//
//      System.out.println(s1);
//      System.out.println(s2);
//      System.out.println(s3);
//      System.out.println(s4);
//      System.out.println(s5);
//      System.out.println(s6);
    }
}