利用simhash计算文本相似度

摘自:http://www.programcreek.com/java-api-examples/index.php?source_dir=textmining-master/src/com/gta/simhash/SimHash.java

package com.gta.simhash;
 
/**
 * Minimal command-line demo: compares two sample strings with {@link SimHash}.
 */
public class Test {

    /**
     * Builds a 64-bit {@link SimHash} with a distance threshold of 8 for each sample and asks the
     * first one to report whether the second is a match (the result is printed by
     * {@code getResult}).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // NOTE(review): both literals contain U+FFFD replacement characters, so they look like
        // text that was decoded with the wrong charset at some point. They are kept verbatim
        // here; confirm against the original sample data before relying on the demo output.
        final String firstSample = "�������Ϻ���������죬���������������������Ͼ������ݣ����������ţ����ϣ��ൺ���人�����ݣ����ڣ��ɶ���������̫ԭ����ɳ�����֣�������֣�ݣ���������������³ľ�룬���ݣ��������Ϸʣ��ߺ�";
        final String secondSample = "�������Ϻ���������죬���������������������Ͼ������ݣ����������ţ����ϣ��ൺ���人�����ݣ����ڣ��ɶ���������̫ԭ����ɳ�����֣�������֣�ݣ�����";

        final SimHash reference = new SimHash(firstSample, 64, 8);
        final SimHash candidate = new SimHash(secondSample, 64, 8);

        reference.getResult(candidate);
    }
}
package com.gta.simhash;
 
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
 
/**
 * Computes a SimHash-style fingerprint for a piece of text and compares two fingerprints by
 * Hamming distance.
 *
 * <p>The text is split into terms with {@link IKAnalyzer}; every distinct term is hashed with
 * {@link #shiftHash(String)} and votes on each fingerprint bit with a weight equal to its
 * frequency. Two texts are reported as matching when their fingerprints differ in at most
 * {@code distance} bits.
 */
public class SimHash {

    /** Fingerprint width used by {@link #SimHash(String)}. */
    private static final int DEFAULT_HASH_BITS = 64;

    /** Hamming-distance threshold used by {@link #SimHash(String)}. */
    private static final int DEFAULT_DISTANCE = 5;

    /** Multiplier of the polynomial term hash in {@link #shiftHash(String)}. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(131313L);

    private final String tokens;
    private final int hashBits;
    private final int distance;

    /**
     * Creates a fingerprinter with a 64-bit fingerprint and a distance threshold of 5.
     *
     * @param tokens the text to fingerprint
     */
    public SimHash(String tokens) {
        this(tokens, DEFAULT_HASH_BITS, DEFAULT_DISTANCE);
    }

    /**
     * Creates a fingerprinter with an explicit fingerprint width and match threshold.
     *
     * @param tokens   the text to fingerprint
     * @param hashBits number of bits in the fingerprint
     * @param distance largest Hamming distance that {@link #getResult(SimHash)} still reports as
     *                 a match
     * @throws IllegalArgumentException if {@code hashBits} is negative
     */
    public SimHash(String tokens, int hashBits, int distance) {
        // Fail here rather than later with a NegativeArraySizeException inside simHash().
        if (hashBits < 0) {
            throw new IllegalArgumentException("hashBits must not be negative: " + hashBits);
        }
        this.tokens = tokens;
        this.hashBits = hashBits;
        this.distance = distance;
    }

    /**
     * Splits the text into terms and counts how often each distinct term occurs.
     *
     * <p>An {@link IOException} raised while tokenizing is printed to standard error and the
     * terms collected so far are returned (best effort).
     *
     * @return one {@code TermDict} per distinct term, in order of first occurrence; empty when
     *         the text is {@code null} or empty
     */
    public List<TermDict> tokenizer() {
        List<TermDict> terms = new ArrayList<TermDict>();
        if (tokens == null || tokens.length() == 0) {
            return terms;
        }

        // Index by term text so each token costs one lookup instead of a scan over 'terms'.
        Map<String, TermDict> byTerm = new HashMap<String, TermDict>();
        IKAnalyzer analyzer = new IKAnalyzer(true);
        try {
            TokenStream stream = analyzer.tokenStream("", tokens);
            try {
                CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    String term = termAttribute.toString();
                    TermDict entry = byTerm.get(term);
                    if (entry == null) {
                        entry = new TermDict(term, 1);
                        byTerm.put(term, entry);
                        terms.add(entry);
                    } else {
                        entry.setFreq(entry.getFreq() + 1);
                    }
                }
                // The TokenStream workflow requires end() after the last token and close()
                // afterwards, even when iteration fails.
                stream.end();
            } finally {
                stream.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            analyzer.close();
        }
        return terms;
    }

    /**
     * Finds the position of the first entry whose term equals {@code str}.
     *
     * @param str   the term to look for
     * @param terms the entries to search
     * @return the zero-based index of the first matching entry, or {@code -1} if there is none
     */
    public int isContain(String str, List<TermDict> terms) {
        // Track the position while iterating; a follow-up indexOf() would scan the list again
        // and would depend on how TermDict defines equals().
        int index = 0;
        for (TermDict td : terms) {
            if (str.equals(td.getTerm())) {
                return index;
            }
            index++;
        }
        return -1;
    }

    /**
     * Combines the hashes of all terms into a single {@code hashBits}-wide fingerprint.
     *
     * <p>For every bit position each term adds its frequency when its hash has that bit set and
     * subtracts it otherwise; the fingerprint bit is 1 when the resulting sum is not negative.
     * An empty term list therefore yields a fingerprint with all {@code hashBits} bits set.
     *
     * @param terms the weighted terms, as produced by {@link #tokenizer()}
     * @return the non-negative fingerprint
     */
    public BigInteger simHash(List<TermDict> terms) {
        int[] votes = new int[hashBits];
        for (TermDict td : terms) {
            int weight = td.getFreq();
            BigInteger termHash = shiftHash(td.getTerm());
            for (int i = 0; i < hashBits; i++) {
                votes[i] += termHash.testBit(i) ? weight : -weight;
            }
        }

        BigInteger fingerPrint = BigInteger.ZERO;
        for (int i = 0; i < hashBits; i++) {
            if (votes[i] >= 0) {
                fingerPrint = fingerPrint.setBit(i);
            }
        }
        return fingerPrint;
    }

    /**
     * Hashes a single term into a non-negative value of at most {@code hashBits} bits.
     *
     * <p>The hash starts from the first character shifted left by 7, folds every character in
     * with a multiply-and-add step, and then makes a second pass that multiplies and XORs in
     * each character shifted alternately left and right by 3.
     *
     * @param str the term to hash
     * @return the hash, or zero when {@code str} is {@code null} or empty
     */
    public BigInteger shiftHash(String str) {
        if (str == null || str.length() == 0) {
            return BigInteger.ZERO;
        }

        char[] chars = str.toCharArray();
        BigInteger mask = hashMask();

        BigInteger x = BigInteger.valueOf((long) chars[0] << 7);
        for (char item : chars) {
            // Masking each step keeps the intermediate value bounded. The low hashBits bits of
            // a product or sum depend only on the low hashBits bits of the operands, so the
            // final result is the same as masking only in the second pass.
            x = x.multiply(HASH_MULTIPLIER).add(BigInteger.valueOf((long) item)).and(mask);
        }

        boolean shiftLeft = true;
        for (char item : chars) {
            long mixed = shiftLeft ? (long) item << 3 : (long) item >> 3;
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf(mixed)).and(mask);
            shiftLeft = !shiftLeft;
        }

        // x is non-negative after masking, so no special case for -1 is needed.
        return x;
    }

    /**
     * Tokenizes the text and computes its fingerprint.
     *
     * @return the fingerprint of this instance's text
     */
    public BigInteger getSimHash() {
        return simHash(tokenizer());
    }

    /**
     * Counts the bits in which this fingerprint differs from another one, looking only at the
     * lowest {@code hashBits} bits (using this instance's width).
     *
     * <p>Side effect: prints both fingerprints as zero-padded binary strings and then the
     * distance to standard output.
     *
     * @param hashData the other text's fingerprinter
     * @return the Hamming distance between the two fingerprints
     */
    public int getHammingDistance(SimHash hashData) {
        // Compute each fingerprint once; every getSimHash() call re-tokenizes the text.
        BigInteger own = getSimHash();
        BigInteger other = hashData.getSimHash();

        System.out.println(getFingerPrint(own.toString(2)));
        System.out.println(getFingerPrint(other.toString(2)));

        // The masked XOR is non-negative, so bitCount() is exactly the number of set bits.
        int tot = own.xor(other).and(hashMask()).bitCount();
        System.out.println(tot);
        return tot;
    }

    /**
     * Left-pads a binary string with zeros up to {@code hashBits} characters.
     *
     * @param str the binary representation of a fingerprint
     * @return {@code str} itself when it already has at least {@code hashBits} characters,
     *         otherwise {@code str} prefixed with the missing zeros
     */
    public String getFingerPrint(String str) {
        int missing = hashBits - str.length();
        if (missing <= 0) {
            return str;
        }
        StringBuilder padded = new StringBuilder(hashBits);
        for (int i = 0; i < missing; i++) {
            padded.append('0');
        }
        return padded.append(str).toString();
    }

    /**
     * Prints {@code match} when the Hamming distance to {@code hashData} is at most the
     * configured threshold, and {@code false} otherwise.
     *
     * @param hashData the other text's fingerprinter
     */
    public void getResult(SimHash hashData) {
        boolean similar = getHammingDistance(hashData) <= distance;
        System.out.println(similar ? "match" : "false");
    }

    /** Returns a value with the lowest {@code hashBits} bits set. */
    private BigInteger hashMask() {
        return BigInteger.ONE.shiftLeft(hashBits).subtract(BigInteger.ONE);
    }
}
原文地址:https://www.cnblogs.com/twodog/p/12141243.html