pyswt 2019-06-29
**由于最近需要做大规模的文本相似度的计算,所以用到了simhash+汉明距离来快速计算文本的相似度。** **simhash的原理如下图:其中的weight采用的是jieba的tf-idf的结果。**
**附上python3的源代码:**
import math
import jieba
import jieba.analyse
class SimHash(object):
    """Compute SimHash fingerprints of text and compare them bit by bit.

    Keywords and their weights come from ``jieba.analyse.extract_tags``
    (TF-IDF based keyword extraction).  Every keyword is hashed to a
    fixed-width bit string, each bit casts a weighted +/- vote for its
    column, and the sign of every column total becomes one bit of the
    fingerprint.
    """

    # Width, in bits, of keyword hashes and of the final fingerprint.
    HASH_BITS = 64

    def getBinStr(self, source):
        """Return the hash of ``source`` as a ``HASH_BITS``-character bit string.

        The hash is a multiplicative string hash (it resembles the classic
        CPython 2 ``str`` hash -- presumably ported from it) computed with a
        128-bit mask, of which the low ``HASH_BITS`` bits are kept.

        Args:
            source: The keyword to hash.

        Returns:
            A string of ``'0'``/``'1'`` characters.  An empty ``source``
            yields all zeros, so callers can always iterate the result
            (the previous implementation returned the integer ``0`` here).
        """
        width = self.HASH_BITS
        if source == "":
            return "0" * width

        multiplier = 1000003
        mask = 2 ** 128 - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        x ^= len(source)
        # x is never negative after masking, so no sign handling is needed.
        return format(x, "b").zfill(width)[-width:]

    def getWeight(self, source):
        """Return a placeholder weight: the code point of ``source``.

        ``ord`` only accepts a single character, so any other length raises
        ``TypeError``.  No other method of this class calls it.
        """
        # fake weight with keyword
        return ord(source)

    def unwrap_weight(self, arr):
        """Collapse signed values into a bit string.

        Args:
            arr: Iterable of values accepted by ``int()``.

        Returns:
            A string holding ``'1'`` for every strictly positive item and
            ``'0'`` for every other item, in order.
        """
        return "".join("1" if int(item) > 0 else "0" for item in arr)

    def simHash(self, rawstr):
        """Return the SimHash fingerprint of ``rawstr``.

        Args:
            rawstr: The text to fingerprint.

        Returns:
            A ``HASH_BITS``-character string of ``'0'``/``'1'``.  Text that
            yields no keywords produces all zeros instead of raising
            ``IndexError``.
        """
        seg = jieba.cut(rawstr)
        keywords = jieba.analyse.extract_tags(
            "|".join(seg), topK=100, withWeight=True
        )
        return self._fingerprint(keywords)

    def _fingerprint(self, keywords):
        """Fold ``(keyword, weight)`` pairs into a fingerprint bit string."""
        totals = [0] * self.HASH_BITS
        for keyword, weight in keywords:
            # The weight is rounded up to a whole vote once per keyword.
            vote = int(math.ceil(weight))
            for column, bit in enumerate(self.getBinStr(keyword)):
                totals[column] += vote if bit == "1" else -vote
        # A column contributes a 1 bit only when its total is strictly positive.
        return "".join("1" if total > 0 else "0" for total in totals)

    def getDistince(self, hashstr1, hashstr2):
        """Return the Hamming distance between two bit strings.

        Args:
            hashstr1: First fingerprint.
            hashstr2: Second fingerprint.

        Returns:
            The number of positions at which the strings differ.  When the
            lengths differ, every position present in only one string counts
            as a difference, which keeps the result symmetric and avoids an
            ``IndexError`` on a shorter ``hashstr2``.
        """
        mismatches = sum(a != b for a, b in zip(hashstr1, hashstr2))
        return mismatches + abs(len(hashstr1) - len(hashstr2))
# Demo: fingerprint two nearly identical sentences and compare them.
# The guard must test the module attribute ``__name__``; a bare ``name``
# is undefined and raises NameError as soon as the file is run.
if __name__ == "__main__":
    simhash = SimHash()
    s1 = 'I am very happy'
    s2 = 'I am very happu'
    hash1 = simhash.simHash(s1)
    hash2 = simhash.simHash(s2)
    distince = simhash.getDistince(hash1, hash2)
    # Texts whose fingerprints differ in at most this many bits count as similar.
    value = 5
    print("海明距离:", distince, "判定距离:", value, "是否相似:", distince <= value)