场景:
1. 计算文本的 SimHash 值,并用 Hamming(汉明)距离衡量两段文本的相似度。
2. SimHash 适用于较长文本(三五百字以上)的相似性比较;文本越短,误判率越高。
Python实现:
代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# -*- encoding:utf-8 -*-
"""SimHash fingerprints and Hamming distance for text similarity checks."""
import math


class SimHash(object):
    """Build 64-bit SimHash fingerprints (as '0'/'1' strings) and compare them."""

    # Width, in bits, of every keyword hash and of the final fingerprint.
    HASH_BITS = 64

    def getBinStr(self, source):
        """Hash a keyword into a fixed-width binary string.

        Args:
            source: The keyword text to hash.

        Returns:
            A string of exactly ``HASH_BITS`` characters, each '0' or '1'.
            An empty keyword hashes to all zeros so that callers can always
            iterate over the result.
        """
        if not source:
            return "0" * self.HASH_BITS

        multiplier = 1000003
        # Only the low HASH_BITS bits are kept in the end, and neither the
        # multiplication nor the XORs let higher bits influence lower ones,
        # so masking to HASH_BITS on every step is sufficient.
        mask = (1 << self.HASH_BITS) - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        x ^= len(source)
        return format(x & mask, "b").zfill(self.HASH_BITS)

    def getWeight(self, source):
        """Return the code point of the single character ``source``."""
        return ord(source)

    def unwrap_weight(self, arr):
        """Collapse signed weights into a bit string.

        Args:
            arr: An iterable of values accepted by ``int()``.

        Returns:
            A string with '1' for every strictly positive item and '0' for
            every other item, in input order.
        """
        return "".join("1" if int(item) > 0 else "0" for item in arr)

    def sim_hash(self, rawstr):
        """Compute the SimHash fingerprint of a text.

        The text is segmented with jieba, up to 100 weighted keywords are
        extracted, each keyword's hash bits vote with +weight / -weight, and
        the sign of each column total decides the fingerprint bit.

        Args:
            rawstr: The text to fingerprint.

        Returns:
            A string of ``HASH_BITS`` '0'/'1' characters. When no keywords
            can be extracted the fingerprint is all zeros.
        """
        # Imported lazily so the hashing/distance helpers stay usable (and
        # importable) on systems where jieba is not installed.
        import jieba
        import jieba.analyse

        segments = jieba.cut(rawstr)
        keywords = jieba.analyse.extract_tags(
            "|".join(segments), topK=100, withWeight=True
        )

        # Dimensionality reduction: accumulate the signed votes per bit
        # position instead of storing one row per keyword.
        totals = [0] * self.HASH_BITS
        for keyword, weight in keywords:
            # Round the weight up once per keyword so every keyword
            # contributes at least a whole unit per bit.
            magnitude = int(math.ceil(weight))
            for position, bit in enumerate(self.getBinStr(keyword)):
                if bit == "1":
                    totals[position] += magnitude
                else:
                    totals[position] -= magnitude
        return self.unwrap_weight(totals)

    def distince(self, hashstr1, hashstr2):
        """Return the Hamming distance between two fingerprints.

        Args:
            hashstr1: First fingerprint string.
            hashstr2: Second fingerprint string.

        Returns:
            The number of positions at which the two strings differ.

        Raises:
            ValueError: If the two strings differ in length; the Hamming
                distance is only defined for equal-length inputs.
        """
        if len(hashstr1) != len(hashstr2):
            raise ValueError(
                "Hamming distance requires equal-length inputs: %d != %d"
                % (len(hashstr1), len(hashstr2))
            )
        return sum(1 for a, b in zip(hashstr1, hashstr2) if a != b)


if __name__ == "__main__":
    simhash = SimHash()
    str1 = '咱哥俩谁跟谁啊'
    str2 = '咱们俩谁跟谁啊'
    hash1 = simhash.sim_hash(str1)
    print(hash1)
    hash2 = simhash.sim_hash(str2)
    distance = simhash.distince(hash1, hash2)
    # Maximum Hamming distance at which two texts still count as similar.
    threshold = 5
    print("simhash", distance, "距离:", threshold, "是否相似:", distance <= threshold)
以上就是论文查重 Python 文本相似性计算 SimHash 源码的详细内容,更多关于 Python 文本相似性计算 SimHash 的资料请关注服务器之家其它相关文章!
原文链接:https://coderl.blog.csdn.net/article/details/122740744