使用set（集合）来计算两句话的相似度

集合用来删除重复的值

使用Jaccard系数来计算两句话的相似度：Jaccard系数 = 两个集合交集的大小 / 两个集合并集的大小


In [1]:
# Jaccard similarity of two sentences, treating each sentence as a set of
# whitespace-separated words: similarity = |A & B| / |A | B|.

# The two sentences to compare.
st_1 = "you are beautiful"
st_2 = "you are a beauty"

# Turn each sentence into a set of words; duplicate words collapse to one.
st_1_words = set(st_1.split())
st_2_words = set(st_2.split())

# Number of distinct words in each sentence.
c_st_1_words = len(st_1_words)
c_st_2_words = len(st_2_words)

# Words that appear in both sentences (set intersection).
com_words = st_1_words & st_2_words
c_com_words = len(com_words)

# Every distinct word across the two sentences (set union).
uniq_words = st_1_words | st_2_words
c_uniq_words = len(uniq_words)

# Jaccard similarity. float() forces true division, because `/` on two ints
# truncates under Python 2.
similarity = c_com_words / float(c_uniq_words)

# Print the result.
# A single pre-formatted argument inside parentheses prints the same text as
# the Python 2 statement form `print label, value` (label, one space, value)
# and is also valid Python 3, where the statement form is a SyntaxError.
for label, value in (
    ('set1 is ', st_1_words),
    ('set2 is ', st_2_words),
    ('commen words count is ', c_com_words),
    ('unique words count is ', c_uniq_words),
    ('commen words is ', com_words),
    ('uniq words is ', uniq_words),
    ('Similarity is :', similarity),
):
    print('%s %s' % (label, value))


set1 is  set(['beautiful', 'you', 'are'])
set2 is  set(['a', 'you', 'are', 'beauty'])
commen words count is  2
unique words count is  5
commen words is  set(['you', 'are'])
uniq words is  set(['beautiful', 'a', 'are', 'beauty', 'you'])
Similarity is : 0.4

In [ ]: