# In [3]:  (notebook cell prompt, kept as a comment so the file parses)


def jaccard_similarity(first_words, second_words):
    """Return the Jaccard similarity of two collections of words.

    The similarity is the number of distinct items shared by both
    collections divided by the number of distinct items found in either.

    Args:
        first_words: Iterable of hashable items (e.g. the words of a sentence).
        second_words: Iterable of hashable items to compare against.

    Returns:
        A float between 0.0 and 1.0. Two empty collections are treated as
        identical and yield 1.0, which avoids a division by zero.
    """
    first_set = set(first_words)
    second_set = set(second_words)
    union_size = len(first_set | second_set)
    if union_size == 0:
        return 1.0
    # float() keeps the division true division on Python 2 as well.
    return len(first_set & second_set) / float(union_size)


def _show(label, value):
    """Print *label* and *value* separated by one space.

    A single pre-formatted argument is passed to ``print`` so the line is
    valid, and renders the same text, under both the Python 2 print
    statement and the Python 3 print function.
    """
    print('%s %s' % (label, value))


# Initialise the two sentences to compare.
st_1 = "you are beautiful"
st_2 = "you are a beauty"
# Build the set of distinct whitespace-separated words of each sentence.
st_1_words = set(st_1.split())
st_2_words = set(st_2.split())
# Size of each word set.
c_st_1_words = len(st_1_words)
c_st_2_words = len(st_2_words)
# Words that appear in both sets.
com_words = st_1_words.intersection(st_2_words)
c_com_words = len(com_words)
# Distinct words across both sets (their union).
uniq_words = st_1_words.union(st_2_words)
c_uniq_words = len(uniq_words)
# Jaccard similarity: |intersection| / |union|.
similarity = jaccard_similarity(st_1_words, st_2_words)
# Print the result.
_show('set1 is ', st_1_words)
_show('set2 is ', st_2_words)
_show('commen words count is ', c_com_words)
_show('unique words count is ', c_uniq_words)
_show('commen words is ', com_words)
_show('uniq words is ', uniq_words)
_show('Similarity is :', similarity)