In [10]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_train = [
    "He has written two novels so far",
    "Two novels have been written by him so far",
    "They will plant ten trees tomorrow",
    "Ten trees will be planted by them tomorrow",
    "Bruce writes a letter every week",
]
#todo: read ./train.db 

data_refine = []
for sentence in data_train:
    data_refine.append(sentence.lower())
    #todo: strip out special characters (see the regex sketch after the output below)

data_refine


Out[10]:
['he has written two novels so far',
 'two novels have been written by him so far',
 'they will plant ten trees tomorrow',
 'ten trees will be planted by them tomorrow',
 'bruce writes a letter every week']
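
The #todo about special characters is not handled yet; a minimal sketch of that cleanup step, assuming only lowercase letters, digits, and spaces should survive (on this toy data it changes nothing, but it matters once real input is read from ./train.db):

import re

data_refine = []
for sentence in data_train:
    sentence = sentence.lower()
    # drop everything that is not a lowercase letter, digit, or space (assumption)
    sentence = re.sub(r"[^a-z0-9 ]", "", sentence)
    data_refine.append(sentence)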

In [18]:
shingle_k = 5

def shingle_extract(sentence, shingle_k):
    """Extract all character k-shingles (length-k substrings) of a sentence.

    The window slides one character at a time; duplicates collapse in the set.
    """
    return frozenset(sentence[i:i + shingle_k]
                     for i in range(len(sentence) - shingle_k + 1))

shingle_dict = []  # one set of shingles per sentence
for sentence in data_refine:
    shingle_dict.append(shingle_extract(sentence, shingle_k))

shingle_dict


Out[18]:
[frozenset({' has ',
            ' nove',
            ' so f',
            ' two ',
            ' writ',
            'as wr',
            'e has',
            'els s',
            'en tw',
            'has w',
            'he ha',
            'itten',
            'ls so',
            'n two',
            'novel',
            'o far',
            'o nov',
            'ovels',
            'ritte',
            's so ',
            's wri',
            'so fa',
            'ten t',
            'tten ',
            'two n',
            'vels ',
            'wo no',
            'writt'}),
 frozenset({' been',
            ' by h',
            ' have',
            ' him ',
            ' nove',
            ' so f',
            ' writ',
            'ave b',
            'been ',
            'by hi',
            'e bee',
            'een w',
            'els h',
            'en by',
            'en wr',
            'have ',
            'him s',
            'im so',
            'itten',
            'ls ha',
            'm so ',
            'n by ',
            'n wri',
            'novel',
            'o far',
            'o nov',
            'ovels',
            'ritte',
            's hav',
            'so fa',
            'ten b',
            'tten ',
            'two n',
            've be',
            'vels ',
            'wo no',
            'writt',
            'y him'}),
 frozenset({' plan',
            ' ten ',
            ' tomo',
            ' tree',
            ' will',
            'ant t',
            'ees t',
            'en tr',
            'es to',
            'ey wi',
            'hey w',
            'ill p',
            'l pla',
            'lant ',
            'll pl',
            'morro',
            'n tre',
            'nt te',
            'omorr',
            'orrow',
            'plant',
            'rees ',
            's tom',
            't ten',
            'ten t',
            'they ',
            'tomor',
            'trees',
            'will ',
            'y wil'}),
 frozenset({' be p',
            ' by t',
            ' plan',
            ' them',
            ' tomo',
            ' tree',
            ' will',
            'anted',
            'be pl',
            'by th',
            'd by ',
            'e pla',
            'ed by',
            'ees w',
            'em to',
            'en tr',
            'es wi',
            'hem t',
            'ill b',
            'l be ',
            'lante',
            'll be',
            'm tom',
            'morro',
            'n tre',
            'nted ',
            'omorr',
            'orrow',
            'plant',
            'rees ',
            's wil',
            'ted b',
            'ten t',
            'them ',
            'tomor',
            'trees',
            'will ',
            'y the'}),
 frozenset({' a le',
            ' ever',
            ' lett',
            ' week',
            ' writ',
            'a let',
            'bruce',
            'ce wr',
            'e wri',
            'er ev',
            'ery w',
            'es a ',
            'etter',
            'every',
            'ites ',
            'lette',
            'r eve',
            'rites',
            'ruce ',
            'ry we',
            's a l',
            'ter e',
            'tes a',
            'tter ',
            'uce w',
            'very ',
            'write',
            'y wee'})]
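
Each frozenset above is the set of all length-5 character windows of the corresponding sentence, advanced one character at a time. A quick check on a throwaway string (not part of the data) shows the behaviour:

shingle_extract("abcdef", 5)
# frozenset({'abcde', 'bcdef'})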

In [30]:
def jaccard_similarity_calculate(set_a, set_b):
    """Calculate the Jaccard similarity of two sets.

    Jaccard similarity: |A ∩ B| / |A ∪ B|
    """
    return len(set_a & set_b) / float(len(set_a | set_b))

def combination_calculate(n, k):
    """Calculate the number of k-combinations of n elements.

    combination = n! / (k! * (n-k)!)
    """
    import operator
    from functools import reduce  # reduce is no longer a builtin in Python 3
    return reduce(operator.mul, range(n - k + 1, n + 1)) // reduce(operator.mul, range(1, k + 1))

sentence_num = len(shingle_dict)
jaccard_similarity_matrix = np.zeros((sentence_num, sentence_num))
jaccard_similarity_matrix


Out[30]:
array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])
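
Before filling the matrix, a quick sanity check of the two helpers on throwaway values (not taken from the dataset): two sets sharing 2 of 4 distinct elements should score 0.5, and 5 sentences give 10 unordered pairs.

print(jaccard_similarity_calculate({"a", "b", "c"}, {"b", "c", "d"}))  # 0.5
print(combination_calculate(5, 2))                                     # 10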

In [ ]:
# fill in the pairwise Jaccard similarities between the shingle sets
for i in range(sentence_num):
    for j in range(sentence_num):
        jaccard_similarity_matrix[i, j] = jaccard_similarity_calculate(shingle_dict[i], shingle_dict[j])

jaccard_similarity_matrix
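
Since seaborn is already imported, one way to inspect the result is a heatmap of the matrix (sentence index on both axes); a minimal sketch, assuming the loop above has been run:

sns.heatmap(jaccard_similarity_matrix, annot=True, cmap="Blues")
plt.title("Pairwise Jaccard similarity of 5-shingles")
plt.show()

The two active/passive paraphrase pairs (indices 0-1 and 2-3) should stand out with noticeably higher off-diagonal values than the unrelated sentences.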