notebook.community

Edit and run



In [1]:

    
# Toy corpus: four short documents used as input for the vectorizer below.
our_texts = [
    "The cat sat on the mat",
    "The cat saw the other cat on Sat while she sat",
    "Excellent Smithers! Use the saw on the cat in the magic trick with the dog",
    "Excellent magic saw Smithers on the dog",
]



In [2]:

    
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF weighted document-term matrix from the toy corpus.
# English stop words are dropped; min_df is passed as a float, which
# scikit-learn interprets as a minimum proportion of documents.
vectorizer = TfidfVectorizer(min_df=0.1, stop_words='english', use_idf=True)
document_term_matrix = vectorizer.fit_transform(our_texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list to keep `vocab` the same type as before. Fall
# back to the old method on releases that predate the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()



In [3]:

    
# Show the terms kept by the vectorizer; this list is reused below as the
# column labels of the document-term table.
vocab









    Out[3]:





['cat',
 'dog',
 'excellent',
 'magic',
 'mat',
 'sat',
 'saw',
 'smithers',
 'trick',
 'use']



In [4]:

    
import pandas as pd



In [5]:

    
# Densify the sparse TF-IDF matrix and label its columns with the vocabulary
# so each row reads as one document's term weights.
pd.DataFrame(
    data=document_term_matrix.toarray(),
    columns=vocab,
)

Similarity among documents



In [6]:

    
from sklearn.metrics.pairwise import cosine_similarity



In [7]:

    
# Pairwise cosine similarity between the rows (documents) of the matrix.
similarity = cosine_similarity(X=document_term_matrix)



In [8]:

    
# Render the similarity matrix as a table; rows and columns get pandas'
# default integer labels.
pd.DataFrame(data=similarity)

What if we want to understand which words are most similar in this context?

Similarity among words



In [9]:

    
# Transpose so that terms become rows, then compare terms pairwise.
# NOTE: this rebinds `similarity`, replacing the document-document matrix
# assigned in the earlier cell; re-run that cell if you need it again.
similarity = cosine_similarity(document_term_matrix.transpose())



In [10]:

    
# Label both axes with the vocabulary so each cell reads as the similarity
# between the row term and the column term.
pd.DataFrame(
    data=similarity,
    index=vocab,
    columns=vocab,
)



In [ ]:

	cat	dog	excellent	magic	mat	sat	saw	smithers	trick	use
0	0.448100	0.000000	0.000000	0.000000	0.702035	0.553492	0.000000	0.000000	0.000000	0.000000
1	0.600223	0.000000	0.000000	0.000000	0.000000	0.741395	0.300111	0.000000	0.000000	0.000000
2	0.277223	0.342426	0.342426	0.342426	0.000000	0.000000	0.277223	0.342426	0.434323	0.434323
3	0.000000	0.463468	0.463468	0.463468	0.000000	0.000000	0.375218	0.463468	0.000000	0.000000

	0	1	2	3
0	1.000000	0.679316	0.124224	0.000000
1	0.679316	1.000000	0.249593	0.112607
2	0.124224	0.249593	1.000000	0.738833
3	0.000000	0.112607	0.738833	1.000000

	cat	dog	excellent	magic	mat	sat	saw	smithers	trick	use
cat	1.000000	0.206256	0.206256	0.206256	0.561040	0.937830	0.580043	0.206256	0.347095	0.347095
dog	0.206256	1.000000	1.000000	1.000000	0.000000	0.000000	0.841010	1.000000	0.594236	0.594236
excellent	0.206256	1.000000	1.000000	1.000000	0.000000	0.000000	0.841010	1.000000	0.594236	0.594236
magic	0.206256	1.000000	1.000000	1.000000	0.000000	0.000000	0.841010	1.000000	0.594236	0.594236
mat	0.561040	0.000000	0.000000	0.000000	1.000000	0.598232	0.000000	0.000000	0.000000	0.000000
sat	0.937830	0.000000	0.000000	0.000000	0.598232	1.000000	0.433532	0.000000	0.000000	0.000000
saw	0.580043	0.841010	0.841010	0.841010	0.000000	0.433532	1.000000	0.841010	0.499758	0.499758
smithers	0.206256	1.000000	1.000000	1.000000	0.000000	0.000000	0.841010	1.000000	0.594236	0.594236
trick	0.347095	0.594236	0.594236	0.594236	0.000000	0.000000	0.499758	0.594236	1.000000	1.000000
use	0.347095	0.594236	0.594236	0.594236	0.000000	0.000000	0.499758	0.594236	1.000000	1.000000