In [106]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from nltk.corpus import gutenberg
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import math
from textblob import TextBlob as tb

Drill: tf-idf scores


In [107]:
#Define tf_idf
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [108]:
#Data
phrase1 = tb("The best Monty Python sketch is the one about the dead parrot, I laughed so hard.")
phrase2 = tb("I laugh when I think about Python's Ministry of Silly Walks sketch, it is funny, funny, funny, the best!")
phrase3 = tb("Chocolate is the best ice cream dessert topping, with a great taste.")
phrase4 = tb("The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing.")
phrase5 = tb("I would rather put strawberries on my ice cream for dessert, they have the best taste.")
phrase6 = tb("The taste of caramel is a fantastic accompaniment to tasty mint ice cream.")

In [109]:
#Build list and vocabulary
bloblist = [phrase1, phrase2, phrase3, phrase4, phrase5, phrase6]
voc = ['Monty', 'Python', 'sketch', 'laugh', 'funny', 'best', 'ice cream', 'dessert', 'taste']

for i, blob in enumerate(bloblist):
    print("Top words in phrase {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in voc}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:9]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))


Top words in phrase 1
	Word: sketch, TF-IDF: 0.04332
	Word: Monty, TF-IDF: 0.04332
	Word: Python, TF-IDF: 0.02534
	Word: best, TF-IDF: 0.0114
	Word: taste, TF-IDF: 0.0
	Word: funny, TF-IDF: 0.0
	Word: laugh, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: dessert, TF-IDF: 0.0
Top words in phrase 2
	Word: funny, TF-IDF: 0.16479
	Word: laugh, TF-IDF: 0.05493
	Word: sketch, TF-IDF: 0.03466
	Word: Python, TF-IDF: 0.02027
	Word: best, TF-IDF: 0.00912
	Word: taste, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: Monty, TF-IDF: 0.0
	Word: dessert, TF-IDF: 0.0
Top words in phrase 3
	Word: dessert, TF-IDF: 0.05776
	Word: taste, TF-IDF: 0.03379
	Word: best, TF-IDF: 0.01519
	Word: funny, TF-IDF: 0.0
	Word: laugh, TF-IDF: 0.0
	Word: sketch, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: Monty, TF-IDF: 0.0
	Word: Python, TF-IDF: 0.0
Top words in phrase 4
	Word: Monty, TF-IDF: 0.04077
	Word: Python, TF-IDF: 0.02385
	Word: best, TF-IDF: 0.0
	Word: taste, TF-IDF: 0.0
	Word: funny, TF-IDF: 0.0
	Word: laugh, TF-IDF: 0.0
	Word: sketch, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: dessert, TF-IDF: 0.0
Top words in phrase 5
	Word: dessert, TF-IDF: 0.04332
	Word: taste, TF-IDF: 0.02534
	Word: best, TF-IDF: 0.0114
	Word: funny, TF-IDF: 0.0
	Word: laugh, TF-IDF: 0.0
	Word: sketch, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: Monty, TF-IDF: 0.0
	Word: Python, TF-IDF: 0.0
Top words in phrase 6
	Word: taste, TF-IDF: 0.03119
	Word: best, TF-IDF: 0.0
	Word: funny, TF-IDF: 0.0
	Word: laugh, TF-IDF: 0.0
	Word: sketch, TF-IDF: 0.0
	Word: ice cream, TF-IDF: 0.0
	Word: Monty, TF-IDF: 0.0
	Word: Python, TF-IDF: 0.0
	Word: dessert, TF-IDF: 0.0

In [110]:
corpus = ["The best Monty Python sketch is the one about the dead parrot, I laughed so hard.",
          "I laugh when I think about Python's Ministry of Silly Walks sketch, it is funny, funny, funny, the best!",
          "Chocolate is the best ice cream dessert topping, with a great taste.",
          "The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing.",
          "I would rather put strawberries on my ice cream for dessert, they have the best taste.",
          "The taste of caramel is a fantastic accompaniment to tasty mint ice cream."]

vectorizer = TfidfVectorizer(min_df=1)

X = vectorizer.fit_transform(corpus)

idf = vectorizer.idf_

features = vectorizer.get_feature_names()
tfidf = dict(zip(vectorizer.get_feature_names(), idf))
tfidf


Out[110]:
{'about': 1.8472978603872037,
 'accompaniment': 2.2527629684953681,
 'best': 1.336472236621213,
 'bit': 2.2527629684953681,
 'can': 2.2527629684953681,
 'caramel': 2.2527629684953681,
 'chocolate': 2.2527629684953681,
 'cream': 1.5596157879354227,
 'dead': 2.2527629684953681,
 'dessert': 1.8472978603872037,
 'fantastic': 2.2527629684953681,
 'for': 2.2527629684953681,
 'funniest': 2.2527629684953681,
 'funny': 2.2527629684953681,
 'great': 2.2527629684953681,
 'hard': 2.2527629684953681,
 'have': 2.2527629684953681,
 'ice': 1.5596157879354227,
 'is': 1.1541506798272583,
 'it': 1.8472978603872037,
 'laugh': 2.2527629684953681,
 'laughed': 2.2527629684953681,
 'laughing': 2.2527629684953681,
 'lumberjack': 2.2527629684953681,
 'ministry': 2.2527629684953681,
 'mint': 2.2527629684953681,
 'monty': 1.8472978603872037,
 'my': 2.2527629684953681,
 'of': 1.5596157879354227,
 'on': 2.2527629684953681,
 'one': 2.2527629684953681,
 'parrot': 2.2527629684953681,
 'put': 2.2527629684953681,
 'python': 1.5596157879354227,
 'rather': 2.2527629684953681,
 'silly': 2.2527629684953681,
 'sketch': 1.8472978603872037,
 'so': 2.2527629684953681,
 'song': 2.2527629684953681,
 'strawberries': 2.2527629684953681,
 'taste': 1.5596157879354227,
 'tasty': 2.2527629684953681,
 'the': 1.0,
 'they': 2.2527629684953681,
 'think': 1.8472978603872037,
 'to': 2.2527629684953681,
 'topping': 2.2527629684953681,
 'walks': 2.2527629684953681,
 'when': 2.2527629684953681,
 'with': 2.2527629684953681,
 'without': 2.2527629684953681,
 'would': 2.2527629684953681}

Drills


In [111]:
#reading in the data, this time in the form of paragraphs
emma=gutenberg.paras('austen-emma.txt')

#processing
emma_paras=[]
for paragraph in emma:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    para=[re.sub(r'VOLUME \w+', '', word) for word in para]
    para=[re.sub(r'CHAPTER \w+', '', word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])


['[ Emma by Jane Austen 1816 ]', 'VOLUME I', 'CHAPTER I', 'Emma Woodhouse , handsome , clever , and rich , with a comfortable home and happy disposition , seemed to unite some of the best blessings of existence ; and had lived nearly twenty - one years in the world with very little to distress or vex her .']

In [112]:
#Split data and vectorize
X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.9, # drop words that occur in more than half the paragraphs
                             min_df=10, # only use words that appear at X times (being X the number used)
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=False,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True, #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                           )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])


Number of features: 382
Original sentence: A very few minutes more , however , completed the present trial .
Tf_idf vector: {'minutes': 0.70710678118654746, 'present': 0.70710678118654746}

In [113]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 75.8663260096
Component 0:
" You have made her too tall , Emma ," said Mr . Knightley .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     0.800488
Emma could not have desired a more spirited rejection of Mr . Martin ' s prose .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 0.796912
" You get upon delicate subjects , Emma ," said Mrs . Weston smiling ; " remember that I am here . Mr .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          0.781704
Mrs . Weston was acting no part , feigning no feelings in all that she said to him in favour of the event . She had been extremely surprized , never more so , than when Emma first opened the affair to her ; but she saw in it only increase of happiness to all , and had no scruple in urging him to the utmost . She had such a regard for Mr . Knightley , as to think he deserved even her dearest Emma ; and it was in every respect so proper , suitable , and unexceptionable a connexion , and in one respect , one point of the highest importance , so peculiarly eligible , so singularly fortunate , that now it seemed as if Emma could not safely have attached herself to any other creature , and that she had herself been the stupidest of beings in not having thought of it , and wished it long ago . How very few of those men in a rank of life to address Emma would have renounced their own home for Hartfield !    0.767525
" Emma ," said Mr . Knightley presently , " I have a piece of news for you .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     0.766765
Mr . Knightley might quarrel with her , but Emma could not quarrel with herself .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.762165
Emma found that it was not Mr . Weston ' s fault that the number of privy councillors was not yet larger .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       0.761571
Emma was most agreeably surprized . Mr .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         0.753738
The intermediate month was the one fixed on , as far as they dared , by Emma and Mr .                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            0.743976
" Now ," said Emma , when they were fairly beyond the sweep gates , " now Mr . Weston , do let me know what has happened ."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      0.724880
Name: 0, dtype: float64
Component 1:
" Oh !     0.997991
" Oh !"    0.997991
" Oh !     0.997991
" Oh !     0.997991
" Oh !"    0.997991
" Oh !     0.997991
" Oh !     0.997991
" Oh !     0.997991
" Oh !     0.997991
" Oh !     0.997991
Name: 1, dtype: float64
Component 2:
" In one respect , perhaps , Mr . Elton ' s manners are superior to Mr . Knightley ' s or Mr . Weston ' s .                                                                                                                                                                                                                                                                          0.750133
" Why will not you write one yourself for us , Mr .                                                                                                                                                                                                                                                                                                                                  0.682854
"` Mr .                                                                                                                                                                                                                                                                                                                                                                              0.682854
The carriage came : and Mr . Woodhouse , always the first object on such occasions , was carefully attended to his own by Mr . Knightley and Mr . Weston ; but not all that either could say could prevent some renewal of alarm at the sight of the snow which had actually fallen , and the discovery of a much darker night than he had been prepared for .                       0.676512
Mr . Knightley had done all in his power for Mr . Woodhouse ' s entertainment .                                                                                                                                                                                                                                                                                                      0.671532
Mr . Woodhouse at last was off ; but Mr . Knightley , instead of being immediately off likewise , sat down again , seemingly inclined for more chat .                                                                                                                                                                                                                                0.652959
" Mr . Knightley was there too , was he ?"                                                                                                                                                                                                                                                                                                                                           0.634365
Mr . Knightley grew angry .                                                                                                                                                                                                                                                                                                                                                          0.634365
She had not time to know how Mr . Elton took the reproof , so rapidly did another subject succeed ; for Mr . John Knightley now came into the room from examining the weather , and opened on them all with the information of the ground being covered with snow , and of its still snowing fast , with a strong drifting wind ; concluding with these words to Mr . Woodhouse :    0.633847
" I do own myself to have been completely mistaken in Mr . Elton .                                                                                                                                                                                                                                                                                                                   0.622071
Name: 2, dtype: float64
Component 3:
" So do I ," said Mrs . Weston gently , " very much ."                                                                                                                                                                                                           0.723249
" No trouble in the world , ma ' am ," said the obliging Mrs . Ford .                                                                                                                                                                                            0.657123
" Well ," said Mrs . Weston , smiling , " you give him credit for more simple , disinterested benevolence in this instance than I do ; for while Miss Bates was speaking , a suspicion darted into my head , and I have never been able to get it out again .    0.632900
" My advice ," said Mrs . Weston kindly and persuasively , " I certainly do feel tempted to give .                                                                                                                                                               0.621415
" Now , ma ' am ," said Jane to her aunt , " shall we join Mrs .                                                                                                                                                                                                 0.587568
Miss Bates and Miss Fairfax , escorted by the two gentlemen , walked into the room ; and Mrs . Elton seemed to think it as much her duty as Mrs . Weston ' s to receive them .                                                                                   0.579231
" Well done , Mrs .                                                                                                                                                                                                                                              0.573555
[ To Mrs .                                                                                                                                                                                                                                                       0.573555
" How do you do , Mrs . Ford ?                                                                                                                                                                                                                                   0.573555
" Mrs . Dixon !                                                                                                                                                                                                                                                  0.573555
Name: 3, dtype: float64
Component 4:
CHAPTER II      1.0
CHAPTER IV      1.0
CHAPTER II      1.0
CHAPTER III     1.0
CHAPTER XVI     1.0
CHAPTER XIV     1.0
CHAPTER X       1.0
CHAPTER V       1.0
CHAPTER X       1.0
CHAPTER XVII    1.0
Name: 4, dtype: float64

In [114]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])


Key:
0 That is _court_ .
1 " Yes , sir , I did indeed ; and I am very much obliged by your kind solicitude about me ."
2 " How much his business engrosses him already is very plain from the circumstance of his forgetting to inquire for the book you recommended .
3 To restrain him as much as might be , by her own manners , she was immediately preparing to speak with exquisite calmness and gravity of the weather and the night ; but scarcely had she begun , scarcely had they passed the sweep - gate and joined the other carriage , than she found her subject cut up  her hand seized  her attention demanded , and Mr . Elton actually making violent love to her : availing himself of the precious opportunity , declaring sentiments which must be already well known , hoping  fearing  adoring  ready to die if she refused him ; but flattering himself that his ardent attachment and unequalled love and unexampled passion could not fail of having some effect , and in short , very much resolved on being seriously accepted as soon as possible .
4 Emma smiled and answered " My visit was of use to the nervous part of her complaint , I hope ; but not even I can charm away a sore throat ; it is a most severe cold indeed .
5 A very few minutes more , however , completed the present trial .
6 " I am delighted to hear you speak so stoutly on the subject ," replied Emma , smiling ; " but you do not mean to deny that there was a time  and not very distant either  when you gave me reason to understand that you did care about him ?"
7 " Very well ; and if he had intended to give her one , he would have told her so ."
8 Some laughed , and answered good - humouredly .
9 " There appeared such a perfectly good understanding among them all " he began rather quickly , but checking himself , added , " however , it is impossible for me to say on what terms they really were  how it might all be behind the scenes .

Drill 0: Test set


In [115]:
#Reshapes the vectorizer output into something people can read
X_test_tfidf_csr = X_test_tfidf.tocsr()

#number of paragraphs
n = X_test_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_test_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_test_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_test[5])
print('Tf_idf vector:', tfidf_bypara[5])


Original sentence: " And I am quite serious too , I assure you ," replied Mrs . Elton gaily , " in resolving to be always on the watch , and employing my friends to watch also , that nothing really unexceptionable may pass us ."
Tf_idf vector: {'really': 0.35355339059327373, 'pass': 0.35355339059327373, 'assure': 0.35355339059327373, 'mrs': 0.35355339059327373, 'friends': 0.35355339059327373, 'quite': 0.35355339059327373, 'elton': 0.35355339059327373, 'replied': 0.35355339059327373}

In [116]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_test_lsa,index=X_test)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 79.6598094568
Component 0:
The event was more favourable to Mr . Woodhouse than to Emma .                                                                                                                                                                            0.717394
" Well , Mrs . Weston ," said Emma triumphantly when he left them , " what do you say now to Mr . Knightley ' s marrying Jane Fairfax ?"                                                                                                  0.698278
In this walk Emma and Mr . Weston found all the others assembled ; and towards this view she immediately perceived Mr . Knightley and Harriet distinct from the rest , quietly leading the way .                                          0.697127
" He is a person I never think of from one month ' s end to another ," said Mr . Knightley , with a degree of vexation , which made Emma immediately talk of something else , though she could not comprehend why he should be angry .    0.684809
Emma and Harriet had been walking together one morning , and , in Emma ' s opinion , had been talking enough of Mr . Elton for that day .                                                                                                 0.676590
Mr . Elton had retreated into the card - room , looking ( Emma trusted ) very foolish .                                                                                                                                                   0.672216
Emma was extremely gratified . They were interrupted by the bustle of Mr . Weston calling on every body to begin dancing again .                                                                                                          0.670927
" And when ," thought Emma , " will there be a beginning of Mr .                                                                                                                                                                          0.657018
Emma would not have smiled for the world , and only said , " Is Mr . Elton gone on foot to Donwell ? He will have a hot walk ."                                                                                                           0.648608
Emma was more than half in hopes of Mr . Elton ' s having dropt a hint .                                                                                                                                                                  0.642841
Name: 0, dtype: float64
Component 1:
" Oh !            0.999456
" Oh no , no !    0.999456
" Oh !            0.999456
" Oh !            0.999456
" Oh !            0.999456
" Oh !            0.999456
" Oh !            0.999456
" Oh !            0.999456
" Me ! oh !       0.999456
" Oh !            0.999456
Name: 1, dtype: float64
Component 2:
" She must have some motive , more powerful than appears , for refusing this invitation ," was Emma ' s conclusion .                                0.701628
Emma had done .                                                                                                                                     0.701628
Emma wondered on what , of all the medley , she would fix .                                                                                         0.701628
" And I do envy him , Emma .                                                                                                                        0.701628
" My Emma !"                                                                                                                                        0.701628
" Such an imagination has crossed me , I own , Emma ; and if it never occurred to you before , you may as well take it into consideration now ."    0.701628
" Emma !"                                                                                                                                           0.701628
Emma denied none of it aloud , and agreed to none of it in private .                                                                                0.701628
Emma was in no danger of forgetting .                                                                                                               0.683578
Emma seriously hoped she would .                                                                                                                    0.671660
Name: 2, dtype: float64
Component 3:
" I am not fond of dinner - visiting ," said he " I never was .                                                                                                                                       0.676805
" It is to be a secret , I conclude ," said he .                                                                                                                                                      0.675223
" Colonel and Mrs . Campbell are to be in town again by midsummer ," said Jane .                                                                                                                      0.642224
" Well  if you please ," said Mrs . Weston rather hesitating , " if you think she will be of any use ."                                                                                               0.637882
" It is Frank and Miss Fairfax ," said Mrs . Weston .                                                                                                                                                 0.627133
" Do come with me ," said Mrs . Weston , " if it be not very disagreeable to you .                                                                                                                    0.620625
" Well ," said Mrs . Weston , laughing , " perhaps the greatest good he could do them , would be to give Jane such a respectable home ."                                                              0.611626
" I should not wonder ," said Mrs . Weston , " if Miss Fairfax were to have been drawn on beyond her own inclination , by her aunt ' s eagerness in accepting Mrs . Elton ' s civilities for her .    0.597601
" If you are very kind ," said he , " it will be one of the waltzes we danced last night ; let me live them over again .                                                                              0.569466
" Well ," said Mrs . Elton , laughing , " we shall see ."                                                                                                                                             0.556614
Name: 3, dtype: float64
Component 4:
CHAPTER XV       1.0
CHAPTER XII      1.0
CHAPTER XIV      1.0
CHAPTER XIX      1.0
CHAPTER XIII     1.0
CHAPTER XV       1.0
CHAPTER XIII     1.0
CHAPTER IV       1.0
CHAPTER VII      1.0
CHAPTER XVIII    1.0
Name: 4, dtype: float64

In [117]:
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(X_test_lsa) * np.asmatrix(X_test_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_test).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])


Key:
0 Mr . Woodhouse had so completely made up his mind to the visit , that in spite of the increasing coldness , he seemed to have no idea of shrinking from it , and set forward at last most punctually with his eldest daughter in his own carriage , with less apparent consciousness of the weather than either of the others ; too full of the wonder of his own going , and the pleasure it was to afford at Randalls to see that it was cold , and too well wrapt up to feel it .
1 " Oh !
2 " Oh no , no !
3 Such was Jane Fairfax ' s history .
4 " That has been a good deal the case , my dear ; but not to the degree you mention .
5 " And I am quite serious too , I assure you ," replied Mrs . Elton gaily , " in resolving to be always on the watch , and employing my friends to watch also , that nothing really unexceptionable may pass us ."
6 " And here is Mrs . Weston and Mr . Frank Churchill too ! Quite delightful ; so many friends !"
7 " You may well class the delight , the honour , and the comfort of such a situation together ," said Jane , " they are pretty sure to be equal ; however , I am very serious in not wishing any thing to be attempted at present for me .
8 Harriet , Mr . Elton , and Mr . Knightley , their own especial set , were the only persons invited to meet them ; the hours were to be early , as well as the numbers few ; Mr . Woodhouse ' s habits and inclination being consulted in every thing .
9 " Oh !

The heatmap shows that sentences 1,2 present similarity between them and with sentence 9 as oh! appears in all of them.

Drill 1: Tweaking tf-idf

When the threshold regarding the number of times that a word appears (min_df) increases (from 2 to 10), the noise is reduced reducing the number of features significantly (form >2000 to <1000) and the percentage of variance explained increases to 68%.

Dropping the words that appear in certain number of paragraphs (max_df) from (0.5 to 0.2) has a lower impact than the number of times a word appears in the text. It could be due to the fact that after cleaning the text from punctuation and stopwords there are not too many words that appear in more than one of the paragraphs. Increasing this threshold makes the number of features higher as it will be harder to find words that are repeated in a higher number of paragraphs.

The use of n-grams in a range 1-3 penalises the number of features used to have the same percentage of variance explained. More features are required to achieve the same level of variance.

Not using inverse frequency in the vectorizer increases the variance explained up to 75% with the same number of features than when it is used. In this case, max_df has a significant impact in the level of variance explained (73% for 0.1 to 76% with 0.9) with the same number of features (roughly 382-5)