Latent Semantic Analysis

Latent semantic analysis (LSA) is a technique in natural language processing, in particular distributional semantics, for analyzing the relationships between a set of documents and the terms they contain by producing a set of concepts related to both the documents and the terms.
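
Under the hood, LSA amounts to a truncated singular value decomposition (SVD) of a term-document (here, TF-IDF) matrix: documents and terms are projected into a small number of latent "concepts". The following is a minimal sketch of that idea using plain numpy on a made-up matrix; the matrix values and variable names are purely illustrative, while the notebook itself relies on scikit-learn's TruncatedSVD.

# Minimal sketch of LSA as a truncated SVD (illustrative values only)
import numpy as np

# Rows = documents, columns = terms (e.g. TF-IDF weights)
term_doc = np.array([[1.0, 0.0, 2.0],
                     [0.0, 1.0, 1.0],
                     [1.0, 1.0, 0.0]])

# Full SVD: term_doc = U @ np.diag(s) @ Vt
U, s, Vt = np.linalg.svd(term_doc, full_matrices=False)

# Keep only the k largest singular values to obtain a k-dimensional concept space;
# this truncation is exactly what TruncatedSVD does for us below.
k = 2
doc_concepts = U[:, :k] * s[:k]   # documents expressed over k concepts
concept_terms = Vt[:k, :]         # each concept as a weighting over the terms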


In [1]:
# Import dependencies
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF term weighting
from sklearn.decomposition import TruncatedSVD                # truncated SVD used for LSA
import numpy as np
import nltk
# nltk.download('punkt')  # uncomment once if the punkt tokenizer data used by word_tokenize is missing

In [2]:
text = ['The amount of pollution is increasing day by day.',
       'The concert was just great.',
       'I love to see Gordon Ramsey cook food.',
       'Google DeepMind is introducing a new AI Technology.',
       'AI robots are examples of great technology present today.',
       'All of us were singing in the concert today.',
       'We have launched campaigns to stop pollution and global warming.']

In [3]:
# Preprocess the dataset: lowercase every sentence
dataset = [line.lower() for line in text]

In [4]:
dataset


Out[4]:
['the amount of pollution is increasing day by day.',
 'the concert was just great.',
 'i love to see gordon ramsey cook food.',
 'google deepmind is introducing a new ai technology.',
 'ai robots are examples of great technology present today.',
 'all of us were singing in the concert today.',
 'we have launched campaigns to stop pollution and global warming.']

In [5]:
# Build a TF-IDF vectorizer for the corpus
tfidf = TfidfVectorizer()

In [6]:
X = tfidf.fit_transform(dataset)

In [7]:
# Each sparse entry reads (document index, vocabulary index)  TF-IDF weight,
# e.g. in document 0 the term at vocabulary index 35 ('the') has weight 0.2316
print(X[0])


  (0, 35)	0.231608612212116
  (0, 2)	0.326425447033964
  (0, 26)	0.231608612212116
  (0, 27)	0.270961154226111
  (0, 21)	0.270961154226111
  (0, 19)	0.326425447033964
  (0, 9)	0.652850894067928
  (0, 5)	0.326425447033964
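
The sparse printout is compact but hard to read at a glance. If pandas is available, the same matrix can be inspected as a dense, labelled table with one row per document and one column per vocabulary term (an optional convenience; nothing later in the notebook depends on it):

# Optional: dense view of the TF-IDF matrix (assumes pandas is installed)
import pandas as pd
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names())
print(tfidf_df.round(3))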

In [8]:
# n_components: number of latent concepts to extract; n_iter: iterations of the randomized SVD solver.
# No random_state is set, so the fitted components can vary slightly between runs.
lsa = TruncatedSVD(n_components=4, n_iter=100)

In [9]:
lsa.fit(X)


Out[9]:
TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
       random_state=None, tol=0.0)
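
The fitted model also reports how much of the variance in the TF-IDF matrix each concept explains, which is a quick sanity check on the choice of n_components (not executed here; the exact numbers depend on the run since no random_state was set):

# Fraction of variance captured by each of the four concepts, and their total
print(lsa.explained_variance_ratio_)
print(lsa.explained_variance_ratio_.sum())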

In [10]:
row1 = lsa.components_[0]

In [11]:
row1


Out[11]:
array([0.21978169, 0.15860868, 0.10022888, 0.01694285, 0.15936535,
       0.10022888, 0.01694285, 0.29718798, 0.00386627, 0.20045775,
       0.10540448, 0.15936535, 0.00386627, 0.01694285, 0.10540448,
       0.00386627, 0.29781608, 0.01694285, 0.15860868, 0.10022888,
       0.10540448, 0.17069334, 0.19941208, 0.01694285, 0.00386627,
       0.10540448, 0.29672746, 0.09726259, 0.15936535, 0.00386627,
       0.15936535, 0.00386627, 0.15860868, 0.01694285, 0.21978169,
       0.32514182, 0.01727336, 0.26394575, 0.15860868, 0.01694285,
       0.19941208, 0.01694285, 0.15860868])
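
Each row of lsa.components_ describes one concept as a weight for every vocabulary term, so row1 holds one value per term and the full matrix has shape (n_components, n_terms). A quick check of the dimensions (the values in the comments are what this corpus should yield):

print(lsa.components_.shape)  # (4, 43): 4 concepts x 43 vocabulary terms
print(len(row1))              # 43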

In [12]:
# Vocabulary terms, in the same order as the columns of lsa.components_
# (newer scikit-learn releases replace get_feature_names() with get_feature_names_out())
imp_terms = tfidf.get_feature_names()

In [13]:
imp_terms


Out[13]:
['ai',
 'all',
 'amount',
 'and',
 'are',
 'by',
 'campaigns',
 'concert',
 'cook',
 'day',
 'deepmind',
 'examples',
 'food',
 'global',
 'google',
 'gordon',
 'great',
 'have',
 'in',
 'increasing',
 'introducing',
 'is',
 'just',
 'launched',
 'love',
 'new',
 'of',
 'pollution',
 'present',
 'ramsey',
 'robots',
 'see',
 'singing',
 'stop',
 'technology',
 'the',
 'to',
 'today',
 'us',
 'warming',
 'was',
 'we',
 'were']
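
A term's position in this list is the same column index that appeared in the sparse TF-IDF printout earlier, so the two views can be cross-checked:

# Vocabulary index 35 is 'the', the first entry printed for document 0 above
print(imp_terms[35])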

In [14]:
# For each concept, pair every vocabulary term with its weight and print the ten strongest
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(imp_terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
    print('\nConcept: ',i)
    for term in sortedTerms:
        print(term)


Concept:  0
('the', 0.32514181559191574)
('great', 0.29781607563038975)
('concert', 0.29718797776320965)
('of', 0.29672746406942835)
('today', 0.26394575325530384)
('technology', 0.21978169272259968)
('ai', 0.21978169272259956)
('day', 0.20045775332963428)
('just', 0.1994120779518987)
('was', 0.1994120779518987)

Concept:  1
('to', 0.3485938243525486)
('pollution', 0.24010551449293846)
('cook', 0.21080444346498864)
('food', 0.21080444346498864)
('gordon', 0.21080444346498864)
('love', 0.21080444346498864)
('ramsey', 0.21080444346498864)
('see', 0.21080444346498864)
('and', 0.2091446642680138)
('campaigns', 0.2091446642680138)

Concept:  2
('ai', 0.32926821071879864)
('technology', 0.32926821071879847)
('deepmind', 0.2697281954559743)
('google', 0.2697281954559743)
('introducing', 0.2697281954559743)
('new', 0.2697281954559743)
('is', 0.2055757082124677)
('are', 0.12693944933958315)
('examples', 0.12693944933958315)
('present', 0.12693944933958315)

Concept:  3
('day', 0.43627235382552015)
('pollution', 0.2448711875222656)
('by', 0.21813617691276008)
('increasing', 0.21813617691276008)
('amount', 0.21813617691275994)
('is', 0.17556655541341218)
('the', 0.08812364307404918)
('campaigns', 0.0768588273747975)
('global', 0.0768588273747975)
('have', 0.0768588273747975)

In [15]:
concept_words = {}

In [16]:
# Keep the ten strongest terms per concept; these are used to score the sentences below
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(imp_terms,comp)
    sortedTerms = sorted(componentTerms,key=lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
    concept_words['Concept ' + str(i)] = sortedTerms

In [17]:
# Score each sentence against every concept by summing the weights of its words
# that appear among that concept's top ten terms
for key in concept_words.keys():
    sentence_scores = []
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        sentence_scores.append(score)
    print('\n'+key+':')
    for sent_score in sentence_scores:
        print(sent_score)


Concept 0:
1.0227847863206128
1.3189700248893126
0
0.43956338544519924
1.2980526784003212
1.1830030106798575
0

Concept 1:
0.24010551449293846
0
1.6134204851424805
0
0
0
1.0069886673815147

Concept 2:
0.2055757082124677
0
0
1.9430249114739622
1.0393547694563465
0
0

Concept 3:
2.035514624399047
0.08812364307404918
0
0.17556655541341218
0
0.08812364307404918
0.47544766964665813
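
The scores above are obtained by summing word weights from each concept's top terms. A more direct way to relate documents to concepts is to project the TF-IDF matrix into the concept space with lsa.transform, which gives one score per document per concept. A short sketch (output omitted; these numbers will not match the word-sum scores above):

# Document-concept matrix: one row per document, one column per concept
doc_concept = lsa.transform(X)
for i, sentence in enumerate(dataset):
    # argmax picks the concept with the strongest score for this document
    print(doc_concept[i].round(3), '->', 'Concept', doc_concept[i].argmax(), '|', sentence)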