In [1]:
# Import Dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import random
import re
import numpy as np
import nltk
In [2]:
# Toy corpus: seven short sentences spanning a few loose topics
# (pollution, music/concerts, cooking, AI/technology).
text = [
    'The amount of pollution is increasing day by day.',
    'The concert was just great.',
    'I love to see Gordon Ramsey cook food.',
    'Google DeepMind is introducing a new AI Technology.',
    'AI robots are examples of great technology present today.',
    'All of us were singing in the concert today.',
    'We have launched campaigns to stop pollution and global warming.',
]
In [3]:
# Preprocess: lower-case every sentence so token case ('AI' vs 'ai')
# does not split the vocabulary downstream.
dataset = list(map(str.lower, text))
In [4]:
# Display the lower-cased corpus (bare expression -> rich cell output).
dataset
Out[4]:
In [5]:
# Get TF-IDF for text.
# Default settings: word tokenization, lowercasing (already applied above),
# smoothed IDF, L2-normalized document rows.
tfidf = TfidfVectorizer()
In [6]:
# Learn the vocabulary and transform the corpus into a sparse
# (n_documents x n_vocabulary) TF-IDF matrix.
X = tfidf.fit_transform(dataset)
In [7]:
# Inspect the first document's row of the sparse matrix. Each printed entry
# is (doc index, vocabulary index) -> TF-IDF weight, e.g.:
# Document no.: 0, Index of word in BoW Model: 35, TF-IDF Value: 0.231608612212116
print(X[0])
In [8]:
# n_components: number of latent concepts to extract,
# n_iter: number of iterations for the randomized SVD solver.
# random_state pins the randomized solver so the extracted concepts are
# reproducible across kernel restarts (TruncatedSVD's default algorithm
# is stochastic; without a seed the concept loadings change every run).
lsa = TruncatedSVD(n_components=4, n_iter=100, random_state=42)
In [9]:
# Fit the LSA model: decompose the TF-IDF matrix into 4 latent concepts.
lsa.fit(X)
Out[9]:
In [10]:
# Loadings of every vocabulary term on the first latent concept.
row1 = lsa.components_[0]
In [11]:
# Display the concept-0 term-loading vector.
row1
Out[11]:
In [12]:
# Corresponding to each concept, which are the most important words.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement and returns
# the vocabulary terms in the column order of the TF-IDF matrix.
imp_terms = tfidf.get_feature_names_out()
In [13]:
# Display the vocabulary terms (column labels of X / lsa.components_).
imp_terms
Out[13]:
In [14]:
# For each latent concept, show the ten vocabulary terms with the
# highest loadings -- these characterize what the concept is "about".
for concept_idx, loadings in enumerate(lsa.components_):
    top_terms = sorted(zip(imp_terms, loadings),
                       key=lambda pair: pair[1], reverse=True)[:10]
    print('\nConcept: ', concept_idx)
    for pair in top_terms:
        print(pair)
In [15]:
# Maps a concept label ("Concept 0", ...) to its top-10 (term, loading) pairs.
concept_words = {}
In [16]:
# Store the ten highest-loading (term, loading) pairs for every concept,
# keyed by a human-readable concept label.
for concept_idx, loadings in enumerate(lsa.components_):
    ranked = sorted(zip(imp_terms, loadings),
                    key=lambda pair: pair[1], reverse=True)
    concept_words['Concept ' + str(concept_idx)] = ranked[:10]
In [17]:
# Score every sentence against every concept: a sentence's score for a
# concept is the sum of the loadings of its tokens that appear among that
# concept's top terms (each token occurrence counts separately).
# NOTE(review): nltk.word_tokenize needs the 'punkt' tokenizer data to be
# downloaded -- confirm it is available in the kernel environment.
for concept_label, term_scores in concept_words.items():
    weight_of = dict(term_scores)  # terms are unique, so no pairs are lost
    sentence_scores = []
    for sentence in dataset:
        tokens = nltk.word_tokenize(sentence)
        sentence_scores.append(sum(weight_of.get(token, 0) for token in tokens))
    print('\n' + concept_label + ':')
    for value in sentence_scores:
        print(value)