I was given a task to review four text files and extract their most important keywords.
The data is four Wikipedia articles, each relating to food: script.txt is a general article about food, transcript_1.txt is about fast food, transcript_2.txt is about a restaurant, and transcript_3.txt is about cooking.
I need to find the most important keywords, which also means deciding what "important" means. I see two options: importance as the weight a word carries in a learned topic (topic modelling with LDA), or importance as a TF-IDF score.
I'll try the first approach, and fall back to the second if it doesn't work out.
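Before diving in, here is a minimal sketch of the second definition on made-up toy sentences (not the real data), using the same older sklearn API as the rest of this notebook (on newer versions get_feature_names is get_feature_names_out):
In [ ]:
# Sketch only: score words by TF-IDF weight on toy sentences (not the real articles).
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["pizza is a fast food",
            "cooking at home takes time",
            "the restaurant serves pizza"]
vec = TfidfVectorizer(stop_words='english')
weights = vec.fit_transform(toy_docs)
terms = vec.get_feature_names()
for doc_idx, doc in enumerate(toy_docs):
    row = weights[doc_idx].toarray()[0]
    top = row.argsort()[::-1][:2]   # indices of the two highest-weighted terms for this document
    print(doc, '->', [terms[i] for i in top])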
In [500]:
%pylab inline
import sklearn
import pandas as pd
import numpy as np
import nltk
import os
import re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.pipeline import Pipeline
In [501]:
# read every .txt file in ./data (and then ./data/transcripts) into d,
# splitting each one on '.' to build a sentence-level corpus
path = './data'
d = {}
corpus = []
for name in os.listdir(path):
    if name.endswith('.txt'):
        f_path = os.path.join(path, name)
        with open(f_path) as f:
            d[name] = f.read()
            corpus += d[name].split('.')
path += '/transcripts'
for name in os.listdir(path):
    if name.endswith('.txt'):
        f_path = os.path.join(path, name)
        with open(f_path) as f:
            d[name] = f.read()
            corpus += d[name].split('.')
In [593]:
class Tokenizer():
    def __init__(self):
        self.lemma = nltk.stem.WordNetLemmatizer()
        self.stem = nltk.stem.SnowballStemmer("english")
        self.tokenizer = nltk.RegexpTokenizer(r'\w+')   # split on word characters only
        self.reg = re.compile(r'\d+')                   # used to strip digits

    def proc_word(self, word):
        word = self.reg.sub('', word)        # drop digits
        word = self.lemma.lemmatize(word)    # lemmatize, e.g. "dishes" -> "dish"
        return word

    def __call__(self, doc):
        res = [self.proc_word(word) for word in self.tokenizer.tokenize(doc.lower())]
        res = list(filter(lambda x: len(x) > 3, res))   # keep only words longer than 3 characters
        return res

T = Tokenizer()
T("hello the man is Great")
Out[593]:
In [503]:
tf_vectorizer = CountVectorizer(max_df=0.75,min_df=0.25, stop_words='english',tokenizer=Tokenizer(),ngram_range=(1,1))
tf = tf_vectorizer.fit(corpus)
tfed = tf.transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()
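As a quick sanity check on the vectorizer (a sketch I'm adding here, reusing tfed and tf_feature_names from the cell above), the count matrix can be summed column-wise to see which surviving terms are most frequent overall:
In [ ]:
# Sketch: total count of each surviving term across the whole sentence corpus.
counts = np.asarray(tfed.sum(axis=0)).ravel()   # tfed is a scipy sparse matrix
for i in counts.argsort()[::-1][:10]:           # ten most frequent terms
    print(tf_feature_names[i], int(counts[i]))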
In [504]:
lda = LatentDirichletAllocation(n_topics=len(d)*2, max_iter=6, learning_method='online', learning_offset=50.,random_state=0).fit(tfed)
In [505]:
def build_pipeline(num_docs):
    '''
    :param num_docs: Number of topics for the LDA model to learn
    :return: A Pipeline chaining CountVectorizer and LDA with the settings below
    '''
    tf_vectorizer = CountVectorizer(
        max_df=0.95,            # ignore words that appear in more than 95% of the documents (corpus-specific stop words)
        min_df=5,               # only use words that appear in at least 5 documents
        stop_words='english',
        tokenizer=Tokenizer(),  # use our custom tokenizer
        ngram_range=(1, 3)      # use key phrases of length 1, 2 or 3
    )
    lda = LatentDirichletAllocation(
        n_topics=num_docs,      # learn one topic per document passed in
        max_iter=6,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    pipeline = Pipeline([('count_vectorizer', tf_vectorizer), ('lda', lda)])
    return pipeline
In [506]:
pipeline =build_pipeline(len(d))
model = pipeline.fit(corpus)
In [507]:
model.named_steps["count_vectorizer"].get_feature_names()
Out[507]:
In [533]:
def display_topics(comps, feature_names, no_top_words):
    # print the top `no_top_words` words for each topic, highest weight first
    for topic_idx, topic in enumerate(comps):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
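The slice topic.argsort()[:-no_top_words - 1:-1] can look cryptic, so here is a quick toy check (my addition, not part of the analysis) of what it returns:
In [ ]:
# Toy check of the argsort slice above: it yields the indices of the
# highest-weighted entries, largest first.
weights = np.array([0.1, 0.5, 0.2, 0.9])
print(weights.argsort())               # [0 2 1 3] -> indices sorted by ascending weight
print(weights.argsort()[:-2 - 1:-1])   # [3 1]     -> indices of the top 2 weights, highest first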
In [536]:
no_top_words = 10
display_topics(model.named_steps["lda"].components_, model.named_steps["count_vectorizer"].get_feature_names(), no_top_words)
In [532]:
model.named_steps["lda"].components_
Out[532]:
In [510]:
D["script.txt"].argsort()[::-1]
Out[510]:
In [511]:
def pipeline(doc):
    # note: this shadows the sklearn Pipeline built above; it returns the topic
    # distribution the fitted model assigns to a single document
    res = model.transform([doc])
    return res[0]
In [512]:
res = {name:pipeline(doc) for name,doc in d.items()}
In [513]:
pipeline(d["pizza.txt"])[0]
Out[513]:
In [514]:
D =pd.DataFrame(res)
In [515]:
D.plot.bar(figsize=(20,10))
Out[515]:
In [781]:
D[D>0.1].T.plot.bar(figsize=(20,10),colormap='jet',title="Most important topics per document")
Out[781]:
In [517]:
sns.heatmap(D.T)
Out[517]:
In [518]:
normed = (D.T - D.T.mean())/D.T.std()
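To spell out what the line above does (my note): each topic column of D.T is standardised across documents, so a value above 1 means that document weights the topic more than one standard deviation above the average document. A toy check with made-up numbers:
In [ ]:
# Toy check (made-up numbers) of the z-scoring above: each column (topic) is
# standardised across rows (documents); values > 1 flag document/topic pairs
# well above the average for that topic.
toy = pd.DataFrame({'topic_0': [0.9, 0.1, 0.1], 'topic_1': [0.1, 0.5, 0.4]},
                   index=['doc_a', 'doc_b', 'doc_c'])
toy_normed = (toy - toy.mean()) / toy.std()
print(toy_normed[toy_normed > 1])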
In [519]:
D
Out[519]:
In [520]:
normed[normed>1].T
Out[520]:
In [541]:
path = './data/transcripts'
transcripts = {}
for name in os.listdir(path):
    if name.endswith('.txt'):
        f_path = os.path.join(path, name)
        with open(f_path) as f:
            transcripts[name] = f.read()
In [522]:
transcript_topics = {name: pipeline(doc) for name, doc in transcripts.items()}  # topic distribution per transcript (keep the raw text in transcripts)
In [523]:
X = pd.DataFrame(transcript_topics)
In [539]:
X['transcript_2.txt'].argmax()
Out[539]:
In [767]:
ax = X.T.plot.bar(figsize=(20,10),colormap='jet')
In [769]:
fig =ax.get_figure()
fig.savefig('/tmp/test.png')
In [790]:
corpus = []
for key, val in transcripts.items():
    corpus += val.splitlines()
tfidf = TfidfVectorizer(tokenizer=Tokenizer(),stop_words='english',ngram_range=(1,4),smooth_idf=True)
tf_model = tfidf.fit(corpus)
script = tf_model.transform([d["script.txt"]])
tf_feature_names = tf_model.get_feature_names()
In [791]:
def display_word_ranks(vector, feature_names, no_top_words):
    ranking_dict = {feature_names[i]: -np.log(vector[i])  # dictionary comprehension: key is the word, value is its score
                    for i in vector.argsort()              # iterate over indices sorted by tf-idf value
                    if vector[i] > 0                       # only keep words that actually appear in the document
                    }
    return ranking_dict
z = script.toarray()
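A quick look (toy values, my addition) at what the -np.log transform does to the tf-idf weights: it maps (0, 1] onto [0, inf), with smaller weights coming out as larger scores.
In [ ]:
# Toy check of the -log transform used in display_word_ranks.
toy_weights = np.array([0.9, 0.5, 0.1, 0.01])
print(-np.log(toy_weights))   # [0.105 0.693 2.303 4.605]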
In [696]:
z[0]
Out[696]:
In [792]:
ranking_dict = display_word_ranks(z[0], tf_feature_names, 40)
In [795]:
S = pd.Series(ranking_dict)
S = S.sort_values(ascending=False)
# drop words whose score equals the most common (tied) score, then plot the 20 highest
S[S != S.value_counts().index[0]][:20].plot.bar(title="Our top n words scored")
Out[795]:
In [789]:
sample_idx = S.sample(30).index   # a random sample of 30 scored words
Y = S[sample_idx]
Y.sort_values(ascending=False).plot.bar(figsize=(15,5), title="A random sample of 30 words, scored")
Out[789]:
In [750]:
S.rank(ascending=False,method='first')[:30]
Out[750]:
In [780]:
S[sample_idx].rank(ascending=False, method='first').sort_values()
Out[780]: