In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from pprint import pprint
import pandas as pd
import os
In [2]:
from collections import defaultdict
from gensim import corpora, models, similarities
# Plot the projection of articles onto two axes/topics defined by the model;
# for models operating on tfidf-transformed inputs (LSI, RP).
def plot_axes_with_tfidf(x, y, model, corpus, tfidf, titles):
    """Plot each article title according to the projection of its text
    into the given x and y topic axes of model.

    Every document is converted with ``tfidf[doc]`` and projected with
    ``model[...]`` exactly once. The projection is read as a sequence of
    ``(topic_id, weight)`` pairs and the weights are looked up by topic id;
    a topic that is absent from a document's projection is plotted at 0.0.

    :param x: the index (topic id) of the x axis to plot
    :param y: the index (topic id) of the y axis to plot
    :param model: the gensim model to project into
    :param corpus: the gensim corpus of documents
    :param tfidf: a tfidf model for converting documents into tfidf space
    :param titles: a list of article titles, in the same order as corpus
    :return: None; the figure is drawn through matplotlib.pyplot
    """
    x_coords, y_coords, labels = [], [], []
    for title, doc in zip(titles, corpus):
        # Key the weights by topic id instead of list position: gensim
        # projections are sparse and may leave out topics with negligible
        # weight, so position ``x`` is not guaranteed to hold topic ``x``
        # (and an empty document yields an empty list).
        weights = dict(model[tfidf[doc]])
        x_coords.append(weights.get(x, 0.0))
        y_coords.append(weights.get(y, 0.0))
        labels.append(title)

    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic '+str(x), fontsize=14)
    ax.set_ylabel('Topic '+str(y), fontsize=14)
    plt.scatter(x_coords, y_coords, s=40)
    # Distinct loop names so the x/y topic-index parameters are not shadowed.
    for label, x_pos, y_pos in zip(labels, x_coords, y_coords):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data', xytext=(1, 1),
                    textcoords='offset points', size=10)
# Plot the projection of articles onto two axes/topics defined by the model;
# for models operating on the original corpus (LDA, HDP).
def plot_axes(x, y, model, corpus, titles):
    """Plot each article title according to the projection of its text
    into the given x and y topic axes of model.

    Every document is projected with ``model[doc]`` exactly once, so both
    coordinates come from the same projection. The projection is read as a
    sequence of ``(topic_id, weight)`` pairs and the weights are looked up
    by topic id; a topic that is absent from a document's projection is
    plotted at 0.0.

    :param x: the index (topic id) of the x axis to plot
    :param y: the index (topic id) of the y axis to plot
    :param model: the gensim model to project into
    :param corpus: the gensim corpus of documents
    :param titles: a list of article titles, in the same order as corpus
    :return: None; the figure is drawn through matplotlib.pyplot
    """
    x_coords, y_coords, labels = [], [], []
    for title, doc in zip(titles, corpus):
        # Key the weights by topic id instead of list position: topic models
        # such as LDA return only the topics above a probability threshold,
        # so position ``x`` is not guaranteed to hold topic ``x``.
        weights = dict(model[doc])
        x_coords.append(weights.get(x, 0.0))
        y_coords.append(weights.get(y, 0.0))
        labels.append(title)

    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic '+str(x), fontsize=14)
    ax.set_ylabel('Topic '+str(y), fontsize=14)
    plt.scatter(x_coords, y_coords, s=40)
    # Distinct loop names so the x/y topic-index parameters are not shadowed.
    for label, x_pos, y_pos in zip(labels, x_coords, y_coords):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data', xytext=(1, 1),
                    textcoords='offset points', size=10)
In [3]:
# Enter the data directory only once per kernel session. An unguarded chdir
# is applied again every time this cell is re-run, which moves the working
# directory somewhere else and breaks the relative paths used below.
if 'DATA_DIR' not in globals():
    os.chdir('../data/')
    DATA_DIR = os.getcwd()
# Read dataframe
input_fname = "AutismParentMagazine-posts-tokens.csv"
df = pd.read_csv(input_fname, index_col=0)
# Show the first rows as the cell output
df.head(5)
Out[3]:
In [4]:
import pickle
# Read models (paths are relative to the current working directory)
# NOTE(security): pickle can execute arbitrary code while loading; only load
# a corpus file that you created yourself or otherwise trust.
with open("corpus.save", "rb") as corpus_file:
    # The context manager closes the file handle even if unpickling fails.
    corpus = pickle.load(corpus_file)
tfidf = models.TfidfModel.load('tfidf.save')
lsi = models.LsiModel.load('lsi-model.save')
In [5]:
# Scatter every article on LSI topic axes 1 and 2, labelled with its title
titles = df['title']
plot_axes_with_tfidf(
    x=1,
    y=2,
    model=lsi,
    corpus=corpus,
    tfidf=tfidf,
    titles=titles,
)
In [ ]: