In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from pprint import pprint
import pandas as pd
import os

In [2]:
from collections import defaultdict
from gensim import corpora, models, similarities

# plot projection of articles onto 2 axes/topics defined by the model; for models operating on tfidf-transformed inputs (LSI, RP)
def plot_axes_with_tfidf(x, y, model, corpus, tfidf, titles):
    """Scatter-plot every article by its weight on two topics of ``model``.

    Each document of ``corpus`` is converted with ``tfidf`` and then projected
    with ``model``. The weights found for topic ids ``x`` and ``y`` become the
    point's coordinates, and the matching entry of ``titles`` is written next
    to the point.

    The projection is read as a sparse sequence of ``(topic_id, weight)``
    pairs, so the weight is looked up by topic id and a topic missing from a
    document's projection is plotted at 0.0. Indexing the sequence by position
    instead would raise ``IndexError`` or read another topic's weight whenever
    the model leaves a topic out for some document.

    :param x: the topic id to plot on the x axis
    :param y: the topic id to plot on the y axis
    :param model: the gensim model to project into; ``model[vec]`` must yield
        ``(topic_id, weight)`` pairs
    :param corpus: the gensim corpus of documents
    :param tfidf: a tfidf model for converting documents into tfidf space
    :param titles: article titles, paired with ``corpus`` by position
        (``zip`` stops at the shorter of the two)
    :return: None; the plot is drawn on a new 10x10 matplotlib figure
    """
    labels, x_vals, y_vals = [], [], []
    for title, doc in zip(titles, corpus):
        # Project once per document, then look both coordinates up by topic id.
        weights = dict(model[tfidf[doc]])
        x_vals.append(weights.get(x, 0.0))
        y_vals.append(weights.get(y, 0.0))
        labels.append(title)

    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic ' + str(x), fontsize=14)
    ax.set_ylabel('Topic ' + str(y), fontsize=14)
    plt.scatter(x_vals, y_vals, s=40)
    # Separate loop names keep the x / y parameters (topic ids) intact.
    for label, x_pos, y_pos in zip(labels, x_vals, y_vals):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data',
                    xytext=(1, 1), textcoords='offset points', size=10)
    
        
# plot projection of articles onto 2 axes/topics defined by the model; for models operating on original corpus (LDA, HDP)
def plot_axes(x, y, model, corpus, titles):
    """Scatter-plot every article by its weight on two topics of ``model``.

    Each document of ``corpus`` is projected with ``model`` directly (no tfidf
    step). The weights found for topic ids ``x`` and ``y`` become the point's
    coordinates, and the matching entry of ``titles`` is written next to the
    point.

    The projection is read as a sparse sequence of ``(topic_id, weight)``
    pairs, so the weight is looked up by topic id and a topic missing from a
    document's projection is plotted at 0.0. Indexing the sequence by position
    instead would raise ``IndexError`` or read another topic's weight whenever
    the model leaves a topic out for some document.

    :param x: the topic id to plot on the x axis
    :param y: the topic id to plot on the y axis
    :param model: the gensim model to project into; ``model[doc]`` must yield
        ``(topic_id, weight)`` pairs
    :param corpus: the gensim corpus of documents
    :param titles: article titles, paired with ``corpus`` by position
        (``zip`` stops at the shorter of the two)
    :return: None; the plot is drawn on a new 10x10 matplotlib figure
    """
    labels, x_vals, y_vals = [], [], []
    for title, doc in zip(titles, corpus):
        # Project once per document, then look both coordinates up by topic id.
        weights = dict(model[doc])
        x_vals.append(weights.get(x, 0.0))
        y_vals.append(weights.get(y, 0.0))
        labels.append(title)

    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic ' + str(x), fontsize=14)
    ax.set_ylabel('Topic ' + str(y), fontsize=14)
    plt.scatter(x_vals, y_vals, s=40)
    # Separate loop names keep the x / y parameters (topic ids) intact.
    for label, x_pos, y_pos in zip(labels, x_vals, y_vals):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data',
                    xytext=(1, 1), textcoords='offset points', size=10)

In [3]:
# Work from the data directory; the file names used below are relative to it.
os.chdir('../data/')

# Load the article table, taking the first CSV column as the row index.
input_fname = "AutismParentMagazine-posts-tokens.csv"
df = pd.read_csv(input_fname, index_col=0)

# Preview the first five rows (head() defaults to n=5).
df.head()


Out[3]:
title source category text href tokens
0 Autism, Head Banging and other Self Harming Be... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba For children with autism spectrum disorder (AS... https://www.autismparentingmagazine.com/autism... ['for', 'children', 'with', 'autism', 'spectru...
1 High Quality ABA Treatment:  What Every Parent... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba Dr. Stephen Shore once said “If you’ve met one... https://www.autismparentingmagazine.com/high-q... ['dr', 'stephen', 'shore', 'once', 'said', 'if...
2 Help: I Don’t Know How to Choose an Applied Be... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba Help! I am going to be starting Applied Behav... https://www.autismparentingmagazine.com/choosi... ['help', 'i', 'am', 'going', 'to', 'be', 'star...
3 HELP: My Autistic Child is Absolutely Terrifie... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba How do you handle high anxiety of a child on t... https://www.autismparentingmagazine.com/help-a... ['how', 'do', 'you', 'handle', 'high', 'anxiet...
4 HELP: I Need Communication Advice for Autistic... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba A grandfather from Singapore asks… My eldest g... https://www.autismparentingmagazine.com/help-i... ['a', 'grandfather', 'from', 'singapore', 'ask...

In [4]:
import pickle
# Read models
# NOTE(review): unpickling runs code embedded in the file; only load
# corpus.save when it comes from a trusted run of this project.
# The with-block closes the file handle as soon as the corpus is loaded.
with open("corpus.save", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)
tfidf = models.TfidfModel.load('tfidf.save')
lsi = models.LsiModel.load('lsi-model.save')

In [5]:
# Plot topics
# Every article is placed by its weights on LSI topics 1 and 2 and labelled with its title.
titles = df.loc[:, 'title']
plot_axes_with_tfidf(
    x=1,
    y=2,
    model=lsi,
    corpus=corpus,
    tfidf=tfidf,
    titles=titles,
)



In [ ]: