notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
from pprint import pprint
import pandas as pd
import os



In [2]:

    
from collections import defaultdict
from gensim import corpora, models, similarities

# plot projection of articles onto 2 axes/topics defined by the model; for models operating on tfidf-transformed inputs (LSI, RP)
def plot_axes_with_tfidf(x, y, model, corpus, tfidf, titles):
    """Scatter-plot every article by its weight on two topics of the model.

    Each document is converted with ``tfidf``, projected with ``model`` and
    drawn at (weight on topic ``x``, weight on topic ``y``), annotated with
    its title. Meant for models that operate on tf-idf input (LSI, RP).

    :param x: the index (topic id) of the x axis to plot
    :param y: the index (topic id) of the y axis to plot
    :param model: the gensim model to project into
    :param corpus: the gensim corpus of documents
    :param tfidf: a tfidf model for converting documents into tfidf space
    :param titles: a list of article titles, aligned with ``corpus``
    """
    x_values, y_values, labels = [], [], []
    for title, doc in zip(titles, corpus):
        # Project once per document. gensim returns sparse (topic_id, weight)
        # pairs that can leave topics out, so look weights up by topic id
        # instead of by list position; a missing topic counts as 0.
        weights = dict(model[tfidf[doc]])
        x_values.append(weights.get(x, 0.0))
        y_values.append(weights.get(y, 0.0))
        labels.append(title)
    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic '+str(x), fontsize=14)
    ax.set_ylabel('Topic '+str(y), fontsize=14)
    plt.scatter(x_values, y_values, s=40)
    # Separate names for the coordinates keep the x/y parameters intact.
    for label, x_pos, y_pos in zip(labels, x_values, y_values):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data',
                    xytext=(1, 1), textcoords='offset points', size=10)
    
        
# plot projection of articles onto 2 axes/topics defined by the model; for models operating on original corpus (LDA, HDP)
def plot_axes(x, y, model, corpus, titles):
    """Scatter-plot every article by its weight on two topics of the model.

    Each document is projected with ``model`` directly (no tf-idf step) and
    drawn at (weight on topic ``x``, weight on topic ``y``), annotated with
    its title. Meant for models that operate on the raw corpus (LDA, HDP).

    :param x: the index (topic id) of the x axis to plot
    :param y: the index (topic id) of the y axis to plot
    :param model: the gensim model to project into
    :param corpus: the gensim corpus of documents
    :param titles: a list of article titles, aligned with ``corpus``
    """
    x_values, y_values, labels = [], [], []
    for title, doc in zip(titles, corpus):
        # Project once per document. The result is a sparse list of
        # (topic_id, weight) pairs in which low-weight topics may be absent,
        # so position N is not necessarily topic N: look up by topic id and
        # treat a missing topic as weight 0.
        weights = dict(model[doc])
        x_values.append(weights.get(x, 0.0))
        y_values.append(weights.get(y, 0.0))
        labels.append(title)
    plt.figure(figsize=(10, 10))
    ax = plt.gca()
    ax.set_xlabel('Topic '+str(x), fontsize=14)
    ax.set_ylabel('Topic '+str(y), fontsize=14)
    plt.scatter(x_values, y_values, s=40)
    # Separate names for the coordinates keep the x/y parameters intact.
    for label, x_pos, y_pos in zip(labels, x_values, y_values):
        ax.annotate(str(label), xy=(x_pos, y_pos), xycoords='data',
                    xytext=(1, 1), textcoords='offset points', size=10)



In [3]:

    
# Remember the directory the kernel started in the first time this cell runs,
# and always resolve the data directory against it. A bare relative
# os.chdir('../data/') is only correct on the first execution: re-running the
# cell would climb one more level each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..', 'data'))
# Read dataframe
#input_fname="AutismParentMagazine-posts-tokens.csv"
input_fname="articles-n-forums-posts.csv"

# The first CSV column is used as the index
df = pd.read_csv(input_fname,index_col=0)
df.head(2)









    Out[3]:







  
    
      
      category
      href
      source
      text
      title
      user id
      href_short
      tokens
      text_short
    
    
      post id
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      ['category-applied-behavior-analysis-aba']
      https://www.autismparentingmagazine.com/autism...
      https://www.autismparentingmagazine.com/
      For children with autism spectrum disorder (AS...
      Autism, Head Banging and other Self Harming Be...
      NaN
      AutismParentingMagazine
      ['autism', 'head', 'bang', 'and', 'other', 'se...
      For children with autism spectrum disorder (AS...
    
    
      1
      ['category-applied-behavior-analysis-aba']
      https://www.autismparentingmagazine.com/high-q...
      https://www.autismparentingmagazine.com/
      Dr. Stephen Shore once said “If you’ve met one...
      High Quality ABA Treatment:  What Every Parent...
      NaN
      AutismParentingMagazine
      ['high', 'quality', 'aba', 'treatment', 'what'...
      Dr. Stephen Shore once said “If you’ve met one...



In [32]:

    
import pickle
# Read models
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# .save files that were produced by this project.
# The context managers close the file handles as soon as loading is done.
with open("corpus.save", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)
tfidf = models.TfidfModel.load('tfidf.save')
lsimodel = models.LsiModel.load('lsi-model.save')
with open("dictionary.save", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)



In [5]:

    
# Collect the highest-weighted LSI topic of every post, kept apart for forum
# posts and for everything else (articles).
titles = df['title']
categories = df['category']

topics_articles=[]
topics_forums=[]
for category,title, doc in zip(categories,titles, corpus):
    # Projection entries are (topic id, weight) pairs; keep the pair with the
    # largest weight.
    top_topic=max(lsimodel[tfidf[doc]],key=lambda pair:pair[1])
    destination = topics_forums if category == 'forums' else topics_articles
    destination.append(top_topic)



In [6]:

    
# Print the row whose index label is 0 (the first post shown by df.head above)
print(df.loc[0])
# Cell output: the 10 (word, weight) pairs reported for LSI topic 289
lsimodel.show_topic(289,topn=10)









    



category             ['category-applied-behavior-analysis-aba']
href          https://www.autismparentingmagazine.com/autism...
source                 https://www.autismparentingmagazine.com/
text          For children with autism spectrum disorder (AS...
title         Autism, Head Banging and other Self Harming Be...
user id                                                     NaN
href_short                              AutismParentingMagazine
tokens        ['autism', 'head', 'bang', 'and', 'other', 'se...
text_short    For children with autism spectrum disorder (AS...
Name: 0, dtype: object






    Out[6]:





[('around', 0.12305158080174289),
 ('even', 0.098364952544514064),
 ('make', 0.092559040791065847),
 ('date', -0.089450466733059231),
 ('cat', 0.087545662636350441),
 ('maybe', -0.083542283033303466),
 ('dx', 0.080109234670521737),
 ('fight', 0.078660670623541679),
 ('13', 0.076877130095156287),
 ('water', 0.074540367371097843)]



In [7]:

    
# Display the leading words of a single LSI topic
topic_index=0
topn=5 # number of (word, weight) pairs to list for that topic
lsimodel.show_topic(topic_index, topn=topn)









    Out[7]:





[('son', 0.13063794954097041),
 ('school', 0.12704517649180411),
 ('get', 0.12431454448172294),
 ('autism', 0.11642186607889504),
 ('go', 0.11317346349203078)]



In [8]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from matplotlib import gridspec
import numpy as np



In [25]:

    
# Global matplotlib styling for the topic-distribution figure
plt.rcParams['text.usetex'] = True
# 'text.latex.unicode' is not a valid rcParams key in newer matplotlib
# releases, where assigning it raises KeyError; set it only where it exists.
if 'text.latex.unicode' in plt.rcParams:
    plt.rcParams['text.latex.unicode'] = True

plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size
# Axes border width. An earlier 'axes.linewidth' = 5 assignment was overwritten
# immediately by this call, so only the effective value is kept.
plt.rc('axes', linewidth=2)

# Figure plus a 2-row x 1-column grid for the two histogram panels
fig = plt.figure()
gs = gridspec.GridSpec(2, 1)

plt.subplots_adjust(left=0.2,right=1.0,bottom=0.17,top=0.9)
fig.set_size_inches(6,6)


#plt.suptitle('Readability score')
# Figure-level axis captions shared by both panels
fig.text(0.04,0.5,'Distribution',va='center',rotation='vertical')
fig.text(0.4,0.04,'Topic',va='center')









    Out[25]:





<matplotlib.text.Text at 0x13336ef98>






    





<matplotlib.figure.Figure at 0x13336ee48>



In [26]:

    
nbins=300 # number of histogram bins passed to plt.hist; NOTE(review): this is a bin count, not a topic count - the printed maxima further down show topic ids above 900, so confirm one bin really corresponds to one topic



In [29]:

    
# X axis range and tick positions shared by both panels
xmax=320
x=np.arange(0,xmax,20) # tick positions
xx=np.arange(1,xmax,1) # NOTE(review): not used anywhere in this cell

# Panel 1: articles
ax1=plt.subplot(gs[0])
ax1.set_xlim([0, xmax])
ax1.set_ylim([0,0.02])
ax1.set_xticks(x)
# Use a boolean: the string 'off' is truthy, so current matplotlib would keep
# the tick labels visible instead of hiding them.
ax1.tick_params(labelbottom=False)
ax1.set_ylabel('Distribution',fontsize=18)

# NOTE(review): topics_articles holds (topic id, weight) pairs, so X has two
# columns and hist treats them as two datasets; downstream code reads row 0 of
# the returned counts. Confirm that mixing ids and weights in one binning is
# intended.
X=np.array(topics_articles)
# density=True replaces the removed `normed` argument (same normalisation)
n_articles,bins_articles,patches=ax1.hist(X,nbins,density=True,facecolor='blue',align='mid',label='articles')
#plt.legend(bbox_to_anchor=(0.45, 0.95), loc=2, borderaxespad=0., fontsize=14)

# Panel 2: forums
ax2=plt.subplot(gs[1])
ax2.set_xlim([0, xmax])
ax2.set_xticks(x)
ax2.set_ylim([0,0.02])
ax2.set_ylabel('Distribution',fontsize=18)
ax2.set_xlabel('Topic',fontsize=18)

X=np.array(topics_forums)
n_forums,bins_forums,patches=ax2.hist(X,nbins,density=True,facecolor='orange',align='mid',label='forums')
#plt.legend(bbox_to_anchor=(0.45, 0.95), loc=2, borderaxespad=0.,fontsize=14)

# Save the figure that actually contains the panels. With the inline backend
# the `fig` created in an earlier cell may already be closed, in which case
# plt.subplot drew on a new figure and fig.savefig would write a file without
# the histograms. When `fig` is still current this is the same object.
ax2.figure.savefig('topic-presence.jpeg', dpi=300)

plt.show()



In [12]:

    
#fig.facecolor="white"
#fig.savefig('topic-presence.jpeg', dpi=300)



In [13]:

    
# Sanity checks on the histogram outputs and the collected top topics
# (this cell only prints sizes and maxima; the ranking happens in
# mk_topic_presence)
print(len(bins_forums))      # number of bin edges
print(len(n_forums[0]))      # number of bins in the first row of counts
#print(n_forums[0])
# max() on (topic id, weight) tuples compares the topic id first, so these
# print the entry with the largest topic id, not the largest weight
print(max(topics_forums))
print(max(topics_articles))









    



301
300
(994, 0.076488552732848214)
(957, 0.079762229154744374)



In [14]:

    
def mk_topic_presence(nn):
    """Rank histogram bins by their value, highest first.

    :param nn: the ``n`` output of ``plt.hist``; only its first row
        ``nn[0]`` (one value per bin) is read
    :return: list of ``(bin_index, value)`` tuples sorted by value in
        descending order; equal values keep their bin order

    NOTE(review): the index is a histogram bin index. It equals a topic id
    only if the histogram was built with exactly one bin per topic id.
    """
    from operator import itemgetter

    # Iterate over the argument itself. The previous version took the length
    # from the global n_forums, which silently coupled the result to another
    # cell and failed when that global was missing or had a different size.
    topic_presence = list(enumerate(nn[0]))
    # Sort topic presence:
    sorted_topic_presence = sorted(topic_presence, key=itemgetter(1), reverse=True)
    return sorted_topic_presence

from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)
    
def print_topics(topic_presence_list, n_topics=5, topn=10):
    """Print the leading words of the first entries of a topic ranking.

    :param topic_presence_list: sequence whose items start with a topic
        index, e.g. the ``(index, value)`` tuples from ``mk_topic_presence``
    :param n_topics: how many entries from the start of the list to print
        (default 5, the previously hard-coded value)
    :param topn: how many words to list per topic (default 10, the
        previously hard-coded value)
    """
    for topic_presence in topic_presence_list[:n_topics]:
        topic_index = topic_presence[0]
        # show_topic yields (word, weight) pairs; only the words are printed
        word_list = [word for word, _score in lsimodel.show_topic(topic_index, topn=topn)]
        printmd('**Topic {}**: {}'.format(topic_index, ', '.join(word_list)))



In [15]:

    
# Rank the histogram bins of the article posts and preview the five largest
topic_presence_articles=mk_topic_presence(n_articles)
topic_presence_articles[:5]









    Out[15]:





[(0, 0.16413319926657599),
 (4, 0.011829419766960432),
 (2, 0.01182941976696043),
 (1, 0.0073933873543502691),
 (18, 0.0044360324126101651)]



In [16]:

    
# Rank the histogram bins of the forum posts and preview the five largest
topic_presence_forums=mk_topic_presence(n_forums)
topic_presence_forums[:5]









    Out[16]:





[(0, 0.22153120744670041),
 (1, 0.011686279291913093),
 (3, 0.0054197237295828858),
 (2, 0.0046575750801102897),
 (12, 0.0045728918968355597)]



In [17]:

    
# Render the leading words of the top-ranked entries, first for articles and
# then for forums, each under a bold Markdown heading
printmd("**Topics most present in articles**")
print_topics(topic_presence_articles)
printmd("**Topics most present in forums**")
print_topics(topic_presence_forums)









    




Topics most present in articles







    




Topic 0: son, school, get, autism, go, know, like, child, say, help







    




Topic 4: aspergers, discussion, school, relate, syndrome, start, people, asperger, old, speech







    




Topic 2: http, com, www, aspergers, discussion, asperger, autism, relate, son, children







    




Topic 1: aspergers, disorder, sensory, diagnosis, spectrum, speech, difficulties, children, child, discussion







    




Topic 18: im, vaccines, adhd, thank, disorder, service, diagnosis, brother, dont, children







    




Topics most present in forums







    




Topic 0: son, school, get, autism, go, know, like, child, say, help







    




Topic 1: aspergers, disorder, sensory, diagnosis, spectrum, speech, difficulties, children, child, discussion







    




Topic 3: discussion, relate, asperger, start, syndrome, sensory, aspergers, http, speech, violence







    




Topic 2: http, com, www, aspergers, discussion, asperger, autism, relate, son, children







    




Topic 12: therapy, music, aspergers, diet, vaccines, school, sensory, say, doctor, flap



In [33]:

    
import pyLDAvis.gensim
import gensim
# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()


# This is only for LDA: pyLDAvis.gensim.prepare calls topic_model.inference(corpus),
# which LsiModel does not provide (AttributeError: 'LsiModel' object has no
# attribute 'inference'), so the call stays disabled for the LSI model.
#data = pyLDAvis.gensim.prepare(lsimodel, corpus, dictionary)
#data









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-33-7847ca726aa7> in <module>()
      3 pyLDAvis.enable_notebook()
      4 
----> 5 data = pyLDAvis.gensim.prepare(lsimodel, corpus, dictionary)
      6 data
      7 

/Users/rangel/anaconda3/envs/cdips2017/lib/python3.6/site-packages/pyLDAvis/gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
    109     See `pyLDAvis.prepare` for **kwargs.
    110     """
--> 111     opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
    112     return vis_prepare(**opts)

/Users/rangel/anaconda3/envs/cdips2017/lib/python3.6/site-packages/pyLDAvis/gensim.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
     40           gamma = topic_model.inference(corpus)
     41       else:
---> 42           gamma, _ = topic_model.inference(corpus)
     43       doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
     44 

AttributeError: 'LsiModel' object has no attribute 'inference'



In [ ]:

	category	href	source	text	title	user id	href_short	tokens	text_short
post id
0	['category-applied-behavior-analysis-aba']	https://www.autismparentingmagazine.com/autism...	https://www.autismparentingmagazine.com/	For children with autism spectrum disorder (AS...	Autism, Head Banging and other Self Harming Be...	NaN	AutismParentingMagazine	['autism', 'head', 'bang', 'and', 'other', 'se...	For children with autism spectrum disorder (AS...
1	['category-applied-behavior-analysis-aba']	https://www.autismparentingmagazine.com/high-q...	https://www.autismparentingmagazine.com/	Dr. Stephen Shore once said “If you’ve met one...	High Quality ABA Treatment: What Every Parent...	NaN	AutismParentingMagazine	['high', 'quality', 'aba', 'treatment', 'what'...	Dr. Stephen Shore once said “If you’ve met one...