Perplexity

Here I use perplexity as a metric to choose the optimal number of LDA topics.

From https://radimrehurek.com/gensim/models/ldamodel.html: log_perplexity(chunk, total_docs=None) calculates and returns the per-word likelihood bound, using the chunk of documents as an evaluation corpus. It also outputs the calculated statistics, including perplexity = 2^(-bound), to the log at INFO level.
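That is, log_perplexity returns the per-word log2 likelihood bound, and perplexity follows by exponentiating its negative. A minimal sketch, assuming a trained model lda and a held-out corpus heldout_chunk (both names illustrative):

bound = lda.log_perplexity(heldout_chunk)  # per-word log2 likelihood bound
perplexity = 2 ** (-bound)                 # perplexity = 2^(-bound), as documented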

https://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity

http://qpleple.com/perplexity-to-evaluate-topic-models/
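For reference, the standard definition behind these links: for a held-out set $D$ containing $N$ tokens in total,

$$\mathrm{perplexity}(D) = 2^{-\frac{1}{N} \sum_{d \in D} \log_2 p(w_d)}$$

so lower perplexity means the model assigns higher likelihood to unseen text. (Gensim works in base 2; sources that use the natural log are equivalent as long as the log and the exponentiation match.)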


In [1]:
import os

import numpy as np
import pandas as pd
from gensim import corpora, models, similarities

from nlp_models import (make_corpus, make_lsi_similarity_matrix,
                        make_lda_similarity_matrix, get_model_score_wforums)

In [2]:
# Set up paths: keep this notebook's directory importable, then move to the data directory
import os
import sys

this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)

In [3]:
# Read the dataset (articles and forum posts) with categories and ids
#input_fname="AutismParentMagazine-posts-tokens.csv"
input_fname = "articles-n-forums-posts.csv"

df = pd.read_csv(input_fname, index_col=0)

df.head(2)


Out[3]:

|   | category | href | source | text | title | user id | tokens | text_short |
|---|----------|------|--------|------|-------|---------|--------|------------|
| 0 | ['category-applied-behavior-analysis-aba'] | https://www.autismparentingmagazine.com/autism... | https://www.autismparentingmagazine.com/ | For children with autism spectrum disorder (AS... | Autism, Head Banging and other Self Harming Be... | NaN | ['autism', 'head', 'banging', 'and', 'other', ... | For children with autism spectrum disorder (AS... |
| 1 | ['category-applied-behavior-analysis-aba'] | https://www.autismparentingmagazine.com/high-q... | https://www.autismparentingmagazine.com/ | Dr. Stephen Shore once said “If you’ve met one... | High Quality ABA Treatment: What Every Parent... | NaN | ['high', 'quality', 'aba', 'treatment', 'what'... | Dr. Stephen Shore once said “If you’ve met one... |

In [4]:
# Keep the document ids (the dataframe index)
ids = df.index

In [5]:
from ast import literal_eval

# The 'tokens' column holds stringified lists; parse each entry back into a Python list
documents = df['tokens'].values
for idoc in range(len(documents)):
    documents[idoc] = literal_eval(str(documents[idoc]))

In [6]:
# Build the bag-of-words corpus and dictionary, plus a tf-idf model (not used below)
corpus, dictionary = make_corpus(documents)

tfidf = models.TfidfModel(corpus)
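make_corpus comes from the local nlp_models module and its source is not shown here; a plausible equivalent built from gensim primitives would be (an assumption about the helper, not its actual code):

def make_corpus(documents):
    # Assumed implementation: map tokens to integer ids, then to bag-of-words vectors
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    return corpus, dictionary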

In [7]:
# Hold-out split: first 60% of the corpus as chunk1, the remaining 40% as chunk2
# (a random boolean mask would work too; a simple positional split is used here)
len_60p = int(len(corpus) * 0.6)
chunk1 = corpus[:len_60p]
chunk2 = corpus[len_60p:]
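The loop below trains on the full corpus and then scores chunk1, which overlaps the training data. A stricter protocol, as discussed in the Stack Overflow link above, trains on one chunk and scores the other; a minimal sketch using gensim's LdaModel directly (num_topics=50 and iterations=500 are illustrative values):

lda_train = models.LdaModel(chunk1, id2word=dictionary, num_topics=50, iterations=500)
heldout_perplexity = np.power(2, -lda_train.log_perplexity(chunk2))  # scored on unseen documents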

In [9]:
num_iter = 500
x = []  # number of topics
y = []  # perplexity, normalized by number of topics

In [ ]:
for num_topics in range(20, 200, 20):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    # perplexity = 2^(-bound), where log_perplexity returns the per-word likelihood bound
    perplexity1 = np.power(2, -lda.log_perplexity(chunk1))
    # note: the raw perplexity is divided by the number of topics before being recorded
    perplexity1 = perplexity1 / num_topics
    #perplexity2 = np.power(2, -lda.log_perplexity(chunk2))
    x.append(num_topics)
    y.append(perplexity1)

    print("N. topics {}, perplexity {}".format(num_topics, perplexity1))


N. topics 20, perplexity 17.300786807048528
N. topics 40, perplexity 11.722117529927342

In [ ]:
# Plotting imports (only pyplot is needed for the line plot below)
import matplotlib.pyplot as plt

In [ ]:
# usetex requires a working LaTeX installation; comment it out if unavailable
plt.rcParams['text.usetex'] = True

plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size


fig = plt.figure()
fig.set_size_inches(6, 6)

plt.xlabel('N. topics', fontsize=label_size)
plt.ylabel('Perplexity', fontsize=label_size)

plt.plot(x, y)

plt.show()
