Here I show how to use perplexity as a metric for finding the optimal number of topics of an LDA model.
From the gensim documentation (https://radimrehurek.com/gensim/models/ldamodel.html):
log_perplexity(chunk, total_docs=None): Calculate and return per-word likelihood bound, using the chunk of documents as evaluation corpus. Also output the calculated statistics, including perplexity=2^(-bound), to log at INFO level.
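Since log_perplexity returns the per-word bound rather than the perplexity itself, the perplexity reported below is always recovered as 2^(-bound). A minimal, self-contained illustration on a toy corpus (the toy words are placeholders; only the bound-to-perplexity step matters):
from gensim import corpora, models
toy_docs = [['autism', 'parent'], ['parent', 'magazine'], ['autism', 'therapy']]
toy_dict = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dict.doc2bow(doc) for doc in toy_docs]
toy_lda = models.LdaModel(toy_corpus, id2word=toy_dict, num_topics=2)
bound = toy_lda.log_perplexity(toy_corpus)  # per-word likelihood bound
print(2 ** (-bound))                        # perplexity = 2^(-bound)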
In [1]:
import pandas as pd
import os
import numpy as np
from gensim import corpora, models, similarities
from nlp_models import (make_corpus, make_lsi_similarity_matrix,
                        make_lda_similarity_matrix, get_model_score_wforums)
In [2]:
# Set up paths: remember the notebook directory, then work from ../data
import os
import sys
this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)
In [3]:
# Read the dataset into a dataframe
#input_fname = "AutismParentMagazine-posts-tokens.csv"
input_fname = "articles-n-forums-posts.csv"
df = pd.read_csv(input_fname, index_col=0)
df.head(2)
Out[3]:
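The head of the dataframe is not reproduced here; judging from how it is used in the cells below, it is expected to contain at least a 'category' column and a 'tokens' column holding stringified token lists, roughly like this (hypothetical rows, for illustration only):
toy_df = pd.DataFrame({'category': ['sensory', 'therapies'],
                       'tokens': ["['autism', 'sensory']", "['aba', 'therapy']"]})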
In [4]:
# Extract series from df:
categories=df['category']
ids=df.index
In [5]:
from ast import literal_eval
# The 'tokens' column is stored as stringified lists; parse each entry back into a Python list
documents = df['tokens'].values
for idoc in range(len(documents)):
    documents[idoc] = literal_eval(str(documents[idoc]))
In [6]:
corpus,dictionary = make_corpus(documents)
tfidf = models.TfidfModel(corpus)
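The TF-IDF model is fitted here but not applied in the rest of this notebook; if it were, it would simply wrap the bag-of-words corpus (a sketch):
corpus_tfidf = tfidf[corpus]   # stream of TF-IDF-weighted documents
# corpus_tfidf could then be fed to e.g. an LSI model instead of the raw counts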
In [7]:
# Split the corpus for perplexity evaluation.
# The random mask below is left over from an earlier attempt and is not used;
# instead, a deterministic 60/40 split is taken.
msk = np.random.rand(len(corpus)) < 0.6
#corpus_train = documents[msk]
#corpus_test = documents[~msk]
#chunk=corpus[msk]
#print(len(corpus))
len_60p = int(len(corpus) * 0.6)
chunk1 = corpus[:len_60p]   # first 60% of the documents
chunk2 = corpus[len_60p:]   # remaining 40%, kept as a hold-out set
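The commented-out lines above would not work as written if corpus is a plain Python list, since a list cannot be indexed with a boolean mask. If a random rather than sequential 60/40 split were wanted, one way to do it would be (a sketch, assuming corpus is a list of bag-of-words vectors):
corpus_train = [doc for doc, keep in zip(corpus, msk) if keep]
corpus_test = [doc for doc, keep in zip(corpus, msk) if not keep]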
In [8]:
num_topics = 50
# Sweep the number of training iterations and track the perplexity on chunk1
for num_iter in range(1000, 10000, 1000):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    perplexity1 = np.power(2, -lda.log_perplexity(chunk1))
    print("N. iter {}, perplexity {}".format(num_iter, perplexity1))
In [11]:
num_topics = 400
x = []
y = []
# Repeat the sweep with a finer iteration grid; here the perplexity is divided
# by the number of topics before being stored for plotting
for num_iter in range(100, 1000, 100):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    perplexity = np.power(2, -lda.log_perplexity(chunk1)) / num_topics
    x.append(num_iter)
    y.append(perplexity)
    print("N. iter {}, perplexity {}".format(num_iter, perplexity))
In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from matplotlib import gridspec
from scipy.stats import norm
from scipy.optimize import curve_fit
from lognormal import lognormal, lognormal_stats
import numpy as np
In [21]:
# LaTeX text rendering requires a working LaTeX installation;
# note that the 'text.latex.unicode' rcParam was removed in Matplotlib 3.0+
plt.rcParams['text.usetex'] = True
plt.rcParams['text.latex.unicode'] = True
plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size
fig = plt.figure()
#plt.subplots_adjust(left=0.2,right=1.0,bottom=0.17,top=0.9)
fig.set_size_inches(6,6)
#plt.suptitle('Readability score')
#fig.text(0.04,0.5,'Distribution',va='center',rotation='vertical')
#fig.text(0.4,0.04,'Readability score',va='center')
plt.xlabel('N. iterations',fontsize=label_size)
plt.ylabel('Perplexity',fontsize=label_size)
plt.plot(x, y)
plt.show()
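The figure can also be written to disk with fig.savefig; the filename below is just a placeholder:
fig.savefig('lda-perplexity-vs-iterations.png', dpi=150, bbox_inches='tight')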