Here I show perplexity as a metric to find the optimal number of topics.
Reference: https://radimrehurek.com/gensim/models/ldamodel.html — `log_perplexity(chunk, total_docs=None)` calculates and returns the per-word likelihood bound, using the given chunk of documents as the evaluation corpus. It also logs the derived statistics, including perplexity = 2^(-bound), at INFO level.
In [1]:
import pandas as pd
import os
import numpy as np
# `models` is already imported here; the previous duplicate
# `from gensim import models` line has been removed.
from gensim import corpora, models, similarities
from nlp_models import make_corpus, \
    make_lsi_similarity_matrix, make_lda_similarity_matrix, get_model_score_wforums
In [2]:
# Set up paths: remember the notebook directory and switch the cwd to ../data.
import os
import sys

# Capture the notebook directory only on the first run. Previously this cell
# unconditionally ran os.chdir("../data"), so re-running it chdir'd again
# relative to the already-changed cwd and ended up in the wrong directory.
if "this_path" not in globals():
    this_path = os.getcwd()
# Absolute target path makes the chdir idempotent across re-runs.
os.chdir(os.path.join(this_path, "..", "data"))
# Guard against inserting the same entry into sys.path on every re-run.
if this_path not in sys.path:
    sys.path.insert(0, this_path)
In [3]:
# Load the combined articles + forum posts dataset (first CSV column is the index).
# Alternative (articles-only) input, kept for reference:
#input_fname="AutismParentMagazine-posts-tokens.csv"
input_fname = "articles-n-forums-posts.csv"
df = pd.read_csv(input_fname, index_col=0)
df.head(2)  # quick sanity check of the loaded frame
Out[3]:
In [4]:
# Pull the document ids and category labels out of the frame for later use.
ids = df.index
categories = df['category']
In [5]:
from ast import literal_eval

# df['tokens'] holds string representations of token lists (a CSV round-trip
# artifact); parse each back into a real Python list. Building a fresh list
# avoids the previous in-place loop, which mutated the ndarray backing
# df['tokens'] and therefore silently altered the DataFrame itself.
documents = [literal_eval(str(doc)) for doc in df['tokens'].values]
In [6]:
# Build the bag-of-words corpus and id->token dictionary from the parsed documents.
corpus,dictionary = make_corpus(documents)
# NOTE(review): tfidf is built here but never used in any later cell —
# confirm whether it is needed downstream, otherwise this line can be dropped.
tfidf = models.TfidfModel(corpus)
In [7]:
# Deterministic 60/40 split of the corpus for perplexity evaluation:
# the first 60% of documents form the evaluation chunk, the rest are held out.
# (A previous random mask via np.random.rand was unseeded and unused — all of
# its consumers were commented out — so it has been removed.)
len_60p = int(len(corpus) * 0.6)
chunk1 = corpus[:len_60p]
chunk2 = corpus[len_60p:]
In [8]:
# Fine sweep: fit LDA for 2, 4, 6, 8 topics and report perplexity on chunk1.
num_iter = 2000
for num_topics in range(2, 10, 2):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    # log_perplexity returns a per-word likelihood bound;
    # perplexity = 2^(-bound) per the gensim LdaModel docs. Lower is better.
    perplexity1 = np.power(2, -lda.log_perplexity(chunk1))
    print("N. topics {}, perplexity {}".format(num_topics, perplexity1))
In [9]:
# Coarse sweep: 10, 20, ..., 90 topics, same perplexity metric as above.
# NOTE(review): relies on num_iter defined in the previous cell — this cell
# cannot be run in isolation.
for num_topics in range(10, 100, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    # perplexity = 2^(-bound), where the bound comes from log_perplexity.
    perplexity1 = np.power(2, -lda.log_perplexity(chunk1))
    print("N. topics {}, perplexity {}".format(num_topics, perplexity1))
In [ ]: