Here I add forum and article posts.
This follows the same steps as the previous notebook (improve-lda-model.ipynb).
In [1]:
import pandas as pd
import os
import numpy as np
from gensim import corpora, models, similarities
from gensim import models
from nlp_models import make_corpus,\
make_lsi_similarity_matrix,make_lda_similarity_matrix,get_model_score_wforums
In [2]:
# Path setup: remember the launch directory, keep it importable, then move to ../data
import os
import sys

this_path = os.getcwd()          # absolute path of the directory we started in
sys.path.insert(0, this_path)    # searched first on import, even after the chdir below
os.chdir("../data")              # relative file names in later cells resolve from here
In [3]:
# Load the posts table from CSV, using the first column as the DataFrame index.
# (Alternative input used previously: "AutismParentMagazine-posts-tokens.csv")
input_fname = "articles-n-forums-posts.csv"
df = pd.read_csv(input_fname, index_col=0)

# Show the first two rows as a quick sanity check of the load
df.head(2)
Out[3]:
In [4]:
# Pull out the index and the two columns that the scoring calls below take as inputs
ids = df.index
categories, titles = df['category'], df['title']
In [5]:
from ast import literal_eval

# Each entry of the 'tokens' column is parsed back into a Python object with
# literal_eval (str() first, so entries that are already parsed round-trip too).
# A new list is built instead of assigning into `df['tokens'].values`: writing
# through `.values` silently mutates the DataFrame, and fails outright when
# pandas hands back a read-only array (Copy-on-Write).
documents = [literal_eval(str(tokens)) for tokens in df['tokens']]
In [6]:
import pickle

# The corpus and TF-IDF model are reloaded from disk rather than rebuilt.
# For reference, they could be regenerated with:
#   corpus, dictionary = make_corpus(documents)
#   tfidf = models.TfidfModel(corpus)
tfidf_fname = 'tfidf.save'
dictionary_fname = 'dictionary.save'
corpus_fname = 'corpus.save'

# Read the saved objects. Context managers close each file handle promptly
# instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were produced by a trusted run of this project.
with open(corpus_fname, "rb") as corpus_file:
    corpus = pickle.load(corpus_file)
tfidf = models.TfidfModel.load(tfidf_fname)
with open(dictionary_fname, "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)
In [7]:
# Scan LSI over topic counts 10, 20, ..., 90 and report the score for each
print("LSI model")
for num_topics in range(10, 100, 10):
    # The helper receives the TF-IDF-transformed corpus and returns a pair that
    # is unpacked as (similarity matrix, LSI model)
    matsim, lsi = make_lsi_similarity_matrix(
        tfidf[corpus],
        dictionary,
        num_topics,
    )
    model_score = get_model_score_wforums(ids, matsim, categories, titles)
    print("N. topics {}, score {}".format(num_topics, model_score))
In [9]:
# Scan LDA over topic counts 10, 20, ..., 190; optionally persist each result
# and collect (topic count, score) pairs for plotting.
save = True       # when True, write every similarity matrix and model to disk
num_iter = 500    # forwarded unchanged to make_lda_similarity_matrix
x = []            # topic counts tried
y = []            # score obtained for the matching entry of x
for num_topics in range(10, 200, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    model_score = get_model_score_wforums(ids, matsim, categories, titles)
    if save:
        fname = "lda-matsim-{}.save".format(num_topics)
        # Close the handle explicitly so the pickle is fully flushed to disk
        # before the next iteration (the bare open() was never closed).
        with open(fname, "wb") as matsim_file:
            pickle.dump(matsim, matsim_file)
        fname = 'lda-model-{}.save'.format(num_topics)
        lda.save(fname)
    print("N. topics {}, score {}".format(num_topics, model_score))
    x.append(num_topics)
    y.append(model_score)
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.mlab as mlab
from matplotlib import gridspec
from scipy.stats import norm
from scipy.optimize import curve_fit
from lognormal import lognormal, lognormal_stats
import numpy as np
In [11]:
# Plot the LDA score as a function of the number of topics.

# Render all text through LaTeX (needs a working LaTeX installation).
plt.rcParams['text.usetex'] = True
# 'text.latex.unicode' was deprecated in Matplotlib 3.0 and later removed;
# assigning an unknown rcParam raises KeyError, so only set it where it exists.
if 'text.latex.unicode' in plt.rcParams:
    plt.rcParams['text.latex.unicode'] = True
plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size

# Explicit figure/axes interface; x and y were filled by the LDA scan cell.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_xlabel('N. topics', fontsize=label_size)
ax.set_ylabel('Score', fontsize=label_size)
ax.plot(x, y)
plt.show()
In [ ]: