Here I add forum posts alongside the article posts.
This follows the same steps as the previous notebook (improve-lda-model.ipynb).
In [1]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval
from gensim import corpora, models, similarities
from nlp_models import make_corpus, \
    make_lsi_similarity_matrix, make_lda_similarity_matrix
# get_model_score_wforums is redefined below to handle forum posts
In [2]:
def get_model_score_wforums(ids, matsim, categories, titles):
    """Evaluate the score for a given model, following the equation
    defined in validate_model.ipynb. Forum posts have no reliable
    category labels, so they are skipped on both sides."""
    num_predictions = 3
    # Categories which are too general for this test:
    cats_to_remove = {'category-magazine-issues', 'category-general',
                      'category-autism-books', 'category-podcast',
                      'category-autism-news', 'category-personal-narrative',
                      'autism_advocacy'}
    model_score = 0
    N = 0
    for id, doc in zip(ids, matsim.index):
        sims = matsim[doc]
        # The 'category' column stores stringified lists; parse them safely
        category1 = set(literal_eval(categories[id])) - cats_to_remove
        title1 = titles[id]  # useful when debugging the matches
        if 'forums' in category1 or len(category1) == 0:
            continue
        i_pred = 0
        for other_id, score in sims:
            category2 = set(literal_eval(categories[other_id])) - cats_to_remove
            if 'forums' in category2 or len(category2) == 0:
                continue
            if id != other_id:
                i_pred = i_pred + 1
                if i_pred == num_predictions + 1:
                    break
                N = N + 1
                # Count a hit when the two documents share a category
                if any(x in category2 for x in category1):
                    model_score += 1
    model_score = model_score / N
    return model_score
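In words: for each non-forum document, its top num_predictions = 3 non-forum neighbours are checked, and a hit is counted whenever query and neighbour share at least one (specific) category. The returned score is then

    score = hits / N

where N is the total number of neighbour pairs checked.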
In [3]:
# Set up paths: work from the data directory, but keep this notebook's
# directory on the import path so local modules can still be found.
import sys
this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)
In [4]:
# Read the dataframe of article and forum posts
#input_fname="AutismParentMagazine-posts-tokens.csv"
input_fname = "articles-n-forums-posts.csv"
df = pd.read_csv(input_fname, index_col=0)
df.head(2)
Out[4]: (first two rows of the dataframe, including the category, title and tokens columns)
In [5]:
# Extract series from df:
categories = df['category']
titles = df['title']
ids = df.index
In [6]:
# The 'tokens' column holds stringified token lists; parse them back into lists
documents = [literal_eval(doc) for doc in df['tokens']]
In [7]:
# Build the bag-of-words corpus and dictionary, plus a TF-IDF model on top
corpus, dictionary = make_corpus(documents)
tfidf = models.TfidfModel(corpus)
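make_corpus is defined in nlp_models.py and not shown here; with gensim it is presumably a thin wrapper along these lines (a sketch under that assumption, not the actual implementation):

from gensim import corpora

def make_corpus(documents):
    # Hypothetical reconstruction of the helper in nlp_models.py
    dictionary = corpora.Dictionary(documents)               # token -> id mapping
    corpus = [dictionary.doc2bow(doc) for doc in documents]  # bag-of-words vectors
    return corpus, dictionary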
In [8]:
print("LSI model")
for num_topics in range(100,200,100):
matsim,lsi = make_lsi_similarity_matrix(tfidf[corpus], dictionary, num_topics)
model_score= get_model_score_wforums(ids,matsim,categories,titles)
print("N. topics {}, score {}".format(num_topics,model_score))
In [9]:
num_iter = 500
x = []
y = []
for num_topics in range(10, 200, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    model_score = get_model_score_wforums(ids, matsim, categories, titles)
    print("N. topics {}, score {}".format(num_topics, model_score))
    x.append(num_topics)
    y.append(model_score)
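make_lda_similarity_matrix presumably mirrors the LSI helper with gensim's LDA model swapped in, e.g. (again an assumption, not the actual code):

lda = models.LdaModel(corpus, id2word=dictionary,
                      num_topics=num_topics, iterations=num_iter)

Note that LDA is trained here on the raw bag-of-words corpus, whereas LSI above was trained on the TF-IDF-weighted corpus.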
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns  # imported for its default plot styling
In [11]:
plt.rcParams['text.usetex'] = True  # requires a working LaTeX installation
plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size
fig = plt.figure()
fig.set_size_inches(6, 6)
plt.xlabel('N. topics', fontsize=label_size)
plt.ylabel('Score', fontsize=label_size)
plt.plot(x, y)
plt.show()