Here I add the forum posts to the article posts and re-evaluate the topic models.
This follows the same steps as the previous notebook (improve-lda-model.ipynb). The score is the one defined in validate_model.ipynb: for each article, the three most similar non-forum documents are retrieved, and the score is the fraction of those predictions that share at least one (non-generic) category with the article.


In [1]:
import os

import numpy as np
import pandas as pd
from gensim import corpora, models, similarities

# get_model_score_wforums is defined in the next cell to handle forum posts
from nlp_models import (make_corpus,
                        make_lsi_similarity_matrix,
                        make_lda_similarity_matrix)

In [2]:
from ast import literal_eval

def get_model_score_wforums(ids, matsim, categories, titles):
    """Evaluate the score for a given model, following the equation defined in
    validate_model.ipynb: for each article, take the top `num_predictions` most
    similar non-forum documents and count the fraction that share at least one
    (non-generic) category with the article."""
    num_predictions = 3
    model_score = 0
    N = 0

    # Categories which are too general for this test (magazine issues are
    # general articles, so they are removed as well):
    cats_to_remove = {'category-magazine-issues', 'category-general',
                      'category-autism-books', 'category-podcast',
                      'category-autism-news', 'category-personal-narrative',
                      'autism_advocacy'}

    for id, doc in zip(ids, matsim.index):
        sims = matsim[doc]
        category1 = set(literal_eval(categories[id]))
        title1 = titles[id]  # useful for debugging prints

        # Forum posts carry no editorial categories, so skip them as queries:
        if 'forums' in category1:
            continue
        category1 -= cats_to_remove

        i_pred = 0
        for other_id, score in sims:
            if other_id == id:
                continue
            category2 = set(literal_eval(categories[other_id]))
            title2 = titles[other_id]
            if 'forums' in category2:
                continue
            category2 -= cats_to_remove

            # Skip pairs where only over-general categories were assigned:
            if len(category1) == 0 or len(category2) == 0:
                continue

            i_pred += 1
            if i_pred > num_predictions:
                break
            N += 1
            if category1 & category2:
                model_score += 1

    return model_score / N
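
As a quick sanity check of the scoring logic, here is a toy example with hypothetical data (three single-category documents and a hand-made similarity listing). The interface assumed for matsim is a pandas Series indexed by document, each entry holding (id, similarity) pairs sorted by decreasing similarity, which matches how it is used above. With these toy inputs, two of the six predictions share a category, so the score is 2/6, about 0.33.

In [ ]:
import pandas as pd

# Hypothetical toy inputs (not from the dataset) to exercise the scoring logic.
toy_matsim = pd.Series({
    0: [(0, 1.0), (1, 0.9), (2, 0.1)],
    1: [(1, 1.0), (0, 0.9), (2, 0.2)],
    2: [(2, 1.0), (0, 0.2), (1, 0.1)],
})
toy_categories = pd.Series(["['category-a']", "['category-a']", "['category-b']"])
toy_titles = pd.Series(['doc a', 'doc b', 'doc c'])

get_model_score_wforums(toy_matsim.index, toy_matsim, toy_categories, toy_titles)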

In [3]:
# Set up paths / working directory
import os
import sys

this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)

In [4]:
# Read the combined articles + forums dataset
#input_fname = "AutismParentMagazine-posts-tokens.csv"
input_fname = "articles-n-forums-posts.csv"

df = pd.read_csv(input_fname, index_col=0)

df.head(2)


Out[4]:
  | category | href | source | text | title | tokens
0 | ['category-autism-therapy'] | https://www.autismparentingmagazine.com/autism... | https://www.autismparentingmagazine.com/ | For children with autism spectrum disorder (AS... | Autism, Head Banging and other Self Harming Be... | ['for', 'children', 'with', 'autism', 'spectru...
1 | ['category-autism-therapy'] | https://www.autismparentingmagazine.com/high-q... | https://www.autismparentingmagazine.com/ | Dr. Stephen Shore once said “If you’ve met one... | High Quality ABA Treatment:  What Every Parent... | ['dr', 'stephen', 'shore', 'once', 'said', 'if...

In [5]:
# Extract series from df:
categories=df['category']
titles=df['title']
ids=df.index

In [6]:
from ast import literal_eval

# Convert the stringified token lists back into Python lists
documents = [literal_eval(doc) for doc in df['tokens']]

In [7]:
corpus,dictionary = make_corpus(documents)

tfidf = models.TfidfModel(corpus)
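
make_corpus lives in the local nlp_models module, which is not shown in this notebook. A minimal sketch of what it is assumed to do (build a gensim dictionary and bag-of-words corpus from the tokenized documents); the real implementation may differ, e.g. by filtering extreme token frequencies.

In [ ]:
# Hypothetical sketch of nlp_models.make_corpus (assumption, not the actual code).
from gensim import corpora

def make_corpus_sketch(documents):
    dictionary = corpora.Dictionary(documents)               # token -> integer id mapping
    corpus = [dictionary.doc2bow(doc) for doc in documents]  # bag-of-words vectors
    return corpus, dictionary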

In [8]:
print("LSI model")

for num_topics in range(100,200,100):
    matsim,lsi = make_lsi_similarity_matrix(tfidf[corpus], dictionary, num_topics)
    model_score= get_model_score_wforums(ids,matsim,categories,titles)
    print("N. topics {}, score {}".format(num_topics,model_score))


LSI model
N. topics 100, score 0.37333333333333335
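
make_lsi_similarity_matrix also comes from nlp_models. Judging from how matsim is used in get_model_score_wforums (indexed per document, yielding (id, similarity) pairs sorted by decreasing similarity), it presumably looks roughly like the sketch below; the return type and internals are assumptions.

In [ ]:
# Hypothetical sketch of nlp_models.make_lsi_similarity_matrix (assumption).
import pandas as pd
from gensim import models, similarities

def make_lsi_similarity_matrix_sketch(corpus_tfidf, dictionary, num_topics):
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features=num_topics)
    rows = []
    for doc in corpus_tfidf:
        sims = index[lsi[doc]]                                   # cosine similarity to every document
        rows.append(sorted(enumerate(sims), key=lambda s: -s[1]))
    return pd.Series(rows), lsi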

In [9]:
num_iter = 500
x = []
y = []
for num_topics in range(10, 200, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics, num_iter)
    model_score = get_model_score_wforums(ids, matsim, categories, titles)
    print("N. topics {}, score {}".format(num_topics, model_score))
    x.append(num_topics)
    y.append(model_score)


N. topics 10, score 0.2835820895522388
N. topics 20, score 0.38636363636363635
N. topics 30, score 0.3076923076923077
N. topics 40, score 0.3617021276595745
N. topics 50, score 0.25
N. topics 60, score 0.2727272727272727
N. topics 70, score 0.375
N. topics 80, score 0.6
N. topics 90, score 0.2
N. topics 100, score 0.14285714285714285
N. topics 110, score 0.11764705882352941
N. topics 120, score 0.42857142857142855
N. topics 130, score 0.1
N. topics 140, score 0.0
N. topics 150, score 0.75
N. topics 160, score 0.16666666666666666
N. topics 170, score 0.5555555555555556
N. topics 180, score 0.6666666666666666
N. topics 190, score 0.47368421052631576
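
Similarly, make_lda_similarity_matrix is assumed to be the LDA analogue of the sketch above, with num_iter controlling the training iterations; again a sketch under those assumptions, not the actual nlp_models code.

In [ ]:
# Hypothetical LDA analogue of the LSI sketch (assumption).
import pandas as pd
from gensim import models, similarities

def make_lda_similarity_matrix_sketch(corpus, dictionary, num_topics, num_iter):
    lda = models.LdaModel(corpus, id2word=dictionary,
                          num_topics=num_topics, iterations=num_iter)
    index = similarities.MatrixSimilarity(lda[corpus], num_features=num_topics)
    rows = []
    for doc in corpus:
        sims = index[lda[doc]]                                   # similarity in topic space
        rows.append(sorted(enumerate(sims), key=lambda s: -s[1]))
    return pd.Series(rows), lda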

In [10]:
# Only pyplot is needed for this plot
import matplotlib.pyplot as plt

In [11]:
plt.rcParams['text.usetex'] = True

plt.rcParams.update({'font.size': 24})
label_size = 18
plt.rcParams['xtick.labelsize'] = label_size
plt.rcParams['ytick.labelsize'] = label_size

fig = plt.figure()
fig.set_size_inches(6, 6)

plt.xlabel('N. topics', fontsize=label_size)
plt.ylabel('Score', fontsize=label_size)

# LDA model score vs. number of topics
plt.plot(x, y)

plt.show()



In [ ]: