In [1]:
import pandas as pd
import os
import numpy as np
from gensim import corpora, models, similarities

from nlp_models import (get_model_score, make_corpus,
                        make_lsi_similarity_matrix, make_lda_similarity_matrix)

In [2]:
# Set up paths: keep this notebook's directory importable and work from ../data
import os
import sys

this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)

In [3]:
# Read dataframe
input_fname="AutismParentMagazine-posts-tokens.csv"

df = pd.read_csv(input_fname, index_col=0)

# Drop any repeated rows and reset the index
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head(2)


Out[3]:
   title | source | category | text | href | tokens
0  Autism, Head Banging and other Self Harming Be... | https://www.autismparentingmagazine.com/ | category-applied-behavior-analysis-aba | For children with autism spectrum disorder (AS... | https://www.autismparentingmagazine.com/autism... | ['for', 'children', 'with', 'autism', 'spectru...
1  High Quality ABA Treatment: What Every Parent... | https://www.autismparentingmagazine.com/ | category-applied-behavior-analysis-aba | Dr. Stephen Shore once said “If you’ve met one... | https://www.autismparentingmagazine.com/high-q... | ['dr', 'stephen', 'shore', 'once', 'said', 'if...

In [4]:
# Are there articles that appear in more than one category?
for ii in df.index:
    title = df.loc[ii, 'title']
    rows = df.loc[df['title'] == title]
    ncategory = len(pd.unique(rows['category']))
    if ncategory > 1:
        print("Title {}".format(title))
        print(pd.unique(rows['category']))


Title Autism
['category-autism-articles' 'category-general']
Title Autism
['category-autism-articles' 'category-general']

Only one article appears in two categories. Since the two categories are very similar, the cleanest fix is to merge them, which the next cell does.


In [5]:
# Merge the two categories above into one.
cat1 = 'category-autism-articles'
cat2 = 'category-general'

# Relabel every row in cat1 as cat2 with a single vectorized assignment
df.loc[df['category'] == cat1, 'category'] = cat2

# Sanity check: df.loc[df['category'] == cat2]

In [6]:
# Extract series from df:
categories=df['category']
ids=df.index

In [7]:
from ast import literal_eval  

# The 'tokens' column was read back from CSV as strings; parse them into token lists
documents = df['tokens'].values
for idoc in range(len(documents)):
    documents[idoc] = literal_eval(documents[idoc])
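
The next cell builds the corpus with make_corpus from the local nlp_models module, whose source is not included in this notebook. As a reference point, here is a minimal sketch of what such a helper usually looks like with gensim; this is an assumption about its internals, not the module's actual code:

from gensim import corpora

def make_corpus(documents):
    # Map each token to an integer id, then represent every document
    # as a bag-of-words vector over that dictionary.
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    return corpus, dictionary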

In [11]:
corpus, dictionary = make_corpus(documents)

# TF-IDF weighting of the bag-of-words corpus; LSI is fit on the weighted vectors
tfidf = models.TfidfModel(corpus)
#tfidf.save('tfidf.save')
print("LSI model")

# Scan the number of LSI topics and score each resulting similarity matrix
for num_topics in range(100, 500, 100):
    matsim, lsi = make_lsi_similarity_matrix(tfidf[corpus], dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


LSI model
N. topics 100, score 0.6901408450704225
N. topics 200, score 0.6901408450704225
N. topics 300, score 0.6901408450704225
N. topics 400, score 0.6901408450704225
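
The similarity-matrix helpers make_lsi_similarity_matrix and make_lda_similarity_matrix also come from nlp_models. A minimal sketch of how such helpers are typically written with gensim follows; it is an assumption about their internals, not necessarily the module's implementation:

from gensim import models, similarities

def make_lsi_similarity_matrix(corpus_tfidf, dictionary, num_topics):
    # Fit an LSI model on the TF-IDF-weighted corpus and index the
    # transformed documents for cosine-similarity queries.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    matsim = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features=num_topics)
    return matsim, lsi

def make_lda_similarity_matrix(corpus, dictionary, num_topics):
    # Same idea, but LDA is fit on the raw bag-of-words counts.
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    matsim = similarities.MatrixSimilarity(lda[corpus], num_features=num_topics)
    return matsim, lda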

In [12]:
print("LDA model")

for num_topics in range(100,500,100):
    matsim,lda = make_lda_similarity_matrix(corpus, dictionary,num_topics)
    model_score= get_model_score(ids,matsim,categories)
    print("N. topics {}, score {}".format(num_topics,model_score))


LDA model
N. topics 100, score 0.4413145539906103
N. topics 200, score 0.2269170579029734
N. topics 300, score 0.215962441314554
N. topics 400, score 0.13615023474178403
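
The score itself comes from get_model_score in nlp_models, whose definition is also not shown here. One plausible reading, used only to interpret the numbers above, is a nearest-neighbour category match rate; the sketch below is hypothetical and may differ from the real implementation:

import numpy as np

def get_model_score(ids, matsim, categories):
    # Hypothetical scoring scheme: the fraction of articles whose most
    # similar *other* article, according to the model, shares its category.
    # Iterating over a gensim similarity index yields one row of
    # document-vs-all-documents cosine similarities per document.
    sim_matrix = np.vstack([row for row in matsim])
    hits = 0
    for pos, doc_id in enumerate(ids):
        row = sim_matrix[pos].copy()
        row[pos] = -np.inf                      # exclude the article itself
        nearest = int(np.argmax(row))
        hits += int(categories.iloc[pos] == categories.iloc[nearest])
    return hits / len(ids)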

In [13]:
# LDA scores drop as the number of topics grows, so scan a smaller range
for num_topics in range(10, 100, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


N. topics 10, score 0.7151799687010955
N. topics 20, score 0.6447574334898278
N. topics 30, score 0.6791862284820032
N. topics 40, score 0.6103286384976526
N. topics 50, score 0.5837245696400626
N. topics 60, score 0.5305164319248826
N. topics 70, score 0.5054773082942097
N. topics 80, score 0.46322378716744916
N. topics 90, score 0.5117370892018779

In [14]:
# Refine further with very small numbers of topics
for num_topics in range(2, 10, 2):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


N. topics 2, score 0.5852895148669797
N. topics 4, score 0.6901408450704225
N. topics 6, score 0.7856025039123631
N. topics 8, score 0.755868544600939
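
The best score so far comes from LDA with 6 topics. If that configuration were kept for downstream use, the fitted model and its similarity index could be saved, in the same spirit as the commented-out tfidf.save above. A sketch with placeholder filenames, assuming matsim is a gensim similarity index:

# Refit the best-scoring configuration and persist it (filenames are hypothetical).
matsim, lda = make_lda_similarity_matrix(corpus, dictionary, 6)
lda.save('lda-6topics.model')        # gensim LdaModel.save
matsim.save('lda-6topics.index')     # gensim MatrixSimilarity.save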
