In [1]:
import pandas as pd
import os
import numpy as np
from gensim import corpora, models, similarities

from nlp_models import (get_model_score, make_corpus,
                        make_lsi_similarity_matrix, make_lda_similarity_matrix)

In [2]:
# Set up paths: keep this notebook's directory importable and work from ../data
import os
import sys

this_path = os.getcwd()
os.chdir("../data")
sys.path.insert(0, this_path)

In [3]:
# Read dataframe
input_fname="AutismParentMagazine-posts-tokens.csv"

df = pd.read_csv(input_fname, index_col=0)

# Drop any repeated rows and reset the index
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head(2)


Out[3]:
   title | source | category | text | href | tokens
0  Autism, Head Banging and other Self Harming Be... | https://www.autismparentingmagazine.com/ | category-applied-behavior-analysis-aba | For children with autism spectrum disorder (AS... | https://www.autismparentingmagazine.com/autism... | ['for', 'children', 'with', 'autism', 'spectru...
1  High Quality ABA Treatment: What Every Parent... | https://www.autismparentingmagazine.com/ | category-applied-behavior-analysis-aba | Dr. Stephen Shore once said “If you’ve met one... | https://www.autismparentingmagazine.com/high-q... | ['dr', 'stephen', 'shore', 'once', 'said', 'if...

In [4]:
# Are there articles that appear in more than one category?
for ii in df.index:
    title = df.loc[ii, 'title']
    rows = df.loc[df['title'] == title]
    ncategory = len(pd.unique(rows['category']))
    if ncategory > 1:
        print("Title {}".format(title))
        print(pd.unique(rows['category']))


Title Autism
['category-autism-articles' 'category-general']
Title Autism
['category-autism-articles' 'category-general']

Only one article appears in two categories. Since the two categories are very similar, the cleanest fix is to merge them, which the next cell does.


In [5]:
# Merge the two categories above into one.
cat1 = 'category-autism-articles'
cat2 = 'category-general'

# Relabel every row in cat1 as cat2 with a single vectorized assignment
df.loc[df['category'] == cat1, 'category'] = cat2

# Sanity check: df.loc[df['category'] == cat2]

In [6]:
# Extract series from df:
categories=df['category']
ids=df.index

In [7]:
from ast import literal_eval  

# The 'tokens' column was read back from CSV as strings; parse them into token lists
documents = df['tokens'].values
for idoc in range(len(documents)):
    documents[idoc] = literal_eval(documents[idoc])
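
The next cell builds the corpus with make_corpus from the local nlp_models module, whose source is not included in this notebook. As a reference point, here is a minimal sketch of what such a helper usually looks like with gensim; this is an assumption about its internals, not the module's actual code:

from gensim import corpora

def make_corpus(documents):
    # Map each token to an integer id, then represent every document
    # as a bag-of-words vector over that dictionary.
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    return corpus, dictionary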

In [11]:
corpus, dictionary = make_corpus(documents)

# TF-IDF weighting of the bag-of-words corpus; LSI is fit on the weighted vectors
tfidf = models.TfidfModel(corpus)
#tfidf.save('tfidf.save')
print("LSI model")

# Scan the number of LSI topics and score each resulting similarity matrix
for num_topics in range(100, 500, 100):
    matsim, lsi = make_lsi_similarity_matrix(tfidf[corpus], dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


LSI model
N. topics 100, score 0.6901408450704225
N. topics 200, score 0.6901408450704225
N. topics 300, score 0.6901408450704225
N. topics 400, score 0.6901408450704225
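
The similarity-matrix helpers make_lsi_similarity_matrix and make_lda_similarity_matrix also come from nlp_models. A minimal sketch of how such helpers are typically written with gensim follows; it is an assumption about their internals, not necessarily the module's implementation:

from gensim import models, similarities

def make_lsi_similarity_matrix(corpus_tfidf, dictionary, num_topics):
    # Fit an LSI model on the TF-IDF-weighted corpus and index the
    # transformed documents for cosine-similarity queries.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    matsim = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features=num_topics)
    return matsim, lsi

def make_lda_similarity_matrix(corpus, dictionary, num_topics):
    # Same idea, but LDA is fit on the raw bag-of-words counts.
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    matsim = similarities.MatrixSimilarity(lda[corpus], num_features=num_topics)
    return matsim, lda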

In [12]:
print("LDA model")

for num_topics in range(100,500,100):
    matsim,lda = make_lda_similarity_matrix(corpus, dictionary,num_topics)
    model_score= get_model_score(ids,matsim,categories)
    print("N. topics {}, score {}".format(num_topics,model_score))


LDA model
N. topics 100, score 0.4413145539906103
N. topics 200, score 0.2269170579029734
N. topics 300, score 0.215962441314554
N. topics 400, score 0.13615023474178403
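
The score itself comes from get_model_score in nlp_models, whose definition is also not shown here. One plausible reading, used only to interpret the numbers above, is a nearest-neighbour category match rate; the sketch below is hypothetical and may differ from the real implementation:

import numpy as np

def get_model_score(ids, matsim, categories):
    # Hypothetical scoring scheme: the fraction of articles whose most
    # similar *other* article, according to the model, shares its category.
    # Iterating over a gensim similarity index yields one row of
    # document-vs-all-documents cosine similarities per document.
    sim_matrix = np.vstack([row for row in matsim])
    hits = 0
    for pos, doc_id in enumerate(ids):
        row = sim_matrix[pos].copy()
        row[pos] = -np.inf                      # exclude the article itself
        nearest = int(np.argmax(row))
        hits += int(categories.iloc[pos] == categories.iloc[nearest])
    return hits / len(ids)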

In [13]:
# LDA scores drop as the number of topics grows, so scan a smaller range
for num_topics in range(10, 100, 10):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


N. topics 10, score 0.7151799687010955
N. topics 20, score 0.6447574334898278
N. topics 30, score 0.6791862284820032
N. topics 40, score 0.6103286384976526
N. topics 50, score 0.5837245696400626
N. topics 60, score 0.5305164319248826
N. topics 70, score 0.5054773082942097
N. topics 80, score 0.46322378716744916
N. topics 90, score 0.5117370892018779

In [14]:
# Refine further with very small numbers of topics
for num_topics in range(2, 10, 2):
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))


N. topics 2, score 0.5852895148669797
N. topics 4, score 0.6901408450704225
N. topics 6, score 0.7856025039123631
N. topics 8, score 0.755868544600939
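
The best score so far comes from LDA with 6 topics. If that configuration were kept for downstream use, the fitted model and its similarity index could be saved, in the same spirit as the commented-out tfidf.save above. A sketch with placeholder filenames, assuming matsim is a gensim similarity index:

# Refit the best-scoring configuration and persist it (filenames are hypothetical).
matsim, lda = make_lda_similarity_matrix(corpus, dictionary, 6)
lda.save('lda-6topics.model')        # gensim LdaModel.save
matsim.save('lda-6topics.index')     # gensim MatrixSimilarity.save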
