In [1]:
import pandas as pd
import os
import numpy as np
from gensim import corpora, models, similarities
from gensim import models
from nlp_models import get_model_score,make_corpus,\
make_lsi_similarity_matrix,make_lda_similarity_matrix
In [2]:
# Paths / OS setup: switch the working directory to the data folder while
# keeping the original working directory on the import path.
import os
import sys

DATA_DIR = "../data"

# Capture the current directory *before* changing it.
this_path = os.getcwd()
os.chdir(DATA_DIR)
# Put the original directory first on sys.path.
sys.path.insert(0, this_path)
In [3]:
# Load the dataset; the first CSV column is used as the index.
input_fname = "AutismParentMagazine-posts-tokens.csv"
df = (
    pd.read_csv(input_fname, index_col=0)
    .drop_duplicates()        # discard rows that are exact repeats
    .reset_index(drop=True)   # renumber rows from 0 after the drop
)
# Preview the first two rows.
df.head(2)
Out[3]:
In [4]:
# Are there articles in several categories?
# Group the rows by title once and collect the distinct categories per title,
# instead of re-filtering the whole frame for every row. Each title that
# appears under more than one category is reported a single time, in order of
# first appearance.
categories_by_title = df.groupby('title', sort=False)['category'].unique()
for title, title_categories in categories_by_title.items():
    ncategory = len(title_categories)
    if ncategory > 1:
        print("Title {}".format(title))
        print(title_categories)
Only one article appears in two categories. Since those two categories are very similar, it is better to merge them.
In [5]:
# Merge two categories into one: every row labelled cat1 is relabelled cat2.
cat1 = 'category-autism-articles'
cat2 = 'category-general'

# Boolean mask of the rows that currently carry the cat1 label.
is_cat1 = df['category'] == cat1
row_index = df.loc[is_cat1].index
# Relabel all matching rows in a single assignment.
df.loc[is_cat1, 'category'] = cat2
In [6]:
# Pull out the pieces passed to get_model_score later on:
# the row labels and the category of each row.
ids = df.index
categories = df['category']
In [7]:
from ast import literal_eval

# Get the documents used to build the similarity matrices.
# Each 'tokens' cell is converted with str() and then parsed by literal_eval.
# The parsed values go into a fresh object array rather than being written
# back through df['tokens'].values: that in-place write silently changes df
# and fails when pandas hands out a read-only array (Copy-on-Write).
tokens_column = df['tokens']
documents = np.empty(len(tokens_column), dtype=object)
for idoc, raw_tokens in enumerate(tokens_column):
    documents[idoc] = literal_eval(str(raw_tokens))
In [11]:
# Build the corpus and dictionary from the documents, then fit TF-IDF on the corpus.
corpus, dictionary = make_corpus(documents)
tfidf = models.TfidfModel(corpus)

print("LSI model")
# Score the LSI similarity matrix for 100, 200, 300 and 400 topics.
lsi_topic_grid = range(100, 500, 100)
for num_topics in lsi_topic_grid:
    matsim, lsi = make_lsi_similarity_matrix(tfidf[corpus], dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))
In [12]:
print("LDA model")
# Coarse sweep: score the LDA similarity matrix for 100, 200, 300 and 400 topics.
# NOTE(review): no random seed is set anywhere in this notebook; if the helper's
# LDA training is stochastic, these scores will differ between runs — confirm.
lda_topic_grid = range(100, 500, 100)
for num_topics in lda_topic_grid:
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))
In [13]:
# Medium sweep: score the LDA similarity matrix for 10, 20, ..., 90 topics.
lda_topic_grid = range(10, 100, 10)
for num_topics in lda_topic_grid:
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))
In [14]:
# Fine sweep: score the LDA similarity matrix for 2, 4, 6 and 8 topics.
lda_topic_grid = range(2, 10, 2)
for num_topics in lda_topic_grid:
    matsim, lda = make_lda_similarity_matrix(corpus, dictionary, num_topics)
    model_score = get_model_score(ids, matsim, categories)
    print("N. topics {}, score {}".format(num_topics, model_score))
In [ ]: