In [1]:
import sys, os
import nltk
import pandas as pd
In [2]:
from collections import defaultdict
from gensim import corpora, models, similarities
def make_dictionary(documents):
    """
    Construct a dictionary, i.e. a mapping between word ids and their frequency of occurrence
    in the whole corpus, then filter it to remove stopwords and words occurring < min_count times.
    input:  documents, an iterable of token lists (one list of words per document)
    output: the filtered dictionary
    """
    dictionary = corpora.Dictionary(documents)
    stop_words = nltk.corpus.stopwords.words('english')
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
                if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return dictionary
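# Illustrative check (an addition, not part of the original pipeline): on a toy corpus the
# stopword 'the' is removed, as are all words appearing fewer than min_count = 2 times,
# so only 'sat' survives in the filtered dictionary. toy_docs is a made-up example.
toy_docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['parrot']]
print(make_dictionary(toy_docs).token2id)   # expected: {'sat': 0}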
def make_corpus(documents):
    """
    Build the filtered dictionary and convert each document to its bag-of-words vector.
    """
    dictionary = make_dictionary(documents)
    # convert corpus to vectors using the bag-of-words representation,
    # i.e. tuples of (word id, word count)
    corpus = [dictionary.doc2bow(words) for words in documents]
    return corpus, dictionary
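# Illustrative only (added): doc2bow maps a token list to sparse (word id, word count) pairs.
toy_dict = corpora.Dictionary([['autism', 'therapy'], ['autism', 'diet']])
print(toy_dict.doc2bow(['autism', 'autism', 'diet']))   # e.g. [(0, 2), (2, 1)]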
def make_lsi_similarity_matrix(tfidf_corpus, dictionary):
    """
    Construct an LSI (latent semantic indexing) model on the tf-idf-transformed corpus,
    save it to disk and return its document similarity matrix.
    """
    # construct model
    lsi = models.lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
    lsi.save('lsi-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lsi[tfidf_corpus], num_best=1000)
    return matsim
def make_lda_similarity_matrix(corpus, dictionary):
    """
    Construct an LDA (latent Dirichlet allocation) model on the bag-of-words corpus,
    save it to disk and return its document similarity matrix.
    """
    # construct model
    lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=300)
    lda.save('lda-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lda[corpus], num_best=1000)
    return matsim
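# Note (added comment): both functions return a gensim similarity index. Querying it with a
# single projected document vector yields up to num_best=1000 (document index, cosine
# similarity) pairs, sorted by decreasing similarity.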
In [3]:
# Move to the data directory and name the output file for the combined data set
os.chdir('../data')
output_fname = "articles-n-forums-posts.csv"
# Read the articles from file
input_fname = "AutismParentMagazine-posts-clean.csv"
df = pd.read_csv(input_fname, index_col=0)
df.index.name = 'post id'
df.head(2)
In [4]:
input_fname = "MedHelp-posts.csv"
df2 = pd.read_csv(input_fname, index_col=0)
df2['source'] = 'http://www.medhelp.org'
df2['category'] = 'forums'
# The user id is not needed here
del df2['user id']
# Remove the question posts from the forums (rows whose index equals their 'mother post id'),
# keeping only the answers
df3 = df2.drop(df2.loc[df2.index == df2['mother post id']].index)
del df3['mother post id']
df2 = df3
del df3
In [5]:
df2.head(5)
Out[5]:
In [6]:
# Join the two data sets
# (DataFrame.append is deprecated in recent pandas, so use pd.concat instead)
df = pd.concat([df, df2], ignore_index=True)
In [7]:
# Merge pairs of very similar categories into one.
category_merges = {
    'category-autism-articles': 'category-general',
    'category-applied-behavior-analysis-aba': 'category-autism-therapy',
    'category-autism-and-food': 'category-autism-and-diet',
}
df['category'] = df['category'].replace(category_merges)
In [8]:
# Build a list of categories for each title, in case the same title appears under several categories.
# Find the list of unique titles
unique_titles = df['title'].unique()
# Find the list of categories for each title:
dic_category = {}
for title in unique_titles:
    cat_list = df.loc[df['title'] == title]['category'].values
    # Remove repeated elements by converting to a set and back to a list
    cat_set = set(cat_list)
    cat_list = list(cat_set)
    dic_category[title] = cat_list
In [9]:
# Drop rows with duplicate titles and reset the index
df = df.drop_duplicates('title')
df = df.reset_index(drop=True)
In [10]:
# Write each title's list of categories (as a string) back into the 'category' column
for ii in df.index:
    title = df.loc[ii, ['title']].values[0]
    category = dic_category[title]
    df.loc[ii, ['category']] = str(category)
In [11]:
df.head(2)
Out[11]:
In [12]:
# Tokenize data
import nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Get list of tokens from text in first article:
text = df['text'][0].lower()
# Include the title as well:
title = df['title'][0].lower()
ttext = tokenizer.tokenize(title+" "+text)
print( text )
print( ttext )
In [13]:
# Build a column with the list of tokens:
# 1) join title and text into one string
# 2) convert to lower case
# 3) tokenize and save the result in a new column (tokens)
#df['tokens'] = df['text'].map(lambda x: tokenizer.tokenize(x.lower()))
df2 = df[['title', 'text']].apply(lambda x: ','.join(x.astype(str)), axis=1)
df['tokens'] = df2.map(lambda x: tokenizer.tokenize(x.lower()))
del df2
In [14]:
# Short version of the text (first 300 characters)
df['text_short'] = df['text'].apply(lambda x: x[:300])
In [15]:
print(df.loc[0,'text'])
print(df.loc[0,'text_short'])
In [16]:
df['tokens'].head(5)
# Save the dataframe with tokens to file
df.to_csv(output_fname)
In [17]:
# Build the corpus and dictionary, then the similarity matrices
documents = df['tokens'].values
print(documents[:3])
corpus, dictionary = make_corpus(documents)
# Save the dictionary and corpus to file
import pickle
pickle.dump(dictionary, open("dictionary.save", "wb"))
pickle.dump(corpus, open("corpus.save", "wb"))
# Build and save the tf-idf model
tfidf = models.TfidfModel(corpus)
tfidf.save('tfidf.save')
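# For illustration (added): the tf-idf model turns each bag-of-words vector into
# (word id, weight) pairs; inspect the first few weights of the first document.
print(tfidf[corpus[0]][:5])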
lsi_matsim = make_lsi_similarity_matrix(tfidf[corpus], dictionary)
lda_matsim = make_lda_similarity_matrix(corpus, dictionary)
# The models are saved into files in the above routines
# Save similarity matrices too:
pickle.dump(lsi_matsim,open("lsi-matsim.save","wb"))
pickle.dump(lda_matsim,open("lda-matsim.save","wb"))
In [ ]:
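# Minimal usage sketch (added, not part of the original pipeline): reload the artifacts
# saved above and query the LSI index with a new piece of text. It assumes the earlier
# cells have been run (so `tokenizer` and `df` exist); the query string and variable
# names below are illustrative only.
import pickle
from gensim import models

dictionary = pickle.load(open("dictionary.save", "rb"))
tfidf = models.TfidfModel.load('tfidf.save')
lsi = models.LsiModel.load('lsi-model.save')
lsi_matsim = pickle.load(open("lsi-matsim.save", "rb"))

query = "gluten free diet and autism"
query_bow = dictionary.doc2bow(tokenizer.tokenize(query.lower()))
sims = lsi_matsim[lsi[tfidf[query_bow]]]   # up to num_best (doc index, similarity) pairs
for doc_id, score in sims[:5]:
    print(f"{score:.3f}  {df.loc[doc_id, 'title']}")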