In [1]:
import nltk
from gensim import corpora, models, similarities

def make_dictionary(documents):
    """
    Construct a dictionary, i.e. a mapping between words and their integer ids,
    then filter it to remove stopwords and words occurring < min_count times
    in the whole corpus.
    input:  documents, an iterable of tokenized documents (lists of words)
    output: the filtered dictionary
    """
    dictionary = corpora.Dictionary(documents)
    stop_words = nltk.corpus.stopwords.words('english')
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
                if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return dictionary

def make_corpus(documents):
    """
    Build the filtered dictionary and convert every document to its
    bag-of-words representation, i.e. a list of (word id, word count) tuples.
    input:  documents, an iterable of tokenized documents (lists of words)
    output: (corpus, dictionary)
    """
    dictionary = make_dictionary(documents)
    corpus = [dictionary.doc2bow(words) for words in documents]
    return corpus, dictionary
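As a quick sanity check, here is a minimal usage sketch; the toy documents below are invented for illustration and assume the NLTK stopword list has been downloaded (nltk.download('stopwords')). Words with document frequency below min_count ('delay', 'diet') are dropped from the dictionary:

toy_docs = [["autism", "therapy", "speech", "therapy"],
            ["speech", "delay", "therapy"],
            ["autism", "diet"]]
toy_corpus, toy_dict = make_corpus(toy_docs)
# Each document becomes a list of (word id, count) tuples; e.g. the first
# one keeps "autism" once, "therapy" twice and "speech" once
print(toy_corpus)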
In [2]:
import os
import pandas as pd

# Read the dataset of tokenized posts from file
os.chdir('../data/')
input_fname = "AutismParentMagazine-posts-tokens.csv"

# Get the dataframe and article ids from the dataset
df = pd.read_csv(input_fname, index_col=0)
df.head(2)
ids = df.index
In [3]:
import pickle

# Load the previously trained models and the similarity index
lsi = models.LsiModel.load('lsi-model.save')
tfidf = models.TfidfModel.load('tfidf.save')
with open("dictionary.save", "rb") as f:
    dictionary = pickle.load(f)
matsim = similarities.MatrixSimilarity.load('lsi-matsim.save')
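For reference, a minimal sketch of how these files could have been produced in an earlier session; the documents variable, num_topics and num_best values are assumptions, not taken from this notebook (num_best is assumed because the query loop below expects (id, score) pairs):

corpus, dictionary = make_corpus(documents)  # documents: tokenized articles
tfidf = models.TfidfModel(corpus)            # tf-idf weighting of the BoW corpus
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=10)
matsim = similarities.MatrixSimilarity(lsi[tfidf[corpus]],
                                       num_features=len(dictionary), num_best=10)
lsi.save('lsi-model.save')
tfidf.save('tfidf.save')
matsim.save('lsi-matsim.save')
with open('dictionary.save', 'wb') as f:
    pickle.dump(dictionary, f)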
In [11]:
import nltk

query = "language speaking"

# Tokenize the query the same way the articles were tokenized
tokenizer = nltk.RegexpTokenizer(r'\w+')
text = query.lower()
ttext = tokenizer.tokenize(text)
print(text)
print(ttext)

# Convert the query to bag-of-words, then to LSI space via tf-idf
vec_bow = dictionary.doc2bow(ttext)
vec_lsi = lsi[tfidf[vec_bow]]

# Query the similarity index; this assumes it was built with num_best set,
# so that it returns (document index, cosine similarity) pairs
sims = matsim[vec_lsi]
print(sims)
for aid, score in sims:
    # assumes the index rows follow the row order of df
    title = df['title'].iloc[aid]
    print("{}: {}".format(title, score))