In [1]:
import nltk
from collections import defaultdict
from gensim import corpora, models, similarities
def make_dictionary(documents):
    """
    Construct a dictionary, i.e. a mapping between word ids and their frequency
    of occurrence in the whole corpus, then filter it to remove stopwords and
    words occurring fewer than min_count times.

    input: documents, an iterable of token lists (one list per document)
    output: the filtered dictionary
    """
    dictionary = corpora.Dictionary(documents)

    stop_words = nltk.corpus.stopwords.words('english')
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
                if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return dictionary

def make_corpus(documents):
    """
    Build the filtered dictionary and convert each document to its
    bag-of-words representation, i.e. a list of (word id, word count) tuples.

    input: documents, an iterable of token lists (one list per document)
    output: (corpus, dictionary)
    """
    dictionary = make_dictionary(documents)
    # convert each document to a vector using the bag-of-words representation
    corpus = [dictionary.doc2bow(words) for words in documents]
    return corpus, dictionary
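
As a quick sanity check, here is a minimal sketch of how these helpers fit together; the three toy documents below are made up for illustration and the cell has not been run (it also assumes the NLTK stopword list has been downloaded, e.g. via nltk.download('stopwords')).

In [ ]:
# Hypothetical toy corpus; with min_count = 2 only the words that appear in at
# least two documents ('child', 'school', 'support') survive the filtering.
toy_documents = [['child', 'school', 'support'],
                 ['school', 'teacher', 'support'],
                 ['child', 'therapy']]
toy_corpus, toy_dictionary = make_corpus(toy_documents)
print(toy_dictionary.token2id)
print(toy_corpus)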

In [2]:
import pandas as pd
import os

# Read the dataset from file:
os.chdir('../data/')
input_fname = "AutismParentMagazine-posts-tokens.csv"

# Get categories and ids from the dataset
df = pd.read_csv(input_fname, index_col=0)
df.head(2)
ids = df.index
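
The CSV presumably already stores each post as tokenized text. Assuming a column of space-separated tokens, the documents argument expected by make_corpus could be rebuilt as below; the column name 'tokens' is an assumption, not taken from the file, so adjust it to the actual header.

In [ ]:
# Assumption: the tokenized text lives in a column named 'tokens'
# as a space-separated string.
documents = [str(tokens).split() for tokens in df['tokens']]
print(documents[0][:10])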

In [3]:
import pickle
# Load the saved models, dictionary and similarity index
lsi = models.LsiModel.load('lsi-model.save')
tfidf = models.TfidfModel.load('tfidf.save')
dictionary = pickle.load(open("dictionary.save", "rb"))
matsim = similarities.MatrixSimilarity.load('lsi-matsim.save')
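
The cell above only loads artifacts trained elsewhere. For context, a hedged sketch of how they could have been produced with gensim from the documents built earlier; the number of LSI topics (10) and the num_best=4 setting of the similarity index are assumptions (the latter inferred from the four hits returned per query below), not values read from the saved files.

In [ ]:
# Sketch of the training/saving pipeline assumed by the loads above.
corpus, dictionary = make_corpus(documents)
tfidf = models.TfidfModel(corpus)                                        # tf-idf weighting
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)   # latent semantic indexing
matsim = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_best=4)    # dense cosine-similarity index
lsi.save('lsi-model.save')
tfidf.save('tfidf.save')
pickle.dump(dictionary, open('dictionary.save', 'wb'))
matsim.save('lsi-matsim.save')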

In [11]:
query="language speaking"

# Tokenize data
import nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')

# Get list of tokens from text in first article:
text = query.lower()
ttext = tokenizer.tokenize(text)
print( text )
print( ttext )
vec_bow = dictionary.doc2bow(query.lower().split())

vec_lsi = lsi[tfidf[vec_bow]] # convert the query to LSI space

sims = matsim[vec_lsi]
print(sims)

for aid,score in sims:
    title=df['title'][aid]
    print("{}: {}".format(title,score))


language speaking
['language', 'speaking']
[(120, 0.81458425521850586), (191, 0.34226500988006592), (44, 0.31233096122741699), (31, 0.31233096122741699)]
Targeting Language Delays: 0.8145842552185059
Issue 19 – Power of Language: 0.3422650098800659
Successful Indian Head Massage for People with Special Needs: 0.312330961227417
Successful Indian Head Massage for People with Special Needs: 0.312330961227417
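
The same steps wrap naturally into a small helper; this is just a convenience sketch over the objects loaded above, reusing the tokenizer from the previous cell.

In [ ]:
def search(query):
    """Return (article title, similarity score) pairs for the query."""
    tokens = tokenizer.tokenize(query.lower())
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[tfidf[vec_bow]]    # project the query into LSI space
    sims = matsim[vec_lsi]           # (document id, cosine similarity) pairs
    return [(df['title'][aid], score) for aid, score in sims]

for title, score in search("language speaking"):
    print("{}: {}".format(title, score))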

In [ ]: