In [2]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora
import gensim
import sys
sys.path.append('/research/edubot/repo/edubot/utils/')
import utils
In this exercise we will infer the topics of our questions using LDA (Latent Dirichlet Allocation) topic modelling.
In [12]:
#We have a pre-loaded file with all the questions.
with open(utils.data_path+'lda/questions.txt', 'r') as f:
    questions = f.readlines()
len(questions)
Out[12]:
An example of a question:
In [4]:
questions[0]
Out[4]:
We create a list of tokenized and stemmed questions, with stop-words removed.
In [6]:
norm_texts = [utils.tokenize_and_stem(question) for question in questions]
In [7]:
norm_texts[0]
Out[7]:
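The tokenize_and_stem helper lives in the utils module, which is not shown here. A minimal sketch of what such a helper could look like, assuming the RegexpTokenizer, stop-words list and PorterStemmer imported above (the actual utils implementation may differ):
In [ ]:
#Hypothetical sketch of utils.tokenize_and_stem; the real helper lives in
#the utils module and may differ in detail.
def tokenize_and_stem_sketch(text):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    stop_words = set(get_stop_words('en'))
    tokens = tokenizer.tokenize(text.lower())
    return [stemmer.stem(token) for token in tokens if token not in stop_words]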
LDA needs a dictionary object, a mapping between words and numeric ids.
In [13]:
#Turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(norm_texts)
type(dictionary)
Out[13]:
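The mapping itself can be inspected through the dictionary's token2id attribute:
In [ ]:
#Peek at a few (token, id) pairs from the id <-> term mapping.
list(dictionary.token2id.items())[:5]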
In [17]:
#Saving the dictionary object to be used later.
dictionary.save(utils.data_path+'lda/lda_dictionary')
In [8]:
#Loading the existing dictionary object.
dictionary = corpora.Dictionary.load(utils.data_path+'lda/lda_dictionary')
Now, with our dictionary, we create a bag-of-words representation of each question.
In [9]:
# convert tokenized documents into a list of bag-of-words vectors
bows = [dictionary.doc2bow(text) for text in norm_texts]
len(bows)
Out[9]:
An example of a bag-of-words element: a list of (word id, frequency in the question) pairs.
In [10]:
bows[0]
Out[10]:
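The word ids can be mapped back to their (stemmed) tokens with the dictionary, which makes the vector easier to read:
In [ ]:
#Decode the first bag-of-words vector back into (token, count) pairs.
[(dictionary[word_id], count) for word_id, count in bows[0]]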
Trying different parameter settings, we found that using 10 topics gives results that make sense.
In [14]:
ldamodel = gensim.models.ldamodel.LdaModel(bows, num_topics=10, id2word=dictionary, passes=20)
ldamodel.print_topics()
Out[14]:
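One way to make the choice of num_topics less ad hoc is a topic-coherence sweep. A sketch using gensim's CoherenceModel (available in recent gensim versions; the candidate topic counts below are illustrative):
In [ ]:
#Sketch: compare topic coherence for a few candidate topic counts.
#CoherenceModel needs the tokenized texts, not the bag-of-words vectors.
from gensim.models import CoherenceModel

for k in [5, 10, 15, 20]:
    model = gensim.models.ldamodel.LdaModel(bows, num_topics=k, id2word=dictionary, passes=20)
    cm = CoherenceModel(model=model, texts=norm_texts, dictionary=dictionary, coherence='c_v')
    print('num_topics=%d, c_v coherence=%.3f' % (k, cm.get_coherence()))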
In [14]:
ldamodel.save(utils.data_path+'lda/lda_model')
In [13]:
ldamodel = gensim.models.LdaModel.load(utils.data_path+'lda/lda_model')
We now create a list of (topic, question) pairs, one for each question in our data set, using the most probable topic. We will also use this function to infer the topics of completely new questions.
In [17]:
def create_topic_pars(pars, ldamodel, word_dictionary):
    norm_pars = [utils.tokenize_and_stem(par) for par in pars]
    print('created normalized paragraphs object of length %d' % len(norm_pars))
    bows = [word_dictionary.doc2bow(text) for text in norm_pars]
    print('created bag-of-words object of length %d' % len(bows))
    topic_pars = []
    for idx, val in enumerate(bows):
        lda_vector = ldamodel[val]
        #Most relevant LDA topic for this paragraph. We attach the original
        #paragraph here, not the cleaned version that we used for LDA.
        best_topic = max(lda_vector, key=lambda item: item[1])[0]
        topic_pars.append([ldamodel.print_topic(best_topic), pars[idx]])
    return topic_pars
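Indexing the model with a bag-of-words vector returns a sparse list of (topic_id, probability) pairs; the max() call above reduces this to the single most probable topic:
In [ ]:
#Topic distribution for the first question: (topic_id, probability) pairs.
ldamodel[bows[0]]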
In [18]:
topic_pars = create_topic_pars(questions, ldamodel, dictionary)
An example of the topic assigned to an existing question:
In [28]:
topic_pars[23000]
Out[28]:
Now we try some completely new questions:
In [30]:
new_questions = [
    'hey, i really think we should get more time to answer the questions in our quizzes, this sucks!!! :((()))',
    'I am really sad, dont know how im going to be happy again, anyone knows?',
    'is there anyway i can improve my grammar online?',
    'I really did not understand the difference between mean and median, anyone who can help me!!!'
]
In [31]:
new_topic_pars = create_topic_pars(new_questions, ldamodel, dictionary)
In [32]:
new_topic_pars
Out[32]:
This is clearly an English grammar question:
In [34]:
new_topic_pars[2]
Out[34]:
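create_topic_pars keeps only the single most probable topic. To see the full distribution for this question, gensim's get_document_topics can be used (the probability cutoff below is illustrative):
In [ ]:
#Full topic distribution for the grammar question; topics below the
#probability cutoff are dropped.
bow = dictionary.doc2bow(utils.tokenize_and_stem(new_questions[2]))
ldamodel.get_document_topics(bow, minimum_probability=0.05)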
Let's check the questions in our dataset that were assigned the same topic:
In [35]:
grammar_answers = [topic_par for topic_par in topic_pars if topic_par[0] == new_topic_pars[2][0]]
In [36]:
len(grammar_answers)
Out[36]:
In [37]:
grammar_answers[502:505]
Out[37]: