In [2]:
# Tokenization, stop-word removal, and stemming
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
# Topic modelling
from gensim import corpora, models
import gensim
import re
import numpy as np
import pandas as pd
# Project utilities (data_path, tokenize_and_stem)
import sys
sys.path.append('/research/edubot/repo/edubot/utils/')
import utils

Exploration of data with LDA

In this exercise we infer the topics of the questions in our data set using LDA (Latent Dirichlet Allocation) topic modelling.

Reading the data


In [12]:
# We have a pre-loaded file with all the questions.
with open(utils.data_path + 'lda/questions.txt', 'r') as f:
    questions = f.readlines()
len(questions)


Out[12]:
23838

An example of a question:


In [4]:
questions[0]


Out[4]:
'Hi, I am <<FULLNAME>> <<FULLNAME>>.  I am a retired medical worker living in East Tennessee near the Smokey Mountains.  I have an abiding interest in all things spiritual and psychological.  I am also an InterFaith minister, that never really practiced that profession except among my friends.  A close friend is also doing this course for her social worker credits which is how I found out about it.  I like doing online courses where the participants share their experiences in an online group like this.  Looking forward to getting to know you all. Love and blessings,  CRose\n'

We create a list of tokenized and stemmed questions, removing stop words along the way.


In [6]:
norm_texts = [utils.tokenize_and_stem(question) for question in questions]

In [7]:
norm_texts[0]


Out[7]:
['hi',
 'fullnam',
 'fullnam',
 'retir',
 'medic',
 'worker',
 'live',
 'east',
 'tennesse',
 'near',
 'smokey',
 'mountain',
 'abid',
 'interest',
 'thing',
 'spiritu',
 'psycholog',
 'also',
 'interfaith',
 'minist',
 'never',
 'realli',
 'practic',
 'profess',
 'except',
 'among',
 'friend',
 'close',
 'friend',
 'also',
 'cours',
 'social',
 'worker',
 'credit',
 'found',
 'like',
 'onlin',
 'cours',
 'particip',
 'share',
 'experi',
 'onlin',
 'group',
 'like',
 'look',
 'forward',
 'get',
 'know',
 'love',
 'bless',
 'crose']
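
utils.tokenize_and_stem itself lives in the project's utils module and is not shown here. Judging from the imports at the top, a minimal sketch could look like the following; the exact tokenizer pattern and stop-word list are assumptions:


In [ ]:
tokenizer = RegexpTokenizer(r'\w+')  # keep runs of word characters, dropping punctuation
en_stop = get_stop_words('en')       # English stop-word list
stemmer = PorterStemmer()

def tokenize_and_stem(text):
    # Lowercase, tokenize, drop stop words, and Porter-stem each remaining token.
    tokens = tokenizer.tokenize(text.lower())
    return [stemmer.stem(token) for token in tokens if token not in en_stop]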

Create Dictionary object

LDA needs a Dictionary object: a mapping between words and numeric ids.


In [13]:
#Turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(norm_texts)
type(dictionary)


Out[13]:
gensim.corpora.dictionary.Dictionary
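
The mapping can be inspected directly through the token2id attribute; for example, looking up the id of a stemmed token (a hypothetical lookup, assuming 'cours' made it into the vocabulary):


In [ ]:
# Numeric id the dictionary assigned to the stemmed token 'cours'
dictionary.token2id['cours']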

In [17]:
# Saving the dictionary object to be used later.
dictionary.save(utils.data_path+'lda/lda_dictionary')

In [8]:
# Loading an existing dictionary object.
dictionary = corpora.Dictionary.load(utils.data_path+'lda/lda_dictionary')

Create BOW object

Using the dictionary, we convert each tokenized question into a bag-of-words vector.


In [9]:
# Convert the tokenized documents into bag-of-words vectors
bows = [dictionary.doc2bow(text) for text in norm_texts]
len(bows)


Out[9]:
23838

An example of a bag-of-words element: a list of (word id, frequency in the document) pairs.


In [10]:
bows[0]


Out[10]:
[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 2),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 2),
 (32, 1),
 (33, 2),
 (34, 1),
 (35, 1),
 (36, 2),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 2),
 (42, 1),
 (43, 2)]
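
To make these pairs human-readable, each id can be mapped back to its token through the dictionary (a hypothetical snippet):


In [ ]:
# Replace each word id with its token: a list of (token, frequency) pairs
[(dictionary[word_id], count) for word_id, count in bows[0]]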

Train the LDA model

After trying different parameter settings, we found that 10 topics gives results that make sense.


In [14]:
# Train with 10 topics, making 20 passes over the corpus
ldamodel = gensim.models.ldamodel.LdaModel(bows, num_topics=10, id2word=dictionary, passes=20)
ldamodel.print_topics()


Out[14]:
[(0,
  '0.035*"get" + 0.027*"cours" + 0.022*"student" + 0.018*"take" + 0.018*"even" + 0.017*"submit" + 0.017*"respons" + 0.017*"start" + 0.016*"help" + 0.015*"question"'),
 (1,
  '0.023*"0" + 0.019*"1" + 0.019*"3" + 0.016*"featur" + 0.015*"peer" + 0.014*"2" + 0.011*"pass" + 0.011*"hw4" + 0.011*"4" + 0.011*"10"'),
 (2,
  '0.043*"video" + 0.032*"1" + 0.029*"2" + 0.027*"http" + 0.025*"com" + 0.024*"error" + 0.019*"submiss" + 0.017*"youtub" + 0.017*"post" + 0.015*"time"'),
 (3,
  '0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"'),
 (4,
  '0.019*"peopl" + 0.009*"one" + 0.009*"use" + 0.009*"children" + 0.009*"can" + 0.009*"will" + 0.008*"problem" + 0.008*"mani" + 0.008*"countri" + 0.008*"world"'),
 (5,
  '0.043*"movi" + 0.028*"public" + 0.018*"us" + 0.015*"govern" + 0.013*"select" + 0.011*"rate" + 0.011*"world" + 0.010*"servic" + 0.010*"rest" + 0.010*"internet"'),
 (6,
  '0.016*"v" + 0.012*"one" + 0.011*"s" + 0.009*"exampl" + 0.009*"apolog" + 0.009*"end" + 0.008*"guess" + 0.008*"mother" + 0.008*"say" + 0.007*"depend"'),
 (7,
  '0.025*"t" + 0.022*"thank" + 0.021*"can" + 0.016*"answer" + 0.014*"question" + 0.013*"cours" + 0.013*"use" + 0.012*"pleas" + 0.011*"m" + 0.011*"week"'),
 (8,
  '0.029*"happi" + 0.016*"feel" + 0.014*"can" + 0.013*"peopl" + 0.013*"life" + 0.012*"think" + 0.012*"t" + 0.010*"s" + 0.010*"make" + 0.009*"person"'),
 (9,
  '0.019*"rb" + 0.017*"certif" + 0.016*"essay" + 0.013*"t" + 0.013*"follow" + 0.012*"caus" + 0.012*"fatal" + 0.011*"draft" + 0.010*"exist" + 0.009*"key"')]

In [14]:
ldamodel.save(utils.data_path+'lda/lda_model')

In [13]:
ldamodel = gensim.models.LdaModel.load(utils.data_path+'lda/lda_model')

Running the model on old and new data

We now create a (topic, question) pair for each question in our data set, using the most probable topic for each question. The same function lets us infer the topic of a completely new question.


In [17]:
def create_topic_pars(pars, ldamodel, word_dictionary):
    norm_pars = [utils.tokenize_and_stem(par) for par in pars]
    print('created normalized paragraphs object of length %d' % len(norm_pars))
    bows = [word_dictionary.doc2bow(text) for text in norm_pars]
    print('created bag-of-words object of length %d' % len(bows))
    topic_pars = []
    for idx, bow in enumerate(bows):
        lda_vector = ldamodel[bow]
        # Most probable topic for this paragraph:
        top_topic = max(lda_vector, key=lambda item: item[1])[0]
        # We attach the original paragraph, not the normalized version used for LDA.
        topic_pars.append([ldamodel.print_topic(top_topic), pars[idx]])
    return topic_pars

In [18]:
topic_pars = create_topic_pars(questions, ldamodel, dictionary)


created normalized paragraphs object of length 23838
created bag-of-words object of length 23838
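
As a quick sanity check, we could count how many questions land in each topic (a hypothetical summary using collections.Counter):


In [ ]:
from collections import Counter

# Count questions per topic, keyed by the topic's word string
topic_counts = Counter(topic for topic, _ in topic_pars)
for topic, count in topic_counts.most_common():
    print('%5d  %s' % (count, topic[:60]))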

An example of the topic assigned to an existing question:


In [28]:
topic_pars[23000]


Out[28]:
['0.025*"t" + 0.022*"thank" + 0.021*"can" + 0.016*"answer" + 0.014*"question" + 0.013*"cours" + 0.013*"use" + 0.012*"pleas" + 0.011*"m" + 0.011*"week"',
 "Hello.I submitted my solution to HW1 and the auto grader runs some tests for 'invalid' guesses. I have logic in my hangperson_game.rb file that sets a instance variable '@valid' to either true or false. How does the autograder expect the sinatra app to behave when given an invalid guess?Thanks for your help<<FULLNAME>>\n"]

Now we try some completely new questions:


In [30]:
new_questions = ['hey, i really think we should get more time to answer the questions in our quizzes, this sucks!!! :((()))', 'I am really sad, dont know how im going to be happy again, anyone knows?', 'is there anyway i can improve my grammar online?', 'I really did not understand the difference between mean and median, anyone who can help me!!!']

In [31]:
new_topic_pars = create_topic_pars(new_questions, ldamodel, dictionary)


created normalized paragraphs object of length 4
created bag-of-words object of length 4

In [32]:
new_topic_pars


Out[32]:
[['0.019*"peopl" + 0.009*"one" + 0.009*"use" + 0.009*"children" + 0.009*"can" + 0.009*"will" + 0.008*"problem" + 0.008*"mani" + 0.008*"countri" + 0.008*"world"',
  'hey, i really think we should get more time to answer the questions in our quizzes, this sucks!!! :((()))'],
 ['0.029*"happi" + 0.016*"feel" + 0.014*"can" + 0.013*"peopl" + 0.013*"life" + 0.012*"think" + 0.012*"t" + 0.010*"s" + 0.010*"make" + 0.009*"person"',
  'I am really sad, dont know how im going to be happy again, anyone knows?'],
 ['0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"',
  'is there anyway i can improve my grammar online?'],
 ['0.029*"happi" + 0.016*"feel" + 0.014*"can" + 0.013*"peopl" + 0.013*"life" + 0.012*"think" + 0.012*"t" + 0.010*"s" + 0.010*"make" + 0.009*"person"',
  'I really did not understand the difference between mean and median, anyone who can help me!!!']]

This is clearly an English grammar question:


In [34]:
new_topic_pars[2]


Out[34]:
['0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"',
 'is there anyway i can improve my grammar online?']

Let's check the questions in our data set that were assigned the same topic:


In [35]:
grammar_answers = [topic_par for topic_par in topic_pars if topic_par[0] == new_topic_pars[2][0]]

In [36]:
len(grammar_answers)


Out[36]:
1358

In [37]:
grammar_answers[502:505]


Out[37]:
[['0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"',
  'Dependent clauses go first:1. Because Marina and Tolya arrived at the airport before noon, I did not see them there.2. Because he slept during the day, Nick had problems to fall asleep at night.3. After they left the hotel room, Nick and <<FULLNAME>> realized that they forgot the keys insideIndependent clauses go first:1. The astronaut said that people will live on other planets someday.2. I love to drink coffee because it gives me energy in the morning.3. This assignment is easy if you have taken grammar course.\n'],
 ['0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"',
  'I found this type of advise very useful.  I try to listen as much English in movies, especially the ones I love and from there I get ideas and vocabulary needed.\n'],
 ['0.035*"write" + 0.016*"read" + 0.016*"english" + 0.016*"homework" + 0.014*"fullnam" + 0.013*"m" + 0.013*"book" + 0.012*"s" + 0.012*"cours" + 0.012*"sentenc"',
  'Hi! I am <<FULLNAME>> From Costa Rica, I hope improve on my English. Thanks for this opportunity!Regards for all!\n']]
