In [ ]:
from __future__ import division
import pickle
import os
import random
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')
from lentil import datatools
from lentil import models
from lentil import est
from lentil import evaluate
%matplotlib inline
In [ ]:
import logging

# Lower the root logger's threshold so DEBUG-level records are not filtered out.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
Load an interaction history
In [ ]:
# Location of the pickled interaction history, relative to the working directory.
history_dir = 'data'
history_filename = 'assistments_2009_2010.pkl'
history_path = os.path.join(history_dir, history_filename)
In [ ]:
# Load the pickled interaction history from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing, so
# history_path must only ever point at a file from a trusted source.
with open(history_path, 'rb') as f:
    history = pickle.load(f)
In [ ]:
# Shorthand for the history's interaction table; later cells filter it on
# 'module_type' and group it by 'module_id' / 'student_id'.
df = history.data
Train an embedding model on the interaction history and visualize the results
In [ ]:
embedding_dimension = 2

# Keyword options are collected in dicts and unpacked into the constructors,
# so every setting for a given object can be read (and tweaked) in one place.
model_options = dict(
    using_prereqs=True,
    using_lessons=True,
    using_bias=True,
    learning_update_variance_constant=0.5,
)
model = models.EmbeddingModel(history, embedding_dimension, **model_options)

estimator_options = dict(
    regularization_constant=1e-3,
    using_scipy=True,
    verify_gradient=False,
    debug_mode_on=True,
    ftol=1e-3,
)
estimator = est.EmbeddingMAPEstimator(**estimator_options)

# Fit the model parameters with the estimator configured above.
model.fit(estimator)
In [ ]:
# Evaluate the fitted model on the same history it was trained on; the
# plot_roc_curve flag asks the evaluation helper to also plot the ROC curve.
training_auc = evaluate.training_auc(model, history, plot_roc_curve=True)

# Parenthesised single-argument print produces the same output on
# Python 2 and is valid syntax on Python 3.
print("Training AUC = %f" % training_auc)
In [ ]:
# Split the history by interaction type and keep the lookup that the
# per-student plots below index by student_id to find each student's last timestep.
split_history = history.split_interactions_by_type()
timestep_of_last_interaction = split_history.timestep_of_last_interaction
In [ ]:
NUM_STUDENTS_TO_SAMPLE = 10

# random.sample requires a true sequence on Python 3 (a NumPy array raises
# TypeError there), so materialize the unique ids as a list first. list()
# keeps the element types exactly as the array yields them.
student_ids = list(df['student_id'].unique())

# Never ask for more students than exist; random.sample raises ValueError
# when the sample size exceeds the population size.
num_to_sample = min(NUM_STUDENTS_TO_SAMPLE, len(student_ids))

# One figure per sampled student: each embedding dimension over time, plus
# the norm of the whole embedding.
for student_id in random.sample(student_ids, num_to_sample):
    student_idx = history.idx_of_student_id(student_id)

    # Timesteps 1..last interaction (inclusive) for this student.
    timesteps = list(range(1, timestep_of_last_interaction[student_id] + 1))

    for i in range(model.embedding_dimension):
        plt.plot(timesteps, model.student_embeddings[student_idx, i, timesteps],
                 label='Skill %d' % (i + 1))

    # With an integer and a list index separated by a slice, NumPy puts the
    # list-indexed axis first, so this selection is (timestep, dimension)
    # and axis=1 yields one norm per timestep.
    norms = np.linalg.norm(
        model.student_embeddings[student_idx, :, timesteps], axis=1)
    plt.plot(timesteps, norms, label='norm')

    plt.title('student_id = %s' % student_id)
    plt.xlabel('Timestep')
    plt.ylabel('Skill')
    plt.legend(loc='upper right')
    plt.show()
In [ ]:
# One Euclidean norm per assessment (axis 1 spans the embedding dimensions).
assessment_embeddings = model.assessment_embeddings
assessment_norms = np.linalg.norm(assessment_embeddings, axis=1)

# Distribution of those norms across assessments.
plt.xlabel('Assessment embedding norm')
plt.ylabel('Frequency (number of assessments)')
num_bins = 20
plt.hist(assessment_norms, bins=num_bins)
plt.show()
In [ ]:
def get_pass_rates(grouped):
    """
    Get pass rate for each group

    The pass rate is the fraction of a group's rows whose 'outcome' equals
    True. Rows with any other outcome (including missing values) count as
    not passed, and a group without any passing row gets a rate of 0.0.

    :param pd.GroupBy grouped: A grouped dataframe with an 'outcome' column
    :rtype: dict[str, float]
    :return: A dictionary mapping group name to pass rate
    """
    pass_rates = {}
    for name, group in grouped:
        # Count passing rows directly instead of looking up the label True
        # in value_counts(), which needs a special case when it is absent.
        num_passed = group['outcome'].eq(True).sum()
        # float() guarantees true division (and a float result on every
        # path) even where `from __future__ import division` is not active.
        pass_rates[name] = float(num_passed) / len(group)
    return pass_rates
In [ ]:
# Pass rate of every assessment module, computed over assessment interactions only.
is_assessment = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[is_assessment].groupby('module_id')
pass_rates = get_pass_rates(grouped)

# Fix a single ordering of the ids and derive both axes from it, so each
# point pairs a pass rate with the norm of the same assessment. Real lists
# are used because matplotlib cannot consume a Python 3 dict view.
assessment_ids = list(pass_rates)
assessment_pass_rates = [pass_rates[assessment_id]
                         for assessment_id in assessment_ids]
assessment_norms = [
    np.linalg.norm(
        model.assessment_embeddings[
            history.idx_of_assessment_id(assessment_id), :])
    for assessment_id in assessment_ids]

plt.xlabel('Assessment pass rate')
plt.ylabel('Assessment embedding norm')
plt.scatter(assessment_pass_rates, assessment_norms)
plt.show()
In [ ]:
# Pass rate of every assessment module, computed over assessment interactions only.
is_assessment = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[is_assessment].groupby('module_id')
pass_rates = get_pass_rates(grouped)

# Fix a single ordering of the ids and derive both axes from it, so each
# point pairs a pass rate with the statistic of the same assessment. Real
# lists are used because matplotlib cannot consume a Python 3 dict view.
assessment_ids = list(pass_rates)
assessment_pass_rates = [pass_rates[assessment_id]
                         for assessment_id in assessment_ids]

bias_minus_norm = []
for assessment_id in assessment_ids:
    # Look the index up once and reuse it for both the bias and the embedding.
    assessment_idx = history.idx_of_assessment_id(assessment_id)
    embedding_norm = np.linalg.norm(
        model.assessment_embeddings[assessment_idx, :])
    bias_minus_norm.append(
        model.assessment_biases[assessment_idx] - embedding_norm)

plt.xlabel('Assessment pass rate')
plt.ylabel('Assessment bias - Assessment embedding norm')
plt.scatter(assessment_pass_rates, bias_minus_norm)
plt.show()
In [ ]:
# Pass rate of every student, computed over assessment interactions only.
is_assessment = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[is_assessment].groupby('student_id')
pass_rates = get_pass_rates(grouped)

# Fix a single ordering of the ids and derive both axes from it, so each
# point pairs a pass rate with the bias of the same student. Real lists are
# used because matplotlib cannot consume a Python 3 dict view.
student_ids_with_rates = list(pass_rates)
student_pass_rates = [pass_rates[student_id]
                      for student_id in student_ids_with_rates]
biases = [model.student_biases[history.idx_of_student_id(student_id)]
          for student_id in student_ids_with_rates]

plt.xlabel('Student pass rate')
plt.ylabel('Student bias')
plt.scatter(student_pass_rates, biases)
plt.show()
In [ ]:
# Euclidean norm along axis 1 of the lesson embeddings
# (presumably one row per lesson -- TODO confirm against the model).
lesson_embeddings = model.lesson_embeddings
lesson_norms = np.linalg.norm(lesson_embeddings, axis=1)

# Distribution of those norms.
plt.xlabel('Lesson embedding norm')
plt.ylabel('Frequency (number of lessons)')
num_bins = 20
plt.hist(lesson_norms, bins=num_bins)
plt.show()
In [ ]:
# Euclidean norm along axis 1 of the prerequisite embeddings
# (presumably one row per lesson -- TODO confirm against the model).
prereq_embeddings = model.prereq_embeddings
prereq_norms = np.linalg.norm(prereq_embeddings, axis=1)

# Distribution of those norms.
plt.xlabel('Prereq embedding norm')
plt.ylabel('Frequency (number of lessons)')
num_bins = 20
plt.hist(prereq_norms, bins=num_bins)
plt.show()
In [ ]:
# Compare the two norms computed in the histogram cells above.
# The data order must match the axis labels: lesson norms on x, prereq norms
# on y (the arguments were previously passed the other way round, so the
# plot was mislabelled).
# NOTE(review): assumes lesson_norms and prereq_norms are aligned
# element-for-element -- confirm against the model.
plt.xlabel('Lesson embedding norm')
plt.ylabel('Prereq embedding norm')
plt.scatter(lesson_norms, prereq_norms)
plt.show()
In [ ]:
# Axis 2 of student_embeddings is the timestep axis.
num_timesteps = model.student_embeddings.shape[2]
timesteps = range(num_timesteps)

# Average the embeddings over axis 0 first, then take the norm of that
# averaged embedding at every timestep.
mean_student_embedding = np.mean(model.student_embeddings, axis=0)
avg_student_norms = np.array(np.linalg.norm(mean_student_embedding, axis=0))

plt.xlabel('Timestep')
plt.ylabel('Average student embedding norm')
plt.plot(timesteps, avg_student_norms)
plt.show()
In [ ]: