In [ ]:
    
from __future__ import division
import pickle
import os
import random
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')
from lentil import datatools
from lentil import models
from lentil import est
from lentil import evaluate
%matplotlib inline
    
In [ ]:
    
import logging
# Lower the root logger's threshold to DEBUG.
logging.getLogger().setLevel(level=logging.DEBUG)
    
Load an interaction history
In [ ]:
    
# Relative path of the pickled interaction history that is opened further down.
history_path = os.path.join(
    'data',                       # directory component
    'assistments_2009_2010.pkl',  # pickle file name
)
    
In [ ]:
    
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# history files that come from a trusted source.
with open(history_path, 'rb') as history_file:
    history = pickle.load(history_file)
    
In [ ]:
    
# Tabular view of the interaction log; later cells read its 'student_id',
# 'module_type', 'module_id' and 'outcome' columns.
df = history.data
    
Train an embedding model on the interaction history and visualize the results
In [ ]:
    
# Number of latent dimensions handed to the embedding model.
embedding_dimension = 2

# Embedding model over the loaded history with the prereq, lesson and bias
# options all switched on.
model = models.EmbeddingModel(
    history,
    embedding_dimension,
    learning_update_variance_constant=0.5,
    using_bias=True,
    using_lessons=True,
    using_prereqs=True)

# MAP estimator settings used for this fit.
estimator = est.EmbeddingMAPEstimator(
    ftol=1e-3,
    regularization_constant=1e-3,
    using_scipy=True,
    debug_mode_on=True,
    verify_gradient=False)

# The fitted parameters are read back off `model` in the cells that follow.
model.fit(estimator)
    
In [ ]:
    
# Report the AUC on the training interactions; plot_roc_curve=True also asks
# the evaluation helper to draw the ROC curve.
print("Training AUC = %f" % evaluate.training_auc(
    model, history, plot_roc_curve=True))
    
In [ ]:
    
# timestep_of_last_interaction is looked up by student_id further down to decide
# how many timesteps of each student's embedding trajectory to plot.
split_history = history.split_interactions_by_type()
timestep_of_last_interaction = split_history.timestep_of_last_interaction
    
In [ ]:
    
NUM_STUDENTS_TO_SAMPLE = 10

# Draw a random subset of students and plot, for each of them, how every
# component of the student embedding (and the embedding's norm) changes over
# the student's timesteps.
#
# random.sample expects a sequence and raises ValueError when asked for more
# items than the population holds, so the unique ids are materialised as a
# list and the sample size is capped at the number of available students.
student_ids = list(df['student_id'].unique())
num_sampled = min(NUM_STUDENTS_TO_SAMPLE, len(student_ids))
for student_id in random.sample(student_ids, num_sampled):
    student_idx = history.idx_of_student_id(student_id)
    # Timesteps 1..last interaction (inclusive); timestep 0 is not plotted.
    timesteps = list(range(1, timestep_of_last_interaction[student_id]+1))
    for i in range(model.embedding_dimension):
        plt.plot(timesteps, model.student_embeddings[student_idx, i, timesteps],
                 label='Skill %d' % (i+1))

    # Assuming student_embeddings is a 3-D numpy array: mixing an integer, a
    # slice and a list index puts the list-indexed (timestep) axis first, so
    # each row is one timestep and axis=1 takes the norm across dimensions.
    norms = np.linalg.norm(model.student_embeddings[student_idx, :, timesteps], axis=1)
    plt.plot(timesteps, norms, label='norm')

    plt.title('student_id = %s' % student_id)
    plt.xlabel('Timestep')
    plt.ylabel('Skill')
    plt.legend(loc='upper right')
    plt.show()
    
In [ ]:
    
# One norm per row of the assessment embedding matrix.
assessment_norms = np.linalg.norm(model.assessment_embeddings, axis=1)

# Distribution of those norms across assessments.
plt.hist(assessment_norms, bins=20)
plt.ylabel('Frequency (number of assessments)')
plt.xlabel('Assessment embedding norm')
plt.show()
    
In [ ]:
    
def get_pass_rates(grouped):
    """
    Get pass rate for each group

    The pass rate of a group is the number of its rows whose 'outcome' is
    True divided by the total number of rows in the group. A group without
    any passing row gets 0.0, so every value in the result is a float.

    :param pd.GroupBy grouped: A grouped dataframe with an 'outcome' column
    :rtype: dict[str, float]
    :return: A dictionary mapping group name to pass rate
    """
    pass_rates = {}
    for name, group in grouped:
        vc = group['outcome'].value_counts()
        # value_counts() only lists outcomes that actually occur, so a group
        # without a single pass has no True entry to look up.
        if True in vc:
            # float() makes the ratio a true division regardless of whether
            # the caller enabled `from __future__ import division`.
            pass_rates[name] = float(vc[True]) / len(group)
        else:
            pass_rates[name] = 0.0
    return pass_rates
    
In [ ]:
    
# Keep only assessment interactions and group them per assessment module.
assessment_mask = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[assessment_mask].groupby('module_id')
pass_rates = get_pass_rates(grouped)

# Embedding norm of each assessment, collected in the iteration order of the
# pass_rates dict so the i-th norm lines up with the i-th pass rate.
assessment_norms = []
for assessment_id in pass_rates:
    assessment_idx = history.idx_of_assessment_id(assessment_id)
    assessment_norms.append(
        np.linalg.norm(model.assessment_embeddings[assessment_idx, :]))

plt.scatter(list(pass_rates.values()), assessment_norms)
plt.xlabel('Assessment pass rate')
plt.ylabel('Assessment embedding norm')
plt.show()
    
In [ ]:
    
# Keep only assessment interactions and group them per assessment module.
assessment_mask = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[assessment_mask].groupby('module_id')
pass_rates = get_pass_rates(grouped)

# For every assessment: its bias term minus the norm of its embedding, in the
# iteration order of the pass_rates dict so both scatter inputs stay aligned.
bias_minus_norm = []
for assessment_id in pass_rates:
    assessment_idx = history.idx_of_assessment_id(assessment_id)
    assessment_bias = model.assessment_biases[assessment_idx]
    embedding_norm = np.linalg.norm(model.assessment_embeddings[assessment_idx, :])
    bias_minus_norm.append(assessment_bias - embedding_norm)

plt.scatter(list(pass_rates.values()), bias_minus_norm)
plt.xlabel('Assessment pass rate')
plt.ylabel('Assessment bias - Assessment embedding norm')
plt.show()
    
In [ ]:
    
# Keep only assessment interactions, this time grouped per student.
assessment_mask = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
grouped = df[assessment_mask].groupby('student_id')
pass_rates = get_pass_rates(grouped)

# Bias term of each student, in the iteration order of the pass_rates dict so
# the i-th bias lines up with the i-th pass rate.
biases = []
for student_id in pass_rates:
    student_idx = history.idx_of_student_id(student_id)
    biases.append(model.student_biases[student_idx])

plt.scatter(list(pass_rates.values()), biases)
plt.xlabel('Student pass rate')
plt.ylabel('Student bias')
plt.show()
    
In [ ]:
    
# One norm per row of the lesson embedding matrix.
lesson_norms = np.linalg.norm(model.lesson_embeddings, axis=1)

# Distribution of those norms across lessons.
plt.hist(lesson_norms, bins=20)
plt.ylabel('Frequency (number of lessons)')
plt.xlabel('Lesson embedding norm')
plt.show()
    
In [ ]:
    
# One norm per row of the prereq embedding matrix.
prereq_norms = np.linalg.norm(model.prereq_embeddings, axis=1)

# Distribution of those norms.
plt.hist(prereq_norms, bins=20)
plt.ylabel('Frequency (number of lessons)')
plt.xlabel('Prereq embedding norm')
plt.show()
    
In [ ]:
    
# Plot each lesson embedding norm against the matching prereq embedding norm
# (assumes row i of both embedding matrices refers to the same lesson).
# pyplot.scatter takes (x, y): lesson norms go first to agree with the x-axis
# label and prereq norms second to agree with the y-axis label.
plt.xlabel('Lesson embedding norm')
plt.ylabel('Prereq embedding norm')
plt.scatter(lesson_norms, prereq_norms)
plt.show()
    
In [ ]:
    
# The third axis of student_embeddings is indexed by timestep.
timesteps = range(model.student_embeddings.shape[2])

# Average the embeddings over the first (student) axis, then take the norm of
# that mean embedding at every timestep.
mean_student_embedding = np.mean(model.student_embeddings, axis=0)
avg_student_norms = np.array(np.linalg.norm(mean_student_embedding, axis=0))

plt.plot(timesteps, avg_student_norms)
plt.xlabel('Timestep')
plt.ylabel('Average student embedding norm')
plt.show()
    
In [ ]: