In [ ]:
from __future__ import division

import pickle
import os
import random

from matplotlib import pyplot as plt
from sklearn import metrics
import numpy as np

import seaborn as sns
sns.set_style('whitegrid')

from lentil import datatools
from lentil import models
from lentil import est

%matplotlib inline

In [ ]:
import logging
logging.getLogger().setLevel(logging.DEBUG)

In [ ]:
history_path = os.path.join('data', 'assistments_2009_2010_history.pkl')

In [ ]:
with open(history_path, 'rb') as f:
    history = pickle.load(f)

In [ ]:
def build_embedding(
    embedding_kwargs,
    estimator,
    history,
    filtered_history,
    split_history=None):
    
    model = models.EmbeddingModel(history, **embedding_kwargs)
    
    estimator.filtered_history = filtered_history
    if split_history is not None:
        estimator.split_history = split_history
    
    model.fit(estimator)
    
    return model

embedding_kwargs = {
    'embedding_dimension' : 2,
    'using_lessons' : True,
    'using_prereqs' : True,
    'using_bias' : True,
    'learning_update_variance_constant' : 0.5
}

estimator = est.EmbeddingMAPEstimator(
    regularization_constant=1e-3,
    using_scipy=True,
    verify_gradient=False,
    debug_mode_on=True,
    ftol=1e-3)

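As a quick point of reference before the sensitivity sweeps, build_embedding can be fit once on the unfiltered interaction log. The cell below is only a sketch (the names full_history, full_split_history, and baseline_model are introduced here purely for illustration); the analyses that follow always pass a filtered subset of the history.

In [ ]:
# sketch: fit a single baseline embedding on the complete, untruncated history
full_history = history.data
full_split_history = history.split_interactions_by_type(
    filtered_history=full_history)

baseline_model = build_embedding(
    embedding_kwargs,
    estimator,
    history,
    full_history,
    split_history=full_split_history)
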
In [ ]:
def compute_auc(y_trues, probas_preds, timesteps):
    """
    Compute AUC across timesteps, given true labels and predicted likelihoods
    
    :param dict[str, list[list[int]]] y_trues: 
        A dictionary mapping student_id to a list 
        (indexed by timestep) of lists of true labels (-1/1 for fail/pass)
        
    :param dict[str, list[list[float]]] probas_preds:
        A dictionary mapping student_id to a list
        (indexed by timestep) of lists of pass likelihoods
        
    :param np.array timesteps: A list of timesteps
    :rtype: list[float]
    :return: A list of AUCs (one for each timestep)
    """
    def get_auc(i):
        # pool true labels and predicted pass likelihoods across students at timestep i
        y_trues_of_timestep = [
            o for student_id, y_trues_of_student in y_trues.iteritems()
            for o in y_trues_of_student[i]]
        probas_preds_of_timestep = [
            p for student_id, y_trues_of_student in y_trues.iteritems()
            for p in probas_preds[student_id][i]]
        if not y_trues_of_timestep and not probas_preds_of_timestep:
            return None
        fpr, tpr, _ = metrics.roc_curve(
            y_trues_of_timestep, probas_preds_of_timestep)
        return metrics.auc(fpr, tpr)
    return [get_auc(i) for i in xrange(len(timesteps))]

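To make the expected input structure concrete, here is a tiny, purely hypothetical example: two held-out students and two timesteps, with labels and pass likelihoods made up for illustration only.

In [ ]:
# toy inputs for compute_auc (hypothetical values, for illustration only)
toy_y_trues = {
    'student_a': [[1, -1], [1]],
    'student_b': [[-1], [-1]]
}
toy_probas_preds = {
    'student_a': [[0.9, 0.4], [0.7]],
    'student_b': [[0.2], [0.3]]
}

# one AUC per timestep, pooled across students
compute_auc(toy_y_trues, toy_probas_preds, np.array([1, 2]))
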
In [ ]:
df = history.data
duration = history.duration()
num_students = history.num_students()

In [ ]:
# 80-20 training-validation split
num_left_out_students = int(0.2 * num_students)

How does validation AUC vary with the length of student histories?


In [ ]:
left_out_student_ids = {
    history.id_of_student_idx(student_idx)
    for student_idx in random.sample(range(num_students), num_left_out_students)}

In [ ]:
y_trues_for_length_analysis = {k:[] for k in left_out_student_ids}
probas_preds_for_length_analysis = {k:[] for k in left_out_student_ids}

In [ ]:
START_LENGTH = 3
END_LENGTH = duration
NUM_LENGTHS = 25

grouped = df.groupby('student_id')
left_out_ixns = df['student_id'].isin(left_out_student_ids)
left_in_ixns = ~left_out_ixns
lengths = np.arange(START_LENGTH, END_LENGTH, (END_LENGTH - START_LENGTH) // NUM_LENGTHS)

for i, t in enumerate(lengths):
    print '%d of %d' % (i, len(lengths))
    
    # keep all interactions for training students, but only the first t
    # timesteps of each held-out student's history
    filtered_history = df[(left_in_ixns) | (
            (left_out_ixns) & (df['timestep']<=t))]
    split_history = history.split_interactions_by_type(
            filtered_history=filtered_history)
    
    model = build_embedding(
        embedding_kwargs,
        estimator,
        history,
        filtered_history,
        split_history=split_history)
    
    for student_id in left_out_student_ids:
        test_ixns = grouped.get_group(student_id)
        test_ixns = test_ixns[(test_ixns['timestep']==t+1) & (
                test_ixns['module_type']==datatools.AssessmentInteraction.MODULETYPE)]
        if len(test_ixns)==0:
            y_trues_for_length_analysis[student_id].append([])
            probas_preds_for_length_analysis[student_id].append([])
            continue
        
        y_trues_for_length_analysis[student_id].append(list(test_ixns['outcome'].apply(
            lambda outcome: 1 if outcome else -1)))
        probas_preds_for_length_analysis[student_id].append(list(test_ixns.apply(
            model.assessment_pass_likelihood, axis=1)))

In [ ]:
aucs = compute_auc(
    y_trues_for_length_analysis, 
    probas_preds_for_length_analysis, 
    lengths)

In [ ]:
plt.title('Sensitivity to length of student history')
plt.xlabel('Length of student history (number of timesteps)')
plt.ylabel('Area under ROC Curve')
plt.plot(lengths, aucs)
plt.show()

How does validation AUC vary with the depth of student histories?


In [ ]:
# for each held-out student, find the smallest timestep greater than T at which
# they have assessment interactions; we validate on those assessment interactions
# and train on the window of timesteps that immediately precedes them, sweeping
# the window's depth (see the illustrative sketch after this cell)
T = 200
students_with_long_histories = df[df['timestep'] > T]['student_id'].unique()
if num_left_out_students < len(students_with_long_histories):
    left_out_student_ids = random.sample(
        students_with_long_histories, num_left_out_students)
else:
    left_out_student_ids = list(students_with_long_histories)

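To illustrate the truncation scheme with hypothetical numbers (not drawn from the data): for a held-out student whose first assessment interaction after timestep T=200 falls at timestep 213, the truncation point is 212, so at depth t the training window for that student covers timesteps 212-t through 212, and validation uses the assessment interactions at timestep 213.

In [ ]:
# hypothetical illustration of the truncation arithmetic used below
example_first_assessment_after_T = 213  # made-up timestep for one student
example_truncation = example_first_assessment_after_T - 1  # 212
example_depth = 50  # one of the depths swept below
example_training_window = (
    example_truncation - example_depth, example_truncation)  # (162, 212)
print example_training_window
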
In [ ]:
y_trues_for_depth_analysis = {k:[] for k in left_out_student_ids}
probas_preds_for_depth_analysis = {k:[] for k in left_out_student_ids}

In [ ]:
truncations = {}
grouped = df.groupby('student_id')
# iterate over a copy so students can be dropped from left_out_student_ids as we go
for student_id in list(left_out_student_ids):
    test_ixns = grouped.get_group(student_id)
    test_ixns = test_ixns[(test_ixns['timestep']>=T+1) & (
            test_ixns['module_type']==datatools.AssessmentInteraction.MODULETYPE)]
    try:
        # truncate just before the student's first assessment interaction after T
        truncations[student_id] = min(test_ixns['timestep'])-1
    except ValueError:
        # no assessment interactions after timestep T, so drop this student
        left_out_student_ids.remove(student_id)
        del y_trues_for_depth_analysis[student_id]
        del probas_preds_for_depth_analysis[student_id]

In [ ]:
START_DEPTH = 3
END_DEPTH = T
NUM_DEPTHS = 25

left_out_ixns = df['student_id'].isin(left_out_student_ids)
grouped = df.groupby('student_id')
depths = np.arange(START_DEPTH, END_DEPTH, (END_DEPTH - START_DEPTH) // NUM_DEPTHS)
left_in_ixns = ~left_out_ixns

for i, t in enumerate(depths):
    print '%d of %d' % (i, len(depths))
    
    # map each interaction to its student's truncation timestep
    # (NaN for students that were not held out)
    truncations_series = df['student_id'].map(truncations)
    
    # keep all interactions for training students, and the window of timesteps
    # [truncation - t, truncation] for each held-out student
    filtered_history = df[(left_in_ixns) | (left_out_ixns & (
                df['timestep']<=truncations_series) & (
                df['timestep']>=truncations_series-t))]
    split_history = history.split_interactions_by_type(
            filtered_history=filtered_history)
    
    model = build_embedding(
        embedding_kwargs,
        estimator,
        history,
        filtered_history,
        split_history=split_history)
    
    for student_id in left_out_student_ids:
        test_ixns = grouped.get_group(student_id)
        test_ixns = test_ixns[(
                test_ixns['timestep']==truncations[student_id]+1) & (
                test_ixns['module_type']==datatools.AssessmentInteraction.MODULETYPE)]
        if len(test_ixns)==0:
            y_trues_for_depth_analysis[student_id].append([])
            probas_preds_for_depth_analysis[student_id].append([])
            continue
        
        y_trues_for_depth_analysis[student_id].append(list(test_ixns['outcome'].apply(
            lambda outcome: 1 if outcome else -1)))
        probas_preds_for_depth_analysis[student_id].append(list(test_ixns.apply(
            model.assessment_pass_likelihood, axis=1)))

In [ ]:
aucs = compute_auc(
    y_trues_for_depth_analysis, 
    probas_preds_for_depth_analysis, 
    depths)

In [ ]:
plt.title('Sensitivity to depth of student history')
plt.xlabel('Depth of student history (number of timesteps)')
plt.ylabel('Area under ROC Curve')
plt.plot(depths, aucs)
plt.show()

How does validation AUC vary with the number of full student histories?


In [ ]:
left_out_student_ids = {
    history.id_of_student_idx(student_idx)
    for student_idx in random.sample(range(num_students), num_left_out_students)}

In [ ]:
y_trues_for_tset_size_analysis = {k:[] for k in left_out_student_ids}
probas_preds_for_tset_size_analysis = {k:[] for k in left_out_student_ids}

In [ ]:
# training set size = fraction of full student histories
START_TRAINSET_SIZE = 0.1
END_TRAINSET_SIZE = 1.0
NUM_TRAINSET_SIZES = 25
tset_sizes = np.arange(START_TRAINSET_SIZE, END_TRAINSET_SIZE, (END_TRAINSET_SIZE - START_TRAINSET_SIZE) / NUM_TRAINSET_SIZES)

not_in_beginning = df['timestep'] > 2
is_assessment_ixn = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
left_out = df['student_id'].isin(left_out_student_ids)
grouped = df[not_in_beginning & is_assessment_ixn & left_out].groupby('student_id')
        
# validate on each held-out student's last assessment interaction (after the
# first two timesteps) and truncate their training history just before it
student_cut_loc = grouped.timestep.max() - 1
student_cut_loc.name = 'student_cut_loc'

# map each interaction to its student's cut location (NaN for training students)
truncations = df.join(student_cut_loc, on='student_id')['student_cut_loc']

left_in_students = [x for x in history.iter_students() if x not in left_out_student_ids]

In [ ]:
for i, t in enumerate(tset_sizes):
    print '%d of %d' % (i, len(tset_sizes))
    
    tset = set(left_in_students[:int(t*len(left_in_students))])
    left_in = df['student_id'].isin(tset)
    
    # training set: full histories for a fraction t of the training students,
    # plus truncated histories for the held-out students
    filtered_history = df[(left_in) | (
            left_out & (df['timestep']<=truncations))]
    
    # restrict validation to assessments that appear in the training set
    train_assessments = set(
        filtered_history[filtered_history['module_type']==datatools.AssessmentInteraction.MODULETYPE]['module_id'].unique())
    
    split_history = history.split_interactions_by_type(
            filtered_history=filtered_history)
    
    model = build_embedding(
        embedding_kwargs,
        estimator,
        history,
        filtered_history,
        split_history=split_history)
    
    for student_id in left_out_student_ids:
        test_ixns = grouped.get_group(student_id)
        test_t = student_cut_loc.loc[student_id]+1
        test_ixns = test_ixns[(test_ixns['timestep']==test_t) & (
                test_ixns['module_id'].isin(train_assessments))]
        if len(test_ixns)==0:
            y_trues_for_tset_size_analysis[student_id].append([])
            probas_preds_for_tset_size_analysis[student_id].append([])
            continue
        
        y_trues_for_tset_size_analysis[student_id].append(list(test_ixns['outcome'].apply(
            lambda outcome: 1 if outcome else -1)))
        probas_preds_for_tset_size_analysis[student_id].append(list(test_ixns.apply(
            model.assessment_pass_likelihood, axis=1)))

In [ ]:
aucs = compute_auc(
    y_trues_for_tset_size_analysis, 
    probas_preds_for_tset_size_analysis, 
    tset_sizes)

In [ ]:
plt.title('Sensitivity to number of full student histories')
plt.xlabel('Number of full student histories in training set')
plt.ylabel('Area under ROC Curve')
plt.plot(tset_sizes * len(left_in_students), aucs)
plt.show()

How does validation AUC vary with the level of noise in the training outcomes?


In [ ]:
left_out_student_ids = {
    history.id_of_student_idx(student_idx)
    for student_idx in random.sample(range(num_students), num_left_out_students)}

In [ ]:
y_trues_for_noise_analysis = {k:[] for k in left_out_student_ids}
probas_preds_for_noise_analysis = {k:[] for k in left_out_student_ids}

In [ ]:
delta = 0.05
max_noise = 1.
noise_levels = np.arange(0, max_noise + delta, delta)

not_in_beginning = df['timestep'] > 2
is_assessment_ixn = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
left_out = df['student_id'].isin(left_out_student_ids)
grouped = df[not_in_beginning & is_assessment_ixn & left_out].groupby('student_id')
        
# validate on each held-out student's last assessment interaction (after the
# first two timesteps) and truncate their training history just before it
student_cut_loc = grouped.timestep.max() - 1
student_cut_loc.name = 'student_cut_loc'

# map each interaction to its student's cut location (NaN for training students)
truncations = df.join(student_cut_loc, on='student_id')['student_cut_loc']

left_in_students = [x for x in history.iter_students() if x not in left_out_student_ids]
left_in = df['student_id'].isin(left_in_students)

# base training set (before any noise is added): full histories for training
# students, truncated histories for held-out students
filtered_history = df[(left_in) | (
        left_out & (df['timestep']<=truncations))]

# restrict validation to assessments that appear in the training set
train_assessments = set(
        filtered_history[filtered_history['module_type']==datatools.AssessmentInteraction.MODULETYPE]['module_id'].unique())

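The noise model used in the loop below flips each observed binary outcome independently with probability p, leaving missing outcomes (None) untouched. Here is a tiny, self-contained illustration of the flip rule on made-up outcomes (flip_outcome is a hypothetical helper introduced only for this demonstration):

In [ ]:
# hypothetical demonstration of the outcome-flipping rule (illustration only)
def flip_outcome(x, p):
    # missing outcomes stay missing; observed outcomes flip with probability p
    return x if x is None else (not x if random.random() < p else x)

print [flip_outcome(x, 0.5) for x in [True, False, None, True, True]]
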
In [ ]:
for i, p in enumerate(noise_levels):
    print '%d of %d' % (i, len(noise_levels))
    
    # start from a fresh copy of the noise-free training set, then flip each
    # observed outcome independently with probability p (missing outcomes are
    # left untouched)
    filtered_history = df[(left_in) | (
            left_out & (df['timestep']<=truncations))].copy()
    
    filtered_history['outcome'] = filtered_history['outcome'].apply(
        lambda x: x if x is None else (not x if random.random() < p else x))
    
    split_history = history.split_interactions_by_type(
            filtered_history=filtered_history)

    model = build_embedding(
        embedding_kwargs,
        estimator,
        history,
        filtered_history,
        split_history=split_history)
    
    for student_id in left_out_student_ids:
        test_ixns = grouped.get_group(student_id)
        test_t = student_cut_loc.loc[student_id]+1
        test_ixns = test_ixns[(test_ixns['timestep']==test_t) & (
                test_ixns['module_id'].isin(train_assessments))]
        if len(test_ixns)==0:
            y_trues_for_noise_analysis[student_id].append([])
            probas_preds_for_noise_analysis[student_id].append([])
            continue
        
        y_trues_for_noise_analysis[student_id].append(list(test_ixns['outcome'].apply(
            lambda outcome: 1 if outcome else -1)))
        probas_preds_for_noise_analysis[student_id].append(list(test_ixns.apply(
            model.assessment_pass_likelihood, axis=1)))

In [ ]:
aucs = compute_auc(
    y_trues_for_noise_analysis, 
    probas_preds_for_noise_analysis, 
    noise_levels)

In [ ]:
plt.title('Sensitivity to noise in outcomes')
plt.xlabel('Probability of flipping an outcome')
plt.ylabel('Area under ROC Curve')
plt.plot(noise_levels, aucs)
plt.show()

In [ ]: