In [ ]:
from __future__ import division

import pickle
import os

from matplotlib import pyplot as plt
from scipy import linalg
import numpy as np
import networkx as nx

import seaborn as sns
sns.set_style('whitegrid')

from lentil import datatools

%matplotlib inline

In [ ]:
import logging
logging.getLogger().setLevel(logging.DEBUG)

In [ ]:
history_path = os.path.join('data', 'grockit_history.pkl')

In [ ]:
# load history from file
with open(history_path, 'rb') as f:
    history = pickle.load(f)

In [ ]:
df = history.data
idx_of_module_id = {k: i for i, k in enumerate(df['module_id'].unique())}
num_modules = len(idx_of_module_id)
print "Number of unique modules = %d" % num_modules

In [ ]:
# compute adjacency matrix of flow graph

# a student's history can contain the same module id
# multiple times (for both assessment and lesson interactions)
IGNORE_REPEATED_MODULE_IDS = True

X = np.zeros((num_modules, num_modules))
grouped = df.groupby('student_id')['module_id']
for student_id, group in grouped:
    module_idxes = group.map(idx_of_module_id).values
    
    if IGNORE_REPEATED_MODULE_IDS:
        # keep only the first occurrence of each module id,
        # preserving first-visit order
        filtered_module_idxes = []
        module_idxes_seen = set()
        for module_idx in module_idxes:
            if module_idx in module_idxes_seen:
                continue
            filtered_module_idxes.append(module_idx)
            module_idxes_seen.add(module_idx)
        module_idxes = np.array(filtered_module_idxes)
    
    # okay because module transitions are never repeated in this dataset
    # if that's not true, then use np.add.at
    # http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.at.html
    X[module_idxes[:-1], module_idxes[1:]] += 1
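
The fancy-indexed += above applies each distinct (source, destination) index pair only once per student, which is fine here because, as noted in the comment, module transitions are never repeated in this data set. For data sets where a student can repeat a transition, a minimal sketch of the np.add.at variant mentioned in the comment (not used in the original analysis):

In [ ]:
# sketch: accumulate transition counts with np.add.at, which
# correctly applies repeated (source, destination) index pairs
X_alt = np.zeros((num_modules, num_modules))
for _, group in df.groupby('student_id')['module_id']:
    module_idxes = group.map(idx_of_module_id).values
    np.add.at(X_alt, (module_idxes[:-1], module_idxes[1:]), 1)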

In [ ]:
# is the Markov chain ergodic?
# i.e., is the flow graph strongly connected?
G = nx.from_numpy_matrix(X, create_using=nx.DiGraph())

In [ ]:
sc = nx.strongly_connected_components(G)

In [ ]:
print "Sizes of strongly connected components:"
print [len(x) for x in sc]
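
If the sizes above show more than one nontrivial component, the chain is not ergodic and the stationary distribution computed below is not well-defined on the full graph. One possible remedy (a sketch, not part of the original analysis) is to restrict the counts to the largest strongly connected component:

In [ ]:
# sketch: restrict the flow graph to its largest strongly
# connected component before normalizing into P
largest_scc = max(nx.strongly_connected_components(G), key=len)
scc_idxes = np.array(sorted(largest_scc))
X_scc = X[np.ix_(scc_idxes, scc_idxes)]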

In [ ]:
# compute transition probability matrix of Markov chain
# (assumes every module has at least one outgoing transition,
# i.e., every row of X has a positive sum)
P = X / X.sum(axis=1)[:, np.newaxis]
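
If some module had no outgoing transitions, its row sum would be zero and the division above would produce NaNs. A defensive variant (a sketch, assuming zero rows should simply be left as all zeros):

In [ ]:
# sketch: leave rows with no outgoing transitions as all zeros
# instead of dividing by zero
row_sums = X.sum(axis=1)
P_safe = np.zeros_like(X)
nonzero = row_sums > 0
P_safe[nonzero] = X[nonzero] / row_sums[nonzero][:, np.newaxis]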

In [ ]:
# estimate the stationary distribution of the Markov chain:
# for an ergodic chain, every row of P^n converges to the
# stationary distribution, so the diagonal of a high power
# of P is a convenient estimate
stationary_distrn = np.diag(np.linalg.matrix_power(P, 2**15))
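
As a cross-check, the stationary distribution is also the left eigenvector of P with eigenvalue 1. A sketch using the scipy.linalg import above (not part of the original pipeline):

In [ ]:
# sketch: stationary distribution as the left eigenvector of P
# associated with the eigenvalue closest to 1
evals, evecs = linalg.eig(P.T)
v = np.real(evecs[:, np.argmin(np.abs(evals - 1))])
stationary_eig = v / v.sum()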

In [ ]:
# check convergence by repeated squaring: after iteration i,
# prev holds P^(2^(i+1)), and diffs[i] measures how much the
# diagonal moved between P^(2^i) and P^(2^(i+1))
prev = P
N = 15
diffs = [None] * N
for i in xrange(N):
    nP = np.dot(prev, prev)
    diffs[i] = np.linalg.norm(np.diag(nP) - np.diag(prev))
    prev = nP

In [ ]:
plt.xlabel('n')
plt.ylabel('||diag(P^(2n)) - diag(P^n)||')
plt.plot(2**np.arange(0, N, 1), diffs, '-s')
plt.yscale('log')
plt.xscale('log')
plt.show()

In [ ]:
# entropy rate of the Markov chain:
# H = -sum_i pi_i sum_j P_ij log(P_ij)
# (nansum treats the 0*log(0) terms as 0)
entropy = -np.dot(stationary_distrn, np.nansum(P*np.log(P), axis=1))
print "Entropy = %f" % entropy

In [ ]:
output_path = os.path.join('results', 'entropy', 'grockit_entropy.pkl')

In [ ]:
with open(output_path, 'wb') as f:
    pickle.dump(entropy, f, pickle.HIGHEST_PROTOCOL)

Compare path entropy to gains from lesson prereq model


In [ ]:
data_sets = ['assistments_2009_2010', 'algebra_2006_2007', 
             'algebra_2005_2006', 'bridge_to_algebra_2006_2007', 'grockit']

In [ ]:
entropy_file_of_data_set = {k: os.path.join(
        'results', 'entropy', '%s_entropy.pkl' % k) for k in data_sets}

In [ ]:
results_file_of_data_set = {k: os.path.join(
        'results', 'last', '%s_results_lesion.pkl' % k) for k in data_sets}

In [ ]:
# load path entropies and prereq-model evaluation results for each data set
entropies_of_models, results_of_models = [], []
for ds in data_sets:
    with open(entropy_file_of_data_set[ds], 'rb') as f:
        entropies_of_models.append(pickle.load(f))
    with open(results_file_of_data_set[ds], 'rb') as f:
        results_of_models.append(pickle.load(f))

In [ ]:
def make_plot(eps=1e-2):
    # scatter path entropy against prereq-model gain for each data set;
    # relies on the globals name_of_gain_metric and
    # compute_gain_from_prereq_model defined in the cells below
    gains_of_models = [compute_gain_from_prereq_model(results) for results in results_of_models]

    plt.xlabel('Entropy of student paths')
    plt.ylabel(name_of_gain_metric)
    plt.scatter(entropies_of_models, gains_of_models)
    for e, g, ds in zip(entropies_of_models, gains_of_models, data_sets):
        plt.annotate(ds, (e+eps, g+eps))
    plt.show()

In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model'

def compute_gain_from_prereq_model(res):
    # average the relative AUC improvement from adding prereqs,
    # both without the bias term (c vs. a) and with it (d vs. b)
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return np.mean([(c-a)/a, (d-b)/b])

make_plot()
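
To make the metric concrete, here is the arithmetic on hypothetical AUC values (purely illustrative numbers, not from any of the data sets):

In [ ]:
# sketch: relative AUC gain with hypothetical values
a, b = 0.70, 0.72  # without prereqs (without / with bias)
c, d = 0.71, 0.73  # with prereqs (without / with bias)
print np.mean([(c - a) / a, (d - b) / b])  # ~0.0141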

In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model (without bias)'

def compute_gain_from_prereq_model(res):
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    return (c-a)/a

make_plot()

In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model (with bias)'

def compute_gain_from_prereq_model(res):
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return (d-b)/b

make_plot(eps=1e-3)

In [ ]:
name_of_gain_metric = 'AUC gain from prereq model'

def compute_gain_from_prereq_model(res):
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return np.mean([c-a, d-b])

make_plot()
