In [ ]:
from __future__ import division
import pickle
import os
from matplotlib import pyplot as plt
from scipy import linalg
import numpy as np
import networkx as nx
import seaborn as sns
sns.set_style('whitegrid')
from lentil import datatools
%matplotlib inline
In [ ]:
import logging
logging.getLogger().setLevel(logging.DEBUG)
In [ ]:
history_path = os.path.join('data', 'grockit_history.pkl')
In [ ]:
# load history from file
with open(history_path, 'rb') as f:
    history = pickle.load(f)
In [ ]:
df = history.data
idx_of_module_id = {k: i for i, k in enumerate(df['module_id'].unique())}
num_modules = len(idx_of_module_id)
print("Number of unique modules = %d" % num_modules)
In [ ]:
# compute adjacency matrix of flow graph
# sometimes a student history contains a module id
# multiple times (for assessment and lesson interactions)
IGNORE_REPEATED_MODULE_IDS = True
X = np.zeros((num_modules, num_modules))
grouped = df.groupby('student_id')['module_id']
for student_id, group in grouped:
    module_idxes = group.map(idx_of_module_id).values
    if IGNORE_REPEATED_MODULE_IDS:
        # keep only the first occurrence of each module id,
        # preserving the order of the student's path
        filtered_module_idxes = []
        module_idxes_seen = set()
        for module_idx in module_idxes:
            if module_idx in module_idxes_seen:
                continue
            filtered_module_idxes.append(module_idx)
            module_idxes_seen.add(module_idx)
        module_idxes = np.array(filtered_module_idxes)
    # fancy indexing is okay here because module transitions are never
    # repeated within a student path in this dataset; if that's not true,
    # use np.add.at (see the sketch below)
    # http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.at.html
    X[module_idxes[:-1], module_idxes[1:]] += 1
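As the comment above notes, in-place fancy indexing counts each distinct (i, j) pair at most once per statement, even if the pair appears multiple times in the index arrays. If a student path could repeat a transition, np.add.at accumulates the duplicates correctly; a minimal sketch with a toy path (not part of the analysis above):
In [ ]:
# toy path that traverses the transition 0 -> 1 twice
path = np.array([0, 1, 0, 1])

X_fancy = np.zeros((2, 2))
X_fancy[path[:-1], path[1:]] += 1          # duplicate (0, 1) counted only once
print(X_fancy[0, 1])                       # 1.0

X_at = np.zeros((2, 2))
np.add.at(X_at, (path[:-1], path[1:]), 1)  # duplicates accumulate
print(X_at[0, 1])                          # 2.0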
In [ ]:
# is the Markov chain ergodic?
# i.e., is the flow graph strongly connected?
# (on networkx >= 3.0, use nx.from_numpy_array instead of from_numpy_matrix)
G = nx.from_numpy_matrix(X, create_using=nx.DiGraph())
In [ ]:
sc = nx.strongly_connected_components(G)
In [ ]:
print "Sizes of strongly connected components:"
print [len(x) for x in sc]
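If the flow graph is not strongly connected, the chain is not ergodic and the power-iteration estimate below is not guaranteed to converge to a unique stationary distribution. A sketch of one common workaround, restricting the chain to the largest strongly connected component (the X_scc name is ours, for illustration):
In [ ]:
# check ergodicity directly
print(nx.is_strongly_connected(G))

# restrict the adjacency matrix to the largest strongly connected component
largest_scc = max(nx.strongly_connected_components(G), key=len)
scc_idxes = np.array(sorted(largest_scc))
X_scc = X[np.ix_(scc_idxes, scc_idxes)]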
In [ ]:
# compute transition probability matrix of Markov chain
# P[i, j] = Pr(next module = j | current module = i)
P = X / X.sum(axis=1)[:, np.newaxis]
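Note that any module with no outgoing transitions has a zero row sum, so the division above produces a row of NaNs. A guarded variant, assuming such rows should simply remain all-zero:
In [ ]:
row_sums = X.sum(axis=1)
P_safe = np.divide(X, row_sums[:, np.newaxis],
                   out=np.zeros_like(X),
                   where=row_sums[:, np.newaxis] != 0)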
In [ ]:
# estimate stationary distribution of Markov chain
# for an ergodic chain, P^n converges to a matrix whose rows all equal
# the stationary distribution, so the diagonal of P^(2^15) recovers it
stationary_distrn = np.diag(np.linalg.matrix_power(P, 2**15))
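As a sanity check, the stationary distribution is also the left eigenvector of P with eigenvalue 1, which scipy.linalg (imported above) can compute directly; a sketch:
In [ ]:
# left eigenvectors satisfy v^T P = lambda * v^T
vals, vecs = linalg.eig(P, left=True, right=False)
pi = np.real(vecs[:, np.argmin(np.abs(vals - 1))])
pi /= pi.sum()  # eigenvectors are only defined up to scale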
In [ ]:
prev = P
N = 15
diffs = [None] * N
for i in range(N):
    nP = np.dot(prev, prev)
    diffs[i] = np.linalg.norm(np.diag(nP) - np.diag(prev))
    prev = nP
In [ ]:
plt.xlabel('n')
plt.ylabel('||diag(P^(2n)) - diag(P^n)||')
plt.plot(2**np.arange(0, N, 1), diffs, '-s')
plt.yscale('log')
plt.xscale('log')
plt.show()
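The next cell computes the entropy rate of the Markov chain, $H = -\sum_i \pi_i \sum_j P_{ij} \log P_{ij}$, where $\pi$ is the stationary distribution estimated above; np.nansum treats the $0 \log 0$ terms (where $P_{ij} = 0$) as zero.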
In [ ]:
entropy = -np.dot(stationary_distrn, np.nansum(P*np.log(P), axis=1))
print("Entropy = %f" % entropy)
In [ ]:
output_path = os.path.join('results', 'entropy', 'grockit_entropy.pkl')
In [ ]:
with open(output_path, 'wb') as f:
    pickle.dump(entropy, f, pickle.HIGHEST_PROTOCOL)
Compare path entropy to gains from lesson prereq model
In [ ]:
data_sets = ['assistments_2009_2010', 'algebra_2006_2007',
             'algebra_2005_2006', 'bridge_to_algebra_2006_2007', 'grockit']
In [ ]:
entropy_file_of_data_set = {k: os.path.join(
    'results', 'entropy', '%s_entropy.pkl' % k) for k in data_sets}
In [ ]:
results_file_of_data_set = {k: os.path.join(
    'results', 'last', '%s_results_lesion.pkl' % k) for k in data_sets}
In [ ]:
entropies_of_models, results_of_models = [], []
for ds in data_sets:
    with open(entropy_file_of_data_set[ds], 'rb') as f:
        entropies_of_models.append(pickle.load(f))
    with open(results_file_of_data_set[ds], 'rb') as f:
        results_of_models.append(pickle.load(f))
In [ ]:
# make_plot reads name_of_gain_metric and compute_gain_from_prereq_model
# from the global scope, so redefining them below changes what gets plotted
def make_plot(eps=1e-2):
    gains_of_models = [compute_gain_from_prereq_model(results) for results in results_of_models]
    plt.xlabel('Entropy of student paths')
    plt.ylabel(name_of_gain_metric)
    plt.scatter(entropies_of_models, gains_of_models)
    for e, g, ds in zip(entropies_of_models, gains_of_models, data_sets):
        plt.annotate(ds, (e+eps, g+eps))
    plt.show()
In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model'
def compute_gain_from_prereq_model(res):
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return np.mean([(c-a)/a, (d-b)/b])
make_plot()
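For intuition, with hypothetical numbers: if the model without prereqs reaches a validation AUC of 0.70 and the prereq model reaches 0.72, the relative gain is (0.72 - 0.70)/0.70 ≈ 0.029.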
In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model (without bias)'
def compute_gain_from_prereq_model(res):
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    return (c-a)/a
make_plot()
In [ ]:
name_of_gain_metric = 'Relative AUC gain from prereq model (with bias)'
def compute_gain_from_prereq_model(res):
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return (d-b)/b
make_plot(eps=1e-3)
In [ ]:
name_of_gain_metric = 'AUC gain from prereq model'
def compute_gain_from_prereq_model(res):
    a = res.validation_auc_mean('d=2, without prereqs and bias')
    b = res.validation_auc_mean('d=2, without prereqs, with bias')
    c = res.validation_auc_mean('d=2, with prereqs, without bias')
    d = res.validation_auc_mean('d=2, with prereqs and bias')
    return np.mean([c-a, d-b])
make_plot()