Udacity MLND Capstone Project

"Determination of students’ interaction patterns with an intelligent tutoring system and study of their correlation with successful learning"

Step 2 (learning rate determination)


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import gzip
import shutil
import time
from scipy.optimize import minimize

In [2]:
def hdf_fixed_write_compress(df, path='data1-step1.hdf', key='test'):
    """Write a DataFrame to a compressed fixed-format HDF5 file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str, optional
        Output file path (default kept for backward compatibility).
    key : str, optional
        Identifier of the dataset inside the HDF5 store.
    """
    # complevel must be set explicitly: in pandas, specifying complib
    # alone leaves compression disabled (complevel defaults to 0).
    df.to_hdf(path, key, mode='w', complevel=9, complib='blosc')
    return

def hdf_fixed_read_compress(path='data.hdf', key='test'):
    """Read a DataFrame back from an HDF5 file.

    Parameters
    ----------
    path : str, optional
        Input file path (default kept for backward compatibility;
        note it differs from the writer's default on purpose — the
        reader consumes the decompressed Step 1 output).
    key : str, optional
        Identifier of the dataset inside the HDF5 store.

    Returns
    -------
    pandas.DataFrame
        The stored frame.
    """
    df = pd.read_hdf(path, key)
    return df

In [3]:
# Decompress the gzipped HDF5 file from Step 1 (data1.hdf.gz -> data.hdf).
with gzip.open('data1.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Sanity check: show the size of the decompressed file.
!ls -lh data.hdf

# Load the preprocessed interaction data and preview the first rows.
data = hdf_fixed_read_compress()
data.head()


-rw-rw-r-- 1 dima806 dima806 73M Nov  4 13:27 data.hdf
Out[3]:
Anon Student Id Session Id Duration (sec) Student Response Type Problem Name Problem View Attempt At Step Outcome Day x
0 Stu_001d187b1b375fe98b88696b250177f0 647501 102.0 1 2218 1.0 1.0 2.0 2004-11-10 0
1 Stu_001d187b1b375fe98b88696b250177f0 647501 46.0 0 2218 1.0 2.0 0.0 2004-11-10 1
2 Stu_001d187b1b375fe98b88696b250177f0 647792 70.0 1 3093 1.0 1.0 2.0 2004-11-10 0
3 Stu_001d187b1b375fe98b88696b250177f0 647792 22.0 1 3093 1.0 1.0 2.0 2004-11-10 0
4 Stu_001d187b1b375fe98b88696b250177f0 647792 2.0 1 3093 1.0 2.0 2.0 2004-11-10 0

This step takes about 45 minutes to complete:


In [4]:
start_time = time.time()

# Fit a power-law learning curve to every student in the data set.
# For a quick smoke test, use data.head(50000).copy() instead.
df = data.copy()
stud_list = df['Anon Student Id'].unique()

# Column layout of the per-student summary frame built at the end.
cols = ['learning_parameter',
        'difficulty_parameter',
        'number of attempts',
        'number of incorrect attempts']

numbers = []                      # one row per student, matching `cols`
stud_info_df = pd.DataFrame()     # reassigned per student inside the loop
j = 0                             # running student index (progress reporting)
# Taken from http://apmonitor.com/che263/index.php/Main/PythonDataRegression
# and adapted for my purposes

# Implement C-stat (no need for binning); the relevant formula is eq. (5) in 
# W.Cash paper http://adsabs.harvard.edu/doi/10.1086/156922  
# see also B5 of https://heasarc.gsfc.nasa.gov/docs/xanadu/xspec/manual/XSappendixStatistics.html 

# calculate the modelled error rate y
def calc_y(x, xm_vals=None):
    """Power-law learning-curve model: y = b * xm**(-d).

    Parameters
    ----------
    x : sequence of two floats
        x[0] = difficulty parameter b, x[1] = learning rate d.
    xm_vals : array-like, optional
        Attempt numbers to evaluate at.  Defaults to the module-level
        ``xm`` array set inside the fitting loop (kept for backward
        compatibility with the existing callers).

    Returns
    -------
    numpy.ndarray
        Modelled error rate at each attempt.
    """
    if xm_vals is None:
        xm_vals = xm  # fall back to the global set by the fitting loop
    b = x[0]
    d = x[1]
    return b * xm_vals ** (-d)  # fitting with a power-law error function

# define the C-statistic (Poisson maximum-likelihood fit statistic, no
# binning needed); relevant formula is eq. (5) in W. Cash,
# http://adsabs.harvard.edu/doi/10.1086/156922 — see also Appendix B of
# https://heasarc.gsfc.nasa.gov/docs/xanadu/xspec/manual/XSappendixStatistics.html
def C_stat(x):
    """Return the C-statistic of the model calc_y(x) against the data.

    Reads the module-level arrays ``xm`` (through calc_y) and ``ym``
    that are set inside the per-student fitting loop; intended to be
    passed as the objective to scipy.optimize.minimize.
    """
    y = calc_y(x)
    # Vectorized form of sum_i 2*(y_i - ym_i*log(y_i)); replaces the
    # original element-by-element Python loop with one numpy reduction.
    return 2.0 * np.sum(y - ym * np.log(y))

# Fit the power-law learning curve independently for every student.
for stud_name in stud_list:

    # This student's rows with Outcome <= 1 only (values above 1 are
    # excluded from the fit).
    stud_info_df = df[(df['Anon Student Id'] == stud_name) & (df['Outcome'] <= 1)].copy()


    stud_name = j # NOTE(review): dead assignment — stud_name is overwritten by the for loop on the next iteration and never read; safe to remove
    # xm/ym are module-level globals consumed by calc_y()/C_stat()
    # while minimize() evaluates the objective.
    xm = np.array(stud_info_df['x'])
    ym = np.array(stud_info_df['Outcome'])
    # initial guesses
    x0 = np.zeros(2)
    x0[0] = 0.7 # initial difficulty_parameter_b
    x0[1] = 0.5 # initial learning_rate_d

    # optimize
    # bounds on variables
    bounds_difficulty_parameter_b = (1e-3, 1.0e+1)
    bounds_learning_parameter_d = (-1.0e+2, 1.0e+2)
    solution = minimize(C_stat, x0, method='SLSQP', bounds=(bounds_difficulty_parameter_b, bounds_learning_parameter_d))

    # method = 'SLSQP' - original
    # other methods (L-BFGS-B, TNC) give the same results,
    # COBYLA is simply too slow ...

    x = solution.x
    y = calc_y(x)  # NOTE(review): result is unused below; kept for debugging only

    # Row layout matches `cols`: [learning d, difficulty b, n attempts, n incorrect].
    # sum(ym) counts rows with Outcome == 1 — presumably incorrect attempts; TODO confirm the encoding.
    numbers.append([x[1], x[0], len(xm), sum(ym)])

#    print(x[1], x[0], C_stat(x0), C_stat(x))

    # In-place progress indicator ("\r" keeps it on a single line).
    print("\r\t>>> Progress\t:{:.4%}".format((j + 1)/len(stud_list)), end='')
    j += 1
# Assemble the per-student summary table and report the total runtime.
stud_data = pd.DataFrame(data=numbers, columns=cols)
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))


	>>> Progress	:100.0000%
	>>> Exec. time	:2967.151068687439s

In [8]:
stud_data.head()


Out[8]:
learning_parameter difficulty_parameter number of attempts number of incorrect attempts
0 0.172964 0.577597 303 146.0
1 -0.011161 0.623980 295 187.0
2 -0.084896 0.459276 529 269.0
3 0.044947 0.459728 1286 556.0
4 0.066242 0.486793 821 369.0

In [9]:
stud_data.to_csv('student_learning_final.csv')

In [10]:
! ls -l student_learning_final.csv


-rw-rw-r-- 1 dima806 dima806 471268 Nov  4 14:34 student_learning_final.csv

In [ ]: