In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import gzip
import shutil
import time
from scipy.optimize import minimize
In [2]:
def hdf_fixed_write_compress(df):
    # Write the frame to an HDF5 store with blosc compression
    df.to_hdf('data1-step1.hdf', 'test', mode='w', complib='blosc')
    return

def hdf_fixed_read_compress():
    # Read the store decompressed in the next cell
    df = pd.read_hdf('data.hdf', 'test')
    return df
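For reference, a minimal round-trip sketch of the helpers above (the demo.hdf filename and the explicit complevel are illustrative assumptions; the original write call relies on pandas' default compression level):

# Hypothetical round-trip check of the HDF5 helpers above; demo.hdf and
# complevel=9 are illustrative, not part of the original pipeline.
demo = pd.DataFrame({'Anon Student Id': ['s1', 's2'], 'Outcome': [0, 1]})
demo.to_hdf('demo.hdf', 'test', mode='w', complib='blosc', complevel=9)
assert pd.read_hdf('demo.hdf', 'test').equals(demo)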
In [3]:
with gzip.open('data1.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
!ls -lh data.hdf
data = hdf_fixed_read_compress()
data.head()
Out[3]:
This step takes roughly 45 minutes to complete.
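For reference, the model fit below is a per-student power-law error rate, $y_i = b\,x_i^{-d}$, where $b$ is the difficulty parameter and $d$ the learning rate. The objective minimized for each student is the Cash statistic (eq. 5 of Cash 1979), $C = 2\sum_i \left(y_i - n_i \ln y_i\right)$, with $n_i$ the observed outcome at attempt $x_i$: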
In [4]:
start_time = time.time()
#df = data.head(50000).copy()
df = data.copy()
stud_list = df['Anon Student Id'].unique()
cols = ['learning_parameter',
        'difficulty_parameter',
        'number of attempts',
        'number of incorrect attempts']
numbers = []
#stud_data = pd.DataFrame(columns=cols)
stud_info_df = pd.DataFrame()
j = 0
# Taken from http://apmonitor.com/che263/index.php/Main/PythonDataRegression
# and adapted for my purposes.
# Implement the C-stat (no need for binning); the relevant formula is (5) in
# the W. Cash paper http://adsabs.harvard.edu/doi/10.1086/156922
# see also B5 of https://heasarc.gsfc.nasa.gov/docs/xanadu/xspec/manual/XSappendixStatistics.html
# calculate y
def calc_y(x):
    b = x[0]  # difficulty parameter
    d = x[1]  # learning rate
    y = b*(xm)**(-d)  # power-law error-rate model; xm is set globally in the loop below
    return y
# define C-stat
def C_stat(x):
    # xm = np.array(attempts_data_stud_num['Attempt At Step'])
    # ym = 1-np.array(attempts_data_stud_num['Outcome']) # 1-x because we fit the error rate
    # calculate y
    y = calc_y(x)
    # calculate C-stat
    Cstat = 0.0
    for i in range(len(ym)):
        Cstat += 2*(y[i] - ym[i]*np.log(y[i]))  # C-stat, see eq. (5) in http://adsabs.harvard.edu/doi/10.1086/156922
    # return result
    return Cstat
for stud_name in stud_list:
    stud_info_df = df[(df['Anon Student Id'] == stud_name) & (df['Outcome'] <= 1)].copy()
    stud_name = j  # replace the anonymized string ID with a numeric index
    xm = np.array(stud_info_df['x'])
    ym = np.array(stud_info_df['Outcome'])
    # initial guesses
    x0 = np.zeros(2)
    x0[0] = 0.7  # initial difficulty_parameter_b
    x0[1] = 0.5  # initial learning_rate_d
    # optimize: bounds on variables
    bounds_difficulty_parameter_b = (1e-3, 1.0e+1)
    bounds_learning_parameter_d = (-1.0e+2, 1.0e+2)
    solution = minimize(C_stat, x0, method='SLSQP',
                        bounds=(bounds_difficulty_parameter_b, bounds_learning_parameter_d))
    # method='SLSQP' is the original choice;
    # other methods (L-BFGS-B, TNC) give the same results,
    # COBYLA is simply too slow ...
    x = solution.x
    y = calc_y(x)
    numbers.append([x[1], x[0], len(xm), sum(ym)])
    # print(x[1], x[0], C_stat(x0), C_stat(x))
    print("\r\t>>> Progress\t:{:.4%}".format((j + 1)/len(stud_list)), end='')
    j += 1
stud_data = pd.DataFrame(data=numbers, columns=cols)
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
In [8]:
stud_data.head()
Out[8]:
In [9]:
stud_data.to_csv('student_learning_final.csv')
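To reload the table later, skip the unnamed index column that to_csv wrote; a one-line sketch (the variable name is illustrative):

# Hypothetical reload of the saved results; index_col=0 drops the index column.
stud_data_reloaded = pd.read_csv('student_learning_final.csv', index_col=0)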
In [10]:
! ls -l student_learning_final.csv