In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import gzip
import shutil
import time
from scipy.optimize import minimize
In [2]:
def hdf_fixed_write_compress(df):
    # Write the frame to an HDF5 store with blosc compression
    df.to_hdf('data1-step1.hdf', 'test', mode='w', complib='blosc')
    return

def hdf_fixed_read_compress():
    # Read the store decompressed in the next cell
    df = pd.read_hdf('data.hdf', 'test')
    return df
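For reference, a minimal round-trip sketch of the helpers above (the demo.hdf filename and the explicit complevel are illustrative assumptions; the original write call relies on pandas' default compression level):

# Hypothetical round-trip check of the HDF5 helpers above; demo.hdf and
# complevel=9 are illustrative, not part of the original pipeline.
demo = pd.DataFrame({'Anon Student Id': ['s1', 's2'], 'Outcome': [0, 1]})
demo.to_hdf('demo.hdf', 'test', mode='w', complib='blosc', complevel=9)
assert pd.read_hdf('demo.hdf', 'test').equals(demo)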
In [3]:
with gzip.open('data1.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
!ls -lh data.hdf
data = hdf_fixed_read_compress()
data.head()
Out[3]:
This step takes roughly 45 minutes to complete.
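For reference, the model fit below is a per-student power-law error rate, $y_i = b\,x_i^{-d}$, where $b$ is the difficulty parameter and $d$ the learning rate. The objective minimized for each student is the Cash statistic (eq. 5 of Cash 1979), $C = 2\sum_i \left(y_i - n_i \ln y_i\right)$, with $n_i$ the observed outcome at attempt $x_i$: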
In [4]:
start_time = time.time()
#df = data.head(50000).copy()
df = data.copy()
stud_list = df['Anon Student Id'].unique()
cols = ['learning_parameter',
        'difficulty_parameter',
        'number of attempts',
        'number of incorrect attempts']
numbers = []
#stud_data = pd.DataFrame(columns=cols)
stud_info_df = pd.DataFrame()
j = 0
# Taken from http://apmonitor.com/che263/index.php/Main/PythonDataRegression
# and adapted for my purposes.
# Implement the C-stat (no need for binning); the relevant formula is (5) in
# the W. Cash paper http://adsabs.harvard.edu/doi/10.1086/156922
# see also B5 of https://heasarc.gsfc.nasa.gov/docs/xanadu/xspec/manual/XSappendixStatistics.html
# calculate y
def calc_y(x):
    b = x[0]  # difficulty parameter
    d = x[1]  # learning rate
    y = b*(xm)**(-d)  # power-law error-rate model; xm is set globally in the loop below
    return y
# define C-stat
def C_stat(x):
    # xm = np.array(attempts_data_stud_num['Attempt At Step'])
    # ym = 1-np.array(attempts_data_stud_num['Outcome']) # 1-x because we fit the error rate
    # calculate y
    y = calc_y(x)
    # calculate C-stat
    Cstat = 0.0
    for i in range(len(ym)):
        Cstat += 2*(y[i] - ym[i]*np.log(y[i]))  # C-stat, see eq. (5) in http://adsabs.harvard.edu/doi/10.1086/156922
    # return result
    return Cstat
for stud_name in stud_list:
    stud_info_df = df[(df['Anon Student Id'] == stud_name) & (df['Outcome'] <= 1)].copy()
    stud_name = j  # replace the anonymized string ID with a numeric index
    xm = np.array(stud_info_df['x'])
    ym = np.array(stud_info_df['Outcome'])
    # initial guesses
    x0 = np.zeros(2)
    x0[0] = 0.7  # initial difficulty_parameter_b
    x0[1] = 0.5  # initial learning_rate_d
    # optimize: bounds on variables
    bounds_difficulty_parameter_b = (1e-3, 1.0e+1)
    bounds_learning_parameter_d = (-1.0e+2, 1.0e+2)
    solution = minimize(C_stat, x0, method='SLSQP',
                        bounds=(bounds_difficulty_parameter_b, bounds_learning_parameter_d))
    # method='SLSQP' is the original choice;
    # other methods (L-BFGS-B, TNC) give the same results,
    # COBYLA is simply too slow ...
    x = solution.x
    y = calc_y(x)
    numbers.append([x[1], x[0], len(xm), sum(ym)])
    # print(x[1], x[0], C_stat(x0), C_stat(x))
    print("\r\t>>> Progress\t:{:.4%}".format((j + 1)/len(stud_list)), end='')
    j += 1
stud_data = pd.DataFrame(data=numbers, columns=cols)
end_time = time.time()
print("\n\t>>> Exec. time\t:{}s".format(end_time-start_time))
In [8]:
stud_data.head()
Out[8]:
In [9]:
stud_data.to_csv('student_learning_final.csv')
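To reload the table later, skip the unnamed index column that to_csv wrote; a one-line sketch (the variable name is illustrative):

# Hypothetical reload of the saved results; index_col=0 drops the index column.
stud_data_reloaded = pd.read_csv('student_learning_final.csv', index_col=0)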
In [10]:
! ls -l student_learning_final.csv