In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import time
import gzip
import shutil
import seaborn
In [2]:
print("Loading ds92 data:")
ds92_data = pd.read_csv("wpi-assistments/math_2004_2005/ds92_tx_All_Data_172_2016_0504_081852.txt", sep="\t", low_memory=False)
#ds92_data = ds92_data[columns]
print(ds92_data.shape)
print("Loading ds120 data:")
ds120_data = pd.read_csv("wpi-assistments/math_2005_2006/ds120_tx_All_Data_265_2017_0414_065125.txt", sep="\t", low_memory=False)
#ds120_data = ds120_data.copy()[columns]
print(ds120_data.shape)
print("Loading ds339 data:")
ds339_data = pd.read_csv("wpi-assistments/math_2006_2007/ds339_tx_All_Data_1059_2015_0729_215742.txt", sep="\t", low_memory=False)
#ds339_data = ds339_data.copy()[columns]
print(ds339_data.shape)
In [3]:
for i, df in enumerate([ds92_data, ds120_data, ds339_data], start=1):
    print(">> Processing dataset {}:".format(i))
    # Keep only the date part of the timestamp
    df['Day'] = df['Time'].apply(lambda x: x.split(" ")[0])
    df.drop(['Time'], axis=1, inplace=True)
    # '.' marks a missing duration; treat it as 0 seconds
    df['Duration (sec)'] = df['Duration (sec)'].replace({'.': 0}).astype(float)
    # Encode categorical values as integers
    df['Student Response Type'] = df['Student Response Type'].replace({'ATTEMPT': 0, 'HINT_REQUEST': 1})
    df['Outcome'] = df['Outcome'].replace({'CORRECT': 0, 'INCORRECT': 1, 'HINT': 2})
In [4]:
ds92_data.describe().dropna(axis=1)
Out[4]:
In [5]:
ds120_data.describe().dropna(axis=1)
Out[5]:
In [6]:
ds339_data.describe().dropna(axis=1)
Out[6]:
As we can see, several features are common to all three datasets. I will use all of them, skipping only 'Help Level' (which gives the number of the subsequent hint for hint requests and NaN for attempts).
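The shared features can also be listed programmatically; a quick sketch using the three DataFrames loaded above:
# Columns present in all three datasets
common = (set(ds92_data.columns)
          & set(ds120_data.columns)
          & set(ds339_data.columns))
sorted(common)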
In [7]:
data = pd.concat([ds92_data, ds120_data, ds339_data], ignore_index=True)
data.info()
In [8]:
data.head(20)
Out[8]:
In [9]:
data.describe().dropna(axis=1)
Out[9]:
In [10]:
corr1 = data.corr().dropna(how='all', axis=1).dropna(how='all', axis=0)
corr1
Out[10]:
In [11]:
seaborn.heatmap(corr1);
As we can see, there are some large (anti)correlations between the following pairs of features:
- 'Outcome' and 'Student Response Type' (hints contribute the largest values in both features);
- 'Attempt At Step' and 'Help Level' (making more steps for attempts generally means requesting more hints);
- 'Help Level' and 'Total Num Hints' (the number of the subsequent hint clearly correlates with the total number of hints);
- 'Is Last Attempt' and 'Outcome';
- 'Is Last Attempt' and 'Student Response Type' (a product of the correlation between 'Outcome' and 'Student Response Type' and the anti-correlation between 'Is Last Attempt' and 'Outcome').
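The strongest pairs can also be read off the matrix programmatically; a minimal sketch using the corr1 computed above:
# Largest absolute off-diagonal correlations; each pair appears twice (symmetry)
pairs = corr1.abs().unstack().sort_values(ascending=False)
pairs[pairs < 1].head(10)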
In [12]:
columns = ['Anon Student Id',
           'Session Id',
           'Duration (sec)',
           'Student Response Type',
           'Problem Name',
           'Problem View',
           'Attempt At Step',
           'Outcome',
           'Day']
In [13]:
data = data[columns].copy()
Note to reviewers: this algorithm is quite slow (~25 minutes), so you may consider adding the 'x' variable (attempt number) to only a substantial subset of the ASSISTments dataset (e.g. processing 100,000 rows takes only ~0.5 minutes). A vectorized alternative is sketched after the next two cells.
In [14]:
def adding_x(df):
    start_time = time.time()
    df['x'] = 0
    # Only attempts get numbered; hint requests keep x = 0
    df_attempts = df[df['Student Response Type'] == 0].copy()
    stud_list = df_attempts['Anon Student Id'].unique()
    for j, student in enumerate(stud_list):
        print("\r\t>>> Progress\t:{:.4%}".format((j + 1)/len(stud_list)), end='')
        data_stud = df_attempts[np.in1d(df_attempts['Anon Student Id'], [student])].copy()
        for problem in data_stud['Problem Name'].unique():
            data_prob = data_stud[np.in1d(data_stud['Problem Name'], [problem])].copy()
            # Number this student's attempts at this problem 1, 2, 3, ...
            data_stud.loc[data_prob.index, 'x'] = range(1, len(data_prob) + 1)
        df_attempts.loc[data_stud.index, 'x'] = data_stud['x']
    end_time = time.time()
    print("\n\t>>> Exec. time\t:{}s".format(end_time - start_time))
    return df_attempts
In [15]:
#data_x = adding_x(data.head(100000).copy())  # faster alternative: a 100,000-row subset
data_x = adding_x(data.copy())
# Copy the attempt numbers back; non-attempt rows keep x = 0
data['x'] = 0
data.loc[data_x.index, 'x'] = data_x['x']
data[data['x'] > 0].shape
Out[15]:
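For reference, the same 'x' column can be built without explicit loops using groupby().cumcount(). This is only a sketch: it assumes the rows of each student/problem pair already appear in attempt order, so it is worth verifying against adding_x on a subset first.
attempts = data['Student Response Type'] == 0
data['x'] = 0
# cumcount() numbers the rows within each (student, problem) group from 0
data.loc[attempts, 'x'] = (data[attempts]
                           .groupby(['Anon Student Id', 'Problem Name'])
                           .cumcount() + 1)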
I write the data to and read it back from a compressed HDF file; the round trip is verified below, and a rough timing sketch follows it:
In [16]:
def hdf_fixed_write_compress(df):
    # Fixed-format HDF with blosc compression
    df.to_hdf('data.hdf', 'test', mode='w', complib='blosc')

def hdf_fixed_read_compress():
    return pd.read_hdf('data.hdf', 'test')
In [17]:
hdf_fixed_write_compress(data)
data1 = hdf_fixed_read_compress()
# Count rows that differ between the original and the reloaded frame (expect 0)
ne = data[data != data1]
ne.dropna(axis=0, how='all', inplace=True)
ne.shape[0]
Out[17]:
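A minimal timing sketch for the round trip (illustrative only; the numbers depend on disk speed and data size):
start = time.time()
hdf_fixed_write_compress(data)
print("write: {:.1f}s".format(time.time() - start))
start = time.time()
_ = hdf_fixed_read_compress()
print("read: {:.1f}s".format(time.time() - start))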
This file is too large to upload to GitHub:
In [18]:
! ls -lh data.hdf
so I gzipped it:
In [19]:
with open('data.hdf', 'rb') as f_in, gzip.open('data.hdf.gz', 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
The resulting data.hdf.gz file is smaller than 25 MB, so I can upload it to my GitHub:
In [20]:
! ls -lh data.hdf.gz
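For completeness, the reverse step for anyone cloning the repository (a sketch, assuming the file names above):
# Restore data.hdf from the gzipped copy and reload the DataFrame
with gzip.open('data.hdf.gz', 'rb') as f_in, open('data.hdf', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
data = pd.read_hdf('data.hdf', 'test')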
In [21]:
s1 = data[data['Outcome'] <= 1].groupby(['x']).agg(len)['Problem Name']
s2 = data[data['Outcome'] == 1].groupby(['x']).agg(len)['Problem Name']
# Collapse attempt numbers 8 and above into a single '8+' bucket
s1[8] = s1.loc[8:].sum()
s1 = s1[s1.index <= 8]
s2[8] = s2.loc[8:].sum()
s2 = s2[s2.index <= 8]
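A quick sanity check (a sketch): collapsing into the '8+' bucket must not change the totals.
assert s1.sum() == data[data['Outcome'] <= 1].groupby(['x']).agg(len)['Problem Name'].sum()
assert s2.sum() == data[data['Outcome'] == 1].groupby(['x']).agg(len)['Problem Name'].sum()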
In [22]:
# If the x tick labels come out wrong, simply run this cell twice:
fig, ax1 = plt.subplots()
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 8.3
fig_size[1] = 4.7
plt.rcParams["figure.figsize"] = fig_size
plt.xlim(0.5, 8.5)
plt.bar(s1.index, s1, width=0.9)
#plt.bar(s2.index, s2, width=0.9)
#plt.legend(['CORRECT', 'INCORRECT'])
plt.xlabel("Attempt number", size=14)
plt.ylabel("Number of attempts", size=14)
ax1.tick_params(axis='both', which='major', length=0, labelsize=14, color='black')
ax1.tick_params(axis='both', which='minor', length=0)
# The last bin aggregates attempt numbers 8 and above
labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8+']
# Second y-axis: fraction of incorrect attempts per attempt number
ax2 = ax1.twinx()
ax2.plot(s1.index, s2/s1, 'r-o')
ax2.set_ylabel('Fraction of incorrect attempts', size=14, color='r')
ax2.tick_params('y', colors='r')
ax2.tick_params(axis='both', which='minor', length=0)
ax2.tick_params(axis='both', which='major', length=0, labelsize=14, color='red')
ax1.set_xticklabels(labels)
plt.show()
fig.savefig('data-visualisation.png')