Udacity MLND Capstone Project

"Determination of students’ interaction patterns with an intelligent tutoring system and study of their correlation with successful learning"

Step 3 (comparison of learning rates between clusters)


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import time
from scipy import stats
from scipy.optimize import minimize

In [2]:
stud_learning = pd.read_csv('student_learning_final.csv')
stud_learning.drop(['Unnamed: 0'], axis=1, inplace=True)
cluster_index = pd.read_csv("cluster_index.csv", header=None)
stud_learning['cluster_index'] = cluster_index[1]
stud_learning['frac_incorrect_atts'] = stud_learning['number of incorrect attempts'] / stud_learning['number of attempts']
stud_learning.head()


Out[2]:
learning_parameter difficulty_parameter number of attempts number of incorrect attempts cluster_index frac_incorrect_atts
0 0.172964 0.577597 303 146.0 1 0.481848
1 -0.011161 0.623980 295 187.0 5 0.633898
2 -0.084896 0.459276 529 269.0 6 0.508507
3 0.044947 0.459728 1286 556.0 2 0.432348
4 0.066242 0.486793 821 369.0 1 0.449452

In [3]:
stud_learning.columns


Out[3]:
Index(['learning_parameter', 'difficulty_parameter', 'number of attempts',
       'number of incorrect attempts', 'cluster_index', 'frac_incorrect_atts'],
      dtype='object')

In [4]:
stud_data = pd.read_hdf('stud_data.hdf','test')
stud_data.head()


Out[4]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts
0 89 9 79 303 213 0.518152 0.184818 0.286385 9577.000 3660.999 1.101266 3.835443
1 86 7 59 295 111 0.366102 0.071186 0.063063 10409.000 2570.000 1.610169 5.000000
2 181 10 150 529 180 0.491493 0.102079 0.077778 14850.000 2295.000 1.240000 3.526667
3 457 14 215 1288 687 0.566770 0.148292 0.066958 25290.001 7743.000 2.148837 5.990698
4 267 13 166 821 602 0.550548 0.068210 0.267442 20504.667 5347.334 1.660606 4.945783

In [5]:
stud_data = stud_data.join(stud_learning)
stud_data.head()


Out[5]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts cluster_index frac_incorrect_atts
0 89 9 79 303 213 0.518152 0.184818 0.286385 9577.000 3660.999 1.101266 3.835443 0.172964 0.577597 303 146.0 1 0.481848
1 86 7 59 295 111 0.366102 0.071186 0.063063 10409.000 2570.000 1.610169 5.000000 -0.011161 0.623980 295 187.0 5 0.633898
2 181 10 150 529 180 0.491493 0.102079 0.077778 14850.000 2295.000 1.240000 3.526667 -0.084896 0.459276 529 269.0 6 0.508507
3 457 14 215 1288 687 0.566770 0.148292 0.066958 25290.001 7743.000 2.148837 5.990698 0.044947 0.459728 1286 556.0 2 0.432348
4 267 13 166 821 602 0.550548 0.068210 0.267442 20504.667 5347.334 1.660606 4.945783 0.066242 0.486793 821 369.0 1 0.449452

Determine which clusters are more successful in learning, in terms of the fraction of incorrect attempts:


In [6]:
stud_data_sum = stud_data.groupby('cluster_index').agg(np.sum).copy()
stud_data_sum['frac_incorrect_atts'] = stud_data_sum['number of incorrect attempts'] / stud_data_sum['number of attempts']
stud_data_sum


Out[6]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts frac_incorrect_atts
cluster_index
1 157381 9790 119787 463356 319334 746.328247 143.134751 320.395430 1.209619e+07 3310766.636 1700.710013 5073.330503 533.615744 590.950677 463202 197852.0 0.427140
2 22030 1535 17766 72336 33803 203.188547 91.172770 13.541033 1.645484e+06 459433.500 466.191313 1403.399037 426.118912 218.872902 71890 36290.0 0.504799
3 16010 1876 14923 33757 2842 646.793540 12.846483 2.963157 1.822630e+06 45694.000 1209.648207 3165.984896 2763.923125 546.419214 33752 12311.0 0.364749
4 36928 5409 32754 125483 52632 1256.070663 41.842531 26.817449 5.753900e+06 1572946.500 2775.185284 9999.657523 1684.809382 1333.332278 125470 63394.0 0.505252
5 121977 12125 108264 372790 134056 1088.841940 61.644813 46.097252 1.566643e+07 3317271.000 2253.456633 6800.545131 138.585166 891.014094 372649 171703.0 0.460763
6 329574 21053 265370 848789 258225 1112.744922 68.669698 62.199569 3.145390e+07 4868593.500 2292.982438 5939.409007 -11.449252 711.539726 847416 334224.0 0.394404

Interestingly, group 3 has the smallest fraction of incorrect attempts (~36.5%). Also, not surprisingly, 'frac_incorrect_atts' in group 1 (with a large 'frac_1s_hints') is significantly smaller (p-value = 1.75e-8) than in group 2 (with a small 'frac_1s_hints' and a large 'frac_3s_atts'):


In [7]:
arr1 = np.array(stud_data[stud_data['cluster_index'] == 1]['frac_incorrect_atts'])
arr2 = np.array(stud_data[stud_data['cluster_index'] == 2]['frac_incorrect_atts'])
arr1 = arr1[~np.isnan(arr1)]
arr2 = arr2[~np.isnan(arr2)]
stats.ttest_ind(arr1,arr2, equal_var = False)


Out[7]:
Ttest_indResult(statistic=-5.729328888101044, pvalue=1.7528523291077249e-08)
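The same Welch's t-test comparison (unequal variances, NaNs dropped first) is repeated below for several cluster pairs and features; a small helper of this form (purely a convenience sketch, not part of the original notebook) captures the pattern:

def compare_clusters(df, col, a, b):
    """Welch's t-test on column `col` between clusters `a` and `b` (NaNs dropped)."""
    x = df.loc[df['cluster_index'] == a, col].dropna().values
    y = df.loc[df['cluster_index'] == b, col].dropna().values
    return stats.ttest_ind(x, y, equal_var=False)

# e.g. compare_clusters(stud_data, 'frac_incorrect_atts', 1, 2)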

However, the difference in 'frac_incorrect_atts' between students with "gaming" and "non-gaming" behaviour is not significant (p-value = 0.83):


In [8]:
arr_gam = np.array(stud_data[stud_data['cluster_index'] <= 2]['frac_incorrect_atts'])
arr_nongam = np.array(stud_data[stud_data['cluster_index'] > 2]['frac_incorrect_atts'])
arr_gam = arr_gam[~np.isnan(arr_gam)]
arr_nongam = arr_nongam[~np.isnan(arr_nongam)]
stats.ttest_ind(arr_gam,arr_nongam, equal_var = False)


Out[8]:
Ttest_indResult(statistic=0.20874210756084033, pvalue=0.83466558756318543)

Notably, group 5 (students with medium 'num_sess' and 'num_probs') has a significantly smaller 'frac_incorrect_atts' than group 4 (students with small 'num_sess' and 'num_probs'), but a significantly larger 'frac_incorrect_atts' than group 6 (students with large 'num_sess' and 'num_probs'):


In [9]:
arr4 = np.array(stud_data[stud_data['cluster_index'] == 4]['frac_incorrect_atts'])
arr5 = np.array(stud_data[stud_data['cluster_index'] == 5]['frac_incorrect_atts'])
arr6 = np.array(stud_data[stud_data['cluster_index'] == 6]['frac_incorrect_atts'])
arr4 = arr4[~np.isnan(arr4)]
arr5 = arr5[~np.isnan(arr5)]
arr6 = arr6[~np.isnan(arr6)]
print(stats.ttest_ind(arr5,arr4, equal_var = False))
print(stats.ttest_ind(arr6,arr5, equal_var = False))


Ttest_indResult(statistic=-10.153852159747615, pvalue=5.9895165350814053e-24)
Ttest_indResult(statistic=-16.552377685289887, pvalue=1.8740538669177719e-59)

In other words, for students with non-gaming behaviour 'frac_incorrect_atts' steadily decreases with learning experience.

Other differences between groups:


In [10]:
stud_data_mean = stud_data.groupby('cluster_index').agg(np.mean).copy()
stud_data_mean


Out[10]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts frac_incorrect_atts
cluster_index
1 121.623648 7.565688 92.571097 358.080371 246.780526 0.576761 0.110614 0.247601 9347.902114 2558.552269 1.314304 3.920657 0.412377 0.456685 357.961360 152.899536 0.421592
2 56.342711 3.925831 45.437340 185.002558 86.452685 0.519664 0.233178 0.034632 4208.397711 1175.021739 1.192305 3.589256 1.089818 0.559777 183.861893 92.813299 0.479562
3 14.436429 1.691614 13.456267 30.439134 2.562669 0.583222 0.011584 0.002672 1643.489630 41.202885 1.090756 2.854811 2.492266 0.492713 30.434626 11.100992 0.402156
4 15.246903 2.233278 13.523534 51.809661 21.730801 0.518609 0.017276 0.011072 2375.681462 649.441164 1.145824 4.128678 0.695627 0.550509 51.804294 26.174236 0.479627
5 62.648690 6.227530 55.605547 191.468927 68.852594 0.559241 0.031661 0.023676 8046.446327 1703.785824 1.157399 3.492833 0.071179 0.457634 191.396507 88.188495 0.440491
6 181.383599 11.586681 146.048431 467.137589 142.116125 0.612408 0.037793 0.034232 17310.899564 2679.468079 1.261961 3.268800 -0.006301 0.391601 466.381948 183.942763 0.386384

There is a significant increase of 'frac_3s_atts' for groups 4-5-6:


In [11]:
arr4 = np.array(stud_data[stud_data['cluster_index'] == 4]['frac_3s_atts'])
arr5 = np.array(stud_data[stud_data['cluster_index'] == 5]['frac_3s_atts'])
arr6 = np.array(stud_data[stud_data['cluster_index'] == 6]['frac_3s_atts'])
arr4 = arr4[~np.isnan(arr4)]
arr5 = arr5[~np.isnan(arr5)]
arr6 = arr6[~np.isnan(arr6)]
print(stats.ttest_ind(arr5,arr4, equal_var = False))
print(stats.ttest_ind(arr6,arr5, equal_var = False))


Ttest_indResult(statistic=16.462911716329312, pvalue=5.0996180356250809e-59)
Ttest_indResult(statistic=6.2931050213512316, pvalue=3.4739480257167589e-10)

There is also a significant increase of 'frac_1s_hints' for groups 4-5-6:


In [12]:
arr4 = np.array(stud_data[stud_data['cluster_index'] == 4]['frac_1s_hints'])
arr5 = np.array(stud_data[stud_data['cluster_index'] == 5]['frac_1s_hints'])
arr6 = np.array(stud_data[stud_data['cluster_index'] == 6]['frac_1s_hints'])
arr4 = arr4[~np.isnan(arr4)]
arr5 = arr5[~np.isnan(arr5)]
arr6 = arr6[~np.isnan(arr6)]
print(stats.ttest_ind(arr5,arr4, equal_var = False))
print(stats.ttest_ind(arr6,arr5, equal_var = False))


Ttest_indResult(statistic=13.252648797032892, pvalue=3.4429894092371465e-39)
Ttest_indResult(statistic=9.0112499269936261, pvalue=3.2062949164427214e-19)

Likewise, there is a significant decrease of 'max_atts' for groups 4-5-6:


In [13]:
arr4 = np.array(stud_data[stud_data['cluster_index'] == 4]['max_atts'])
arr5 = np.array(stud_data[stud_data['cluster_index'] == 5]['max_atts'])
arr6 = np.array(stud_data[stud_data['cluster_index'] == 6]['max_atts'])
arr4 = arr4[~np.isnan(arr4)]
arr5 = arr5[~np.isnan(arr5)]
arr6 = arr6[~np.isnan(arr6)]
print(stats.ttest_ind(arr5,arr4, equal_var = False))
print(stats.ttest_ind(arr6,arr5, equal_var = False))


Ttest_indResult(statistic=-11.66474597325503, pvalue=6.3441131333456629e-31)
Ttest_indResult(statistic=-5.813163173033753, pvalue=6.640246772802727e-09)

The increase of 'max_probl_views' is significant only between groups 5 and 6 (between groups 4 and 5 it is not; p = 0.10):


In [14]:
arr4 = np.array(stud_data[stud_data['cluster_index'] == 4]['max_probl_views'])
arr5 = np.array(stud_data[stud_data['cluster_index'] == 5]['max_probl_views'])
arr6 = np.array(stud_data[stud_data['cluster_index'] == 6]['max_probl_views'])
arr4 = arr4[~np.isnan(arr4)]
arr5 = arr5[~np.isnan(arr5)]
arr6 = arr6[~np.isnan(arr6)]
print(stats.ttest_ind(arr5,arr4, equal_var = False))
print(stats.ttest_ind(arr6,arr5, equal_var = False))


Ttest_indResult(statistic=1.625027093046111, pvalue=0.1042377703597938)
Ttest_indResult(statistic=15.833103986243177, pvalue=1.9284145705097942e-54)

As we see, increasing "experience" (in the group sequence 4-5-6) also leads to:

  • an increase of the "gaming" fractions 'frac_3s_atts' and 'frac_1s_hints';
  • an increase of 'max_probl_views' (so problems are viewed in more detail);
  • a decrease of 'max_atts' (so there are fewer attempts per problem).

Calculate the mean learning parameter and its dispersion for each group:

Because the learning parameter is determined from a fit of the learning curve, it is essential to analyse learning curves starting from some reasonable minimum number of attempts.

As we see, there is a very large spread in learning parameters: between -9.97 and 24.9:


In [15]:
stud_data['learning_parameter'].describe()


Out[15]:
count    8980.000000
mean        0.616437
std         3.188777
min        -9.965780
25%        -0.054653
50%         0.063602
75%         0.218565
max        24.908305
Name: learning_parameter, dtype: float64
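To see how the minimum-attempt cutoff suggested above would affect this spread, one could, for example, restrict the sample before describing it (a sketch only; the threshold of 20 attempts is an arbitrary illustrative choice, not a value used in this project):

min_atts = 20  # illustrative threshold
reliable = stud_data[stud_data['number of attempts'] >= min_atts]
reliable['learning_parameter'].describe()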

Moreover, for ~1/3 of students the learning parameter is negative:


In [16]:
stud_data[stud_data['learning_parameter'] < 0].shape[0]


Out[16]:
3179

What best describes "negative learners"? First, look at extreme examples: take "extreme negative learners" (students with a learning rate < -0.5) and compare them with "extreme positive learners" (students with a learning rate >= 0.5):


In [17]:
stud_data[stud_data['learning_parameter'] < -0.5].describe()


Out[17]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts cluster_index frac_incorrect_atts
count 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000 161.000000
mean 34.708075 2.720497 32.254658 51.360248 14.006211 0.688847 0.025639 0.050475 3113.850932 371.161491 1.092419 2.245839 -1.227948 0.161278 51.291925 11.478261 3.509317 0.310729
std 56.546180 2.928761 51.954220 78.542310 26.625058 0.171281 0.055839 0.151386 5421.411714 472.810955 0.224871 1.444326 1.702650 0.099337 78.469551 16.197256 1.275333 0.171545
min 1.000000 1.000000 1.000000 2.000000 0.000000 0.200000 0.000000 0.000000 26.000000 0.000000 1.000000 1.000000 -9.965780 0.001000 2.000000 1.000000 1.000000 0.024793
25% 5.000000 1.000000 5.000000 10.000000 2.000000 0.588235 0.000000 0.000000 546.000000 46.000000 1.000000 1.441860 -1.112757 0.102926 10.000000 3.000000 3.000000 0.181818
50% 11.000000 1.000000 11.000000 21.000000 5.000000 0.727273 0.000000 0.000000 961.000000 172.000000 1.000000 1.705882 -0.759733 0.154174 21.000000 5.000000 4.000000 0.272727
75% 35.000000 3.000000 35.000000 53.000000 13.000000 0.814286 0.022222 0.000000 2215.000000 487.000000 1.093023 2.411765 -0.594690 0.211333 53.000000 13.000000 4.000000 0.411765
max 397.000000 13.000000 362.000000 588.000000 184.000000 0.975207 0.333333 1.000000 41457.000000 2319.000000 2.600000 11.000000 -0.503115 0.529482 588.000000 136.000000 6.000000 0.800000

In [18]:
stud_data[stud_data['learning_parameter'] >= 0.5].describe()


Out[18]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts cluster_index frac_incorrect_atts
count 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1118.000000 1080.000000
mean 9.767442 1.384615 9.294275 20.773703 9.701252 0.658382 0.040541 0.031332 975.757603 238.600179 1.078327 2.370444 4.841834 0.523349 20.770125 6.887299 3.181574 0.318386
std 13.127075 0.965203 12.326656 28.717120 28.702815 0.254652 0.118064 0.109242 1235.419496 402.316758 0.271104 1.544594 7.776401 0.346560 28.710326 10.527272 0.927130 0.226612
min 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.500000 0.001000 0.000000 0.000000 1.000000 0.000000
25% 2.000000 1.000000 2.000000 5.000000 1.000000 0.541964 0.000000 0.000000 185.750000 9.000000 1.000000 1.200000 0.533341 0.252888 5.000000 1.000000 3.000000 0.166667
50% 6.000000 1.000000 5.000000 12.000000 3.000000 0.666667 0.000000 0.000000 644.500000 94.500000 1.000000 2.000000 0.793611 0.520733 12.000000 3.000000 3.000000 0.315789
75% 11.000000 1.000000 11.000000 25.000000 9.000000 0.833333 0.000000 0.000000 1199.000000 311.000000 1.026931 3.000000 1.853239 0.749445 25.000000 9.000000 4.000000 0.444444
max 121.000000 12.000000 116.000000 332.000000 661.000000 1.000000 1.000000 1.000000 10044.000000 6247.000000 5.000000 12.000000 24.908305 1.669793 332.000000 121.000000 6.000000 1.000000

As we see, "extreme negative learners" are very similar to "extreme positive learners" in terms of 'frac_corr_atts'. However, they opened much more sessions ('num_sess'), tried to solve more problems ('num_probs'), made more attempts to solve the problems ('num_atts') and spent more time (time_atts) for solving them.


In [19]:
stud_data[stud_data['learning_parameter'] < -0.5].groupby('cluster_index').agg(len)['num_sess']


Out[19]:
cluster_index
1    20
3    57
4    58
5    14
6    12
Name: num_sess, dtype: int64

In [20]:
stud_data[stud_data['learning_parameter'] >= 0.5].groupby('cluster_index').agg(len)['num_sess']


Out[20]:
cluster_index
1     88
2     93
3    501
4    404
5     28
6      4
Name: num_sess, dtype: int64

Notably, most of "extreme learners" belong to groups 3 and 4 that have the smallest 'num_atts':


In [21]:
stud_data.groupby('cluster_index').agg(np.mean)


Out[21]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts frac_incorrect_atts
cluster_index
1 121.623648 7.565688 92.571097 358.080371 246.780526 0.576761 0.110614 0.247601 9347.902114 2558.552269 1.314304 3.920657 0.412377 0.456685 357.961360 152.899536 0.421592
2 56.342711 3.925831 45.437340 185.002558 86.452685 0.519664 0.233178 0.034632 4208.397711 1175.021739 1.192305 3.589256 1.089818 0.559777 183.861893 92.813299 0.479562
3 14.436429 1.691614 13.456267 30.439134 2.562669 0.583222 0.011584 0.002672 1643.489630 41.202885 1.090756 2.854811 2.492266 0.492713 30.434626 11.100992 0.402156
4 15.246903 2.233278 13.523534 51.809661 21.730801 0.518609 0.017276 0.011072 2375.681462 649.441164 1.145824 4.128678 0.695627 0.550509 51.804294 26.174236 0.479627
5 62.648690 6.227530 55.605547 191.468927 68.852594 0.559241 0.031661 0.023676 8046.446327 1703.785824 1.157399 3.492833 0.071179 0.457634 191.396507 88.188495 0.440491
6 181.383599 11.586681 146.048431 467.137589 142.116125 0.612408 0.037793 0.034232 17310.899564 2679.468079 1.261961 3.268800 -0.006301 0.391601 466.381948 183.942763 0.386384

Together with group 2 (the smallest group), groups 3 and 4 also show the largest variation of 'learning_parameter' across their members:


In [22]:
stud_data.groupby('cluster_index').agg(np.std)['learning_parameter']


Out[22]:
cluster_index
1    2.575010
2    4.092726
3    6.263133
4    3.261689
5    0.904354
6    0.151328
Name: learning_parameter, dtype: float64

Note that both the absolute value of the mean learning parameter and its standard deviation tend towards zero with increasing 'num_atts': a possible manifestation of the plateau effect.
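One way to check this observation directly (a sketch, not part of the original notebook) is to tabulate the group-wise mean of 'num_atts' against the mean and standard deviation of 'learning_parameter':

grouped = stud_data.groupby('cluster_index')
pd.DataFrame({'mean_num_atts': grouped['num_atts'].mean(),
              'mean_learning_parameter': grouped['learning_parameter'].mean(),
              'std_learning_parameter': grouped['learning_parameter'].std()})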

Finally, I try to explain the observed increase of 'frac_incorrect_atts' for 5 or more attempts.

The group with the largest 'frac_incorrect_atts' is group 4, which also has the largest 'max_atts' (the maximum attempt number per problem, averaged over all assessed problems). Conversely, the two groups with the smallest 'frac_incorrect_atts' (groups 3 and 6) also have the smallest 'max_atts':


In [23]:
stud_data.groupby('cluster_index').agg(np.mean)[['frac_incorrect_atts', 'max_atts']].corr()


Out[23]:
frac_incorrect_atts max_atts
frac_incorrect_atts 1.000000 0.660316
max_atts 0.660316 1.000000

In [24]:
stud_data.corr()['frac_incorrect_atts']['max_atts']


Out[24]:
0.51717864000939062

As a result, students from groups 3 and 6 contribute less (and students from group 4 contribute more) to problems with a large number of attempts, which distorts the averaged learning curve towards a larger 'frac_incorrect_atts'.
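One way to see this tendency with the student-level data available here (a sketch; 'max_atts' is the per-student average of the maximum attempt number per problem, as defined above):

# Distribution of 'max_atts' within each cluster: groups reaching higher
# attempt numbers carry more weight in the tail of the averaged learning curve.
stud_data.groupby('cluster_index')['max_atts'].describe()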

Comparison with "benchmark" model (student with "gaming" vs "non-gaming" behaviour):


In [25]:
stud_data['gaming_index'] = stud_data['cluster_index'].replace({1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6:1})

In [26]:
stud_data.groupby('gaming_index').agg(np.mean)


Out[26]:
num_sess num_days num_probs num_atts num_hints frac_corr_atts frac_3s_atts frac_1s_hints time_atts time_hints max_probl_views max_atts learning_parameter difficulty_parameter number of attempts number of incorrect attempts cluster_index frac_incorrect_atts
gaming_index
0 106.475371 6.721068 81.633828 317.918101 209.576855 0.563511 0.139055 0.198182 8155.293081 2237.507499 1.285995 3.843756 0.569575 0.480607 317.562018 138.956677 1.232047 0.435068
1 69.155449 5.546676 57.753393 189.282934 61.378341 0.562639 0.025360 0.018928 7497.856889 1344.003427 1.169468 3.551144 0.627261 0.477355 189.072927 79.730226 4.613023 0.434249

Creating visualisation:


In [27]:
data = pd.read_hdf('data.hdf','test')
data.head()


Out[27]:
Anon Student Id Session Id Duration (sec) Student Response Type Problem Name Problem View Attempt At Step Outcome Day x
0 Stu_001d187b1b375fe98b88696b250177f0 647501 102.0 1 2218 1.0 1.0 2.0 2004-11-10 0
1 Stu_001d187b1b375fe98b88696b250177f0 647501 46.0 0 2218 1.0 2.0 0.0 2004-11-10 1
2 Stu_001d187b1b375fe98b88696b250177f0 647792 70.0 1 3093 1.0 1.0 2.0 2004-11-10 0
3 Stu_001d187b1b375fe98b88696b250177f0 647792 22.0 1 3093 1.0 1.0 2.0 2004-11-10 0
4 Stu_001d187b1b375fe98b88696b250177f0 647792 2.0 1 3093 1.0 2.0 2.0 2004-11-10 0

In [28]:
stud_list = data['Anon Student Id'].unique()
#print(stud_list[:5])
# Map each student id to its cluster label; this assumes the rows of
# cluster_index follow the same order as the unique student ids in 'data'.
stud_dict = {stud: cluster_index.loc[i, 1] for i, stud in enumerate(stud_list)}

In [29]:
stud_list[0]


Out[29]:
'Stu_001d187b1b375fe98b88696b250177f0'

In [30]:
cluster_index.loc[0, 1]


Out[30]:
1

In [31]:
stud_dict['Stu_001d187b1b375fe98b88696b250177f0']


Out[31]:
1

In [32]:
stud_df = pd.DataFrame()
for item in stud_dict:
    #print(item, stud_dict[item])
    stud_df.loc[item, 'cluster_index'] = int(stud_dict[item])
stud_df.head()


Out[32]:
cluster_index
Stu_001d187b1b375fe98b88696b250177f0 1.0
Stu_00b8a37b3ab49bfe7d7a77014d1e4cf8 5.0
Stu_00c6652f296f103913139157c79a856f 6.0
Stu_01080cce4b1a14b3fd81d684421daed4 2.0
Stu_0153c9b08d68e42d9a2bb5f70086df00 1.0
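For reference, an equivalent one-line construction (a sketch; 'stud_df_alt' is a hypothetical name, and this variant produces an integer dtype rather than the float shown above):

stud_df_alt = pd.Series(stud_dict, name='cluster_index').to_frame()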

In [33]:
data_125 = data[(data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 1].index)) | \
               (data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 2].index)) | \
               (data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 5].index))]
data_125.head()


Out[33]:
Anon Student Id Session Id Duration (sec) Student Response Type Problem Name Problem View Attempt At Step Outcome Day x
0 Stu_001d187b1b375fe98b88696b250177f0 647501 102.0 1 2218 1.0 1.0 2.0 2004-11-10 0
1 Stu_001d187b1b375fe98b88696b250177f0 647501 46.0 0 2218 1.0 2.0 0.0 2004-11-10 1
2 Stu_001d187b1b375fe98b88696b250177f0 647792 70.0 1 3093 1.0 1.0 2.0 2004-11-10 0
3 Stu_001d187b1b375fe98b88696b250177f0 647792 22.0 1 3093 1.0 1.0 2.0 2004-11-10 0
4 Stu_001d187b1b375fe98b88696b250177f0 647792 2.0 1 3093 1.0 2.0 2.0 2004-11-10 0

In [34]:
data_2 = data[(data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 2].index))]

In [35]:
s1 = data[data['Outcome'] <= 1].groupby(['x']).agg(len)['Problem Name']

s2 = data[data['Outcome'] == 1].groupby(['x']).agg(len)['Problem Name']

s1[8] = s1.loc[8:].sum()
for i in range(9, int(s1.index.max()+1)):
    try:
        s1.drop(i, inplace=True)
    except ValueError:
        pass

s2[8] = s2.loc[8:].sum()
for i in range(9, int(s2.index.max()+1)):
    try:
        s2.drop(i, inplace=True)
    except ValueError:
        pass

In [36]:
# Split the raw data by cluster and apply the same '8+' binning as in the previous
# cell, producing per-cluster attempt counts (s1_k) and incorrect-attempt counts (s2_k).
def binned_counts(df):
    """Return (all attempts, incorrect attempts) counted per attempt number 'x',
    with attempt numbers 8 and above merged into a single bin at 8."""
    s_all = df[df['Outcome'] <= 1].groupby(['x']).agg(len)['Problem Name']
    s_inc = df[df['Outcome'] == 1].groupby(['x']).agg(len)['Problem Name']
    binned = []
    for s in (s_all, s_inc):
        s = s.copy()
        s[8] = s.loc[8:].sum()           # merge attempt numbers 8 and above into bin 8
        binned.append(s[s.index <= 8])   # keep only attempt numbers 0..8
    return tuple(binned)

data_1 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 1].index)]
data_2 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 2].index)]
data_3 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 3].index)]
data_4 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 4].index)]
data_5 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 5].index)]
data_6 = data[data['Anon Student Id'].isin(stud_df[stud_df['cluster_index'] == 6].index)]

s1_1, s2_1 = binned_counts(data_1)
s1_2, s2_2 = binned_counts(data_2)
s1_3, s2_3 = binned_counts(data_3)
s1_4, s2_4 = binned_counts(data_4)
s1_5, s2_5 = binned_counts(data_5)
s1_6, s2_6 = binned_counts(data_6)

In [37]:
fig, ax1 = plt.subplots()
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 8.3
fig_size[1] = 4.7
plt.rcParams["figure.figsize"] = fig_size
plt.xlim(0.5,8.5)
plt.bar(s1.index, s1, width=0.9)
#plt.bar(s2.index, s2, width=0.9)
#plt.legend(['CORRECT', 'INCORRECT'])

plt.xlabel("Attempt number", size=14)
plt.ylabel("Number of attempts", size=14)
ax1.tick_params(axis ='both', which='major', length=0, labelsize =14, color='black')
ax1.tick_params(axis ='both', which='minor', length=0)
labels = [item.get_text() for item in ax1.get_xticklabels()]
labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8+']
#labels = ['2', '4', '6', '8+']
#print(labels)

ax2 = ax1.twinx()
ax2.plot(s1.index, s2/s1, 'r-o', linewidth=4, label='Average')
ax2.plot(s1_1.index, s2_1/s1_1, 'c-+', label='group 1')
ax2.plot(s1_2.index, s2_2/s1_2, 'b-+', label='group 2')
ax2.plot(s1_3.index, s2_3/s1_3, 'c-.', label='group 3')
ax2.plot(s1_4.index, s2_4/s1_4, 'b-.', label='group 4')
ax2.plot(s1_5.index, s2_5/s1_5, 'c-x', label='group 5')
ax2.plot(s1_6.index, s2_6/s1_6, 'b-x', label='group 6')
ax2.legend()

ax2.set_ylabel('Fraction of incorrect attempts', size=14, color='r')
ax2.tick_params('y', colors='r')
ax2.tick_params(axis ='both', which='minor', length=0)
ax2.tick_params(axis ='both', which='major', length=0, labelsize =14, color='red')

ax1.set_xticklabels(labels)

plt.show()



In [ ]: