In [172]:
%matplotlib inline
from matplotlib import pyplot
from collections import defaultdict
from pandas.io.parsers import read_csv
import numpy as np
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 6.0)
users = read_csv('mxit_app_data/users.csv')
users


Out[172]:
id difficulty joined_date
0 844 0 2013-03-22 22:05:06.558104
1 496 0 2013-03-22 19:13:46.616819
2 442 0 2013-03-22 19:07:07.354484
3 4982 0 2013-03-24 04:38:10.260852
4 389 0 2013-03-22 16:09:26.660893
5 3700 0 2013-03-23 17:45:36.277398
6 504 0 2013-03-22 19:15:03.7253
7 340 0 2013-03-22 14:04:55.166372
8 3991 0 2013-03-23 18:03:28.821379
9 3710 0 2013-03-23 17:46:31.91972
10 1205 0 2013-03-23 08:32:22.656537
11 235 0 2013-03-22 11:13:41.154889
12 572 0 2013-03-22 19:56:50.605375
13 349 0 2013-03-22 15:02:21.492953
14 350 0 2013-03-22 15:04:00.628481
15 18982 0 2013-03-27 10:50:18.940815
16 5065 1 2013-03-24 04:58:32.097637
17 2304 1 2013-03-23 13:47:33.43667
18 4725 1 2013-03-24 00:15:15.234946
19 3724 0 2013-03-23 17:47:27.78501
20 3102 0 2013-03-23 17:11:29.048225
21 4335 0 2013-03-23 20:44:46.152268
22 3730 0 2013-03-23 17:48:09.400463
23 5067 0 2013-03-24 04:58:36.718086
24 3589 0 2013-03-23 17:36:57.885904
25 5921 0 2013-03-24 08:26:33.97443
26 3702 1 2013-03-23 17:45:49.396004
27 171 1 2013-03-19 08:47:31.653852
28 421 0 2013-03-22 19:01:59.989886
29 4277 0 2013-03-23 20:02:41.21152
30 13119 0 2013-03-26 16:16:47.543859
31 196332 0 2013-04-01 10:38:22.618019
32 4453 1 2013-03-23 22:02:59.485494
33 20667 0 2013-03-27 14:38:30.095444
34 7077 1 2013-03-25 05:14:02.785449
35 3052 0 2013-03-23 17:05:39.152712
36 537 0 2013-03-22 19:41:44.836526
37 3166 1 2013-03-23 17:14:39.212579
38 539 0 2013-03-22 19:44:31.584511
39 487 0 2013-03-22 19:12:19.502427
40 4422 0 2013-03-23 21:38:04.510255
41 15177 0 2013-03-26 20:54:17.370454
42 173629 0 2013-03-31 12:31:25.42536
43 10036 0 2013-03-26 07:15:09.092938
44 5244 0 2013-03-24 05:11:01.539898
45 10599 0 2013-03-26 08:56:12.111275
46 4236 0 2013-03-23 19:39:03.663182
47 194299 0 2013-04-01 08:04:58.029678
48 551 0 2013-03-22 19:52:23.094559
49 12013 0 2013-03-26 11:43:47.317621
50 4114 0 2013-03-23 18:12:18.392465
51 1 2 2013-03-17 17:56:24.200194
52 1012 0 2013-03-22 22:32:05.623714
53 7457 0 2013-03-25 09:09:01.530775
54 1131 0 2013-03-23 04:26:16.64766
55 619 2 2013-03-22 20:24:24.4887
56 6226 0 2013-03-24 14:41:51.661997
57 10444 0 2013-03-26 08:20:37.034959
58 9968 0 2013-03-26 06:47:35.000955
59 3918 0 2013-03-23 17:58:34.416884
... ... ...

109657 rows × 3 columns


In [13]:
quiz = read_csv('mxit_app_data/quiz.csv')
quiz


Out[13]:
id type start_time user_id num_correct num_questions score end_time actual_type difficulty
0 1 ADDSUB 2013-03-17 20:40:49.464713 1 10 10 113 2013-03-17 20:41:34.611539 NaN NaN
1 89 ADDSUB 2013-03-22 20:54:49.607938 619 16 16 221 2013-03-22 20:55:52.882108 NaN NaN
2 2 ADDSUB 2013-03-17 20:41:51.66946 1 14 14 185 2013-03-17 20:42:37.660208 NaN NaN
3 3 ADDSUB 2013-03-17 20:43:08.362717 1 16 17 172 2013-03-17 20:43:54.15152 NaN NaN
4 4 ADDSUB 2013-03-17 22:09:53.287065 57 NaN NaN NaN NaN NaN NaN
5 91 ADDSUB 2013-03-22 21:10:07.617334 619 NaN NaN NaN NaN NaN NaN
6 5 ADDSUB 2013-03-17 22:10:27.866362 57 4 5 23 2013-03-17 22:11:13.615152 NaN NaN
7 90 ADDSUB 2013-03-22 21:10:01.620397 1 11 14 87 2013-03-22 21:11:06.305229 NaN NaN
8 6 ADDSUB 2013-03-17 22:11:26.236149 57 5 6 28 2013-03-17 22:12:15.820946 NaN NaN
9 7 ADDSUB 2013-03-17 22:12:26.158765 57 5 6 28 2013-03-17 22:13:14.283572 NaN NaN
10 92 ADDSUB 2013-03-22 22:08:45.530633 844 6 6 51 2013-03-22 22:09:51.74258 NaN NaN
11 8 ADDSUB 2013-03-17 22:23:27.831931 57 5 5 38 2013-03-17 22:24:21.845104 NaN NaN
12 9 ADDSUB 2013-03-17 22:24:41.972419 57 3 4 12 2013-03-17 22:25:28.02375 NaN NaN
13 93 ADDSUB 2013-03-22 22:11:22.171433 854 9 9 96 2013-03-22 22:12:22.244127 NaN NaN
14 10 ADDSUB 2013-03-18 12:54:08.021775 110 4 5 23 2013-03-18 12:54:56.365366 NaN NaN
15 11 ADDSUB 2013-03-18 13:07:48.2762 110 10 10 113 2013-03-18 13:08:37.299659 NaN NaN
16 94 MULTDIV 2013-03-22 22:12:45.381974 854 11 13 89 2013-03-22 22:13:49.901741 NaN NaN
17 12 ADDSUB 2013-03-18 13:10:13.059232 110 1 1 5 2013-03-18 13:22:20.719731 NaN NaN
18 13 ADDSUB 2013-03-18 14:03:38.70135 110 NaN NaN NaN NaN NaN NaN
19 14 ADDSUB 2013-03-18 14:03:58.3142 110 2 2 10 2013-03-18 14:04:48.295541 NaN NaN
20 95 ADDSUB 2013-03-22 22:14:16.787428 854 4 6 14 2013-03-22 22:15:17.152252 NaN NaN
21 15 ADDSUB 2013-03-18 14:05:09.590011 110 8 8 80 2013-03-18 14:05:58.142921 NaN NaN
22 16 ADDSUB 2013-03-19 08:45:59.029557 1 NaN NaN NaN NaN NaN NaN
23 17 ADDSUB 2013-03-19 08:47:44.908187 171 11 12 115 2013-03-19 08:48:30.686596 NaN NaN
24 96 ADDSUB 2013-03-22 22:15:46.49154 854 9 10 93 2013-03-22 22:16:49.172615 NaN NaN
25 18 ADDSUB 2013-03-19 08:49:27.128623 171 16 17 166 2013-03-19 08:50:12.480035 NaN NaN
26 19 ADDSUB 2013-03-22 07:46:46.422598 171 NaN NaN NaN NaN NaN NaN
27 20 MULTDIV 2013-03-22 11:13:54.159148 235 2 2 10 2013-03-22 11:14:46.732206 NaN NaN
28 97 ADDSUB 2013-03-22 22:17:32.989813 854 6 8 49 2013-03-22 22:18:35.080507 NaN NaN
29 21 MULTDIV 2013-03-22 11:16:13.157169 235 4 4 26 2013-03-22 11:16:58.408537 NaN NaN
30 22 MULTDIV 2013-03-22 11:17:44.389222 235 5 5 38 2013-03-22 11:18:46.025286 NaN NaN
31 98 ADDSUB 2013-03-22 22:18:57.095654 854 5 9 23 2013-03-22 22:20:01.715979 NaN NaN
32 23 MULTDIV 2013-03-22 11:19:10.11531 235 4 4 26 2013-03-22 11:19:55.481373 NaN NaN
33 24 MULTDIV 2013-03-22 11:21:19.034572 235 4 5 17 2013-03-22 11:22:05.705905 NaN NaN
34 99 ADDSUB 2013-03-22 22:20:31.371153 854 7 8 52 2013-03-22 22:21:33.004724 NaN NaN
35 25 MULTDIV 2013-03-22 11:22:56.805365 235 3 3 15 2013-03-22 11:23:47.917552 NaN NaN
36 26 ADDSUB 2013-03-22 11:24:07.058136 235 5 5 38 2013-03-22 11:24:55.167197 NaN NaN
37 100 MULTDIV 2013-03-22 22:21:49.389028 854 8 10 63 2013-03-22 22:22:49.420179 NaN NaN
38 27 MULTDIV 2013-03-22 11:26:36.602767 235 0 0 NaN 2013-03-22 11:27:22.520249 NaN NaN
39 28 MULTDIV 2013-03-22 11:27:40.525866 235 4 5 17 2013-03-22 11:28:25.700874 NaN NaN
40 29 MULTDIV 2013-03-22 14:05:40.534346 340 NaN NaN NaN NaN NaN NaN
41 101 ADDSUB 2013-03-22 22:23:06.106945 854 6 10 36 2013-03-22 22:24:07.980081 NaN NaN
42 30 ADDSUB 2013-03-22 14:05:59.11865 340 3 3 15 2013-03-22 14:06:48.085766 NaN NaN
43 31 MULTDIV 2013-03-22 15:32:45.340653 350 NaN NaN NaN NaN NaN NaN
44 32 ADDSUB 2013-03-22 15:34:20.363189 350 NaN NaN NaN NaN NaN NaN
45 33 ADDSUB 2013-03-22 15:41:36.596025 350 3 3 15 2013-03-22 15:42:24.173528 NaN NaN
46 34 ADDSUB 2013-03-22 15:42:42.096865 350 NaN NaN NaN NaN NaN NaN
47 35 MULTDIV 2013-03-22 15:44:13.459856 350 NaN NaN NaN NaN NaN NaN
48 102 MULTDIV 2013-03-22 22:25:07.626963 854 7 10 61 2013-03-22 22:26:10.092549 NaN NaN
49 36 MULTDIV 2013-03-22 15:44:35.097614 350 4 4 26 2013-03-22 15:45:20.913882 NaN NaN
50 37 ADDSUB 2013-03-22 15:46:00.074565 350 NaN NaN NaN NaN NaN NaN
51 103 ADDSUB 2013-03-22 22:26:20.708535 854 NaN NaN NaN NaN NaN NaN
52 38 ADDSUB 2013-03-22 16:09:40.43819 389 4 4 26 2013-03-22 16:10:33.853386 NaN NaN
53 39 MULTDIV 2013-03-22 16:10:47.882595 389 NaN NaN NaN NaN NaN NaN
54 40 ADDSUB 2013-03-22 16:11:40.896385 389 5 5 38 2013-03-22 16:12:26.437844 NaN NaN
55 41 MULTDIV 2013-03-22 16:13:09.794813 389 NaN NaN NaN NaN NaN NaN
56 104 MULTDIV 2013-03-22 22:32:23.331686 1012 2 4 4 2013-03-22 22:33:29.79083 NaN NaN
57 42 ADDSUB 2013-03-22 19:00:33.783355 417 1 1 5 2013-03-22 19:01:27.005604 NaN NaN
58 43 MULTDIV 2013-03-22 19:02:54.860664 421 NaN NaN NaN NaN NaN NaN
59 44 ADDSUB 2013-03-22 19:04:59.208456 421 NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ...

539518 rows × 10 columns

Scatter plot of scores over time


In [14]:
user_count = defaultdict(int)
for q in quiz.iterrows():
    if q[1].get('end_time'):
        user_count[q[1].get('user_id')]+=1
    #user_scores[q.get('user_id')].append(q.get('score'))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-e1ed063b8c1b> in <module>()
      5     #user_scores[q.get('user_id')].append(q.get('score'))
      6 
----> 7 print sort(user_count.itervalues())[:20]

NameError: name 'sort' is not defined

In [136]:
counts = list((a[1], a[0]) for a in user_count.iteritems() )
counts.sort(reverse=True, key=lambda x: x[0])
print counts[:20]


[(645, 248399), (538, 212046), (537, 88901), (523, 121044), (470, 214050), (439, 261554), (372, 854), (372, 199513), (351, 216041), (347, 213612), (327, 258010), (310, 59195), (308, 244635), (305, 206742), (299, 210738), (297, 1), (287, 207980), (285, 259404), (280, 275932), (277, 233261)]

In [72]:
scores_data = read_csv('mxit_app_data/scores.csv')

all_user_scores = defaultdict(list)
for s in scores_data.iterrows():
    all_user_scores[s[1].user_id].append((s[1].score, s[1].quiz_type, s[1].difficulty))

def user_scores(user_id):
    user_scores = []
    for s in scores_data.iterrows():
        if s[1].user_id == user_id:
            user_scores.append((s[1].score, s[1].quiz_type, s[1].difficulty))
    return user_scores

In [186]:
import math

for figure, i in enumerate((c[1] for c in counts[:200])):
    
    for d in range(3):
        ys = [s[0] for s in all_user_scores[i] if not math.isnan( s[0]) and s[1] == 'all' and s[2] == d]
        
        r = arange(len(ys))
        y = array(ys)
        
        if len(ys)<20:
            continue
            
        slope, intercept, r_value, p_value, std_err = stats.linregress(r,y)

        if r_value > 0.2: #r_value < 0.6:
            continue
            
        var = stats.nanstd(y)
            
        print r_value
        
        fig = pyplot.figure(figure*2+d-1)
        ax = fig.add_axes([0,0,1,1])
        pyplot.text(0.65, 0.15, 'r=%.3f' % r_value, transform=ax.transAxes, fontsize=16)
        pyplot.text(0.65, 0.1, 'user id: %d' % i, transform=ax.transAxes, fontsize=16)
        pyplot.text(0.65, 0.2, 'std deviation: %.2f' % var, transform=ax.transAxes, fontsize=16)
        pyplot.title('Score over time')
        pyplot.ylabel('Score')
        pyplot.xlabel('Game index')
        pyplot.plot(r, slope*r+intercept, color='r', linewidth=2.0)
        pyplot.scatter(range(len(ys)), ys)
        pyplot.savefig('%d-%d.png' % (i, d), bbox_inches='tight')
        #pyplot.show()


0.0953681456427
-0.313633168712
0.0523604722914
0.121468572582
0.00585013699151
-0.104127247061
-0.0223360728748
0.18920595467
0.0673937701994
-0.00247952130082
-0.115385229737
0.190924235937
-0.120428075771
0.0384286286173
0.175365203222
-0.111689819043
0.0369369079574
0.084666957355
0.182638417102
0.189844656261
0.111448805449
-0.243747280662
0.161180864248
-0.307321847882
0.0692284079983
-0.0753664391109
0.179489768763
0.122076217027
-0.168367127724
-0.0699940834709
0.136744513497
-0.165239704928
0.167700887468
-0.282029024961
0.0399361947975
-0.213239793962
0.0797768162953
0.0609354761054
0.01700946658
-0.0206412420699
-0.113640388684
0.0259401172529
0.175136975576
0.0269938011518
-0.0782314911425
0.160636509606
0.150730214326
0.191217701837
-0.463324965538
-0.315204830013
0.101886500319
-0.146661406366

In [101]:
from numpy import arange,array,ones
from pylab import plot,show
from scipy import stats

xi = arange(len(ys)) 
y = array(ys)

slope, intercept, r_value, p_value, std_err = stats.linregress(xi,y)

print slope, intercept, r_value, p_value, std_err


0.460085402627 59.1945237083 0.676490123883 1.44767487785e-25 0.0374364578062

In [ ]: