In [1]:
import sys
import pandas as pd
import numpy as np
import difflib
import gzip
from scipy import stats

In [4]:
filename = sys.argv[1]
filename = "searches.json"
searches = pd.read_json(filename,orient='records', lines=True)
even_samples = searches[searches['uid'] % 2 == 0]
odd_samples = searches[searches['uid'] % 2 != 0]
odd_samples


Out[4]:
is_instructor login_count search_count uid
0 True 1 2 6061521
1 False 4 0 11986457
2 False 1 0 15995765
4 False 1 0 9882383
6 False 2 0 3583107
7 True 1 0 11760157
8 False 3 0 13150423
9 False 4 2 8004609
10 False 1 3 13536345
12 False 6 0 9613243
13 True 14 0 12986377
14 True 1 0 9792541
16 False 7 0 13320527
18 False 1 0 12260537
19 True 7 0 14915463
20 True 7 1 8228927
22 False 1 0 5936973
23 True 4 0 8373533
25 False 4 2 9185533
28 False 6 0 14133079
29 False 1 0 4716739
30 True 4 4 12339949
32 True 2 0 14485631
36 True 6 0 11858437
40 False 8 0 15701977
42 False 1 0 13578367
43 False 1 3 8473959
44 True 6 0 4952803
45 False 2 0 10476521
46 False 3 0 6625293
... ... ... ... ...
615 False 2 1 12709365
616 False 1 0 5543603
617 True 5 0 351701
619 True 2 1 5454807
620 True 4 0 6312797
621 True 2 0 9036705
626 False 6 6 4068379
631 False 2 1 16164763
633 True 3 0 8026425
636 False 1 0 14240595
637 True 5 0 3013413
638 False 2 0 1668679
640 True 1 0 16619457
648 False 3 4 9220815
652 False 1 0 5864293
653 False 1 1 9164081
654 False 2 3 14780717
659 False 2 2 5696405
663 False 4 0 5958561
666 False 2 0 1475209
668 True 3 0 8698521
669 False 1 0 10141047
670 False 1 2 1572895
671 True 2 0 6954951
672 False 1 0 12231197
673 False 4 0 2849545
674 False 2 0 6810415
677 True 1 0 7643715
678 False 1 0 14838641
679 False 2 0 6454817

348 rows × 4 columns


In [ ]:
p_more_users = stats.mannwhitneyu(even_samples['search_count'],odd_samples['search_count']).pvalue
p_more_users

In [ ]:
even_searched = even_samples[even_samples['search_count'] != 0].shape[0]
even_unsearched = even_samples[even_samples['search_count'] == 0].shape[0]
odd_searched = odd_samples[odd_samples['search_count'] != 0].shape[0]
odd_unsearched = odd_samples[odd_samples['search_count'] == 0].shape[0]

contingency = [[even_searched, even_unsearched], [odd_searched, odd_unsearched]]
chi2, p_more_searches, dof, expected = stats.chi2_contingency(contingency)

p_more_searches

In [ ]:
inst_samples = searches[searches['is_instructor']]
inst_even_samples = inst_samples[inst_samples['uid'] % 2 == 0]
inst_odd_samples = inst_samples[inst_samples['uid'] % 2 != 0]
inst_odd_samples

In [ ]:
p_more_instr = stats.mannwhitneyu(inst_even_samples['search_count'],inst_odd_samples['search_count']).pvalue
p_more_instr

In [ ]:
inst_even_searched = inst_even_samples[inst_even_samples['search_count'] != 0].shape[0]
inst_even_unsearched = inst_even_samples[inst_even_samples['search_count'] == 0].shape[0]
inst_odd_searched = inst_odd_samples[inst_odd_samples['search_count'] != 0].shape[0]
inst_odd_unsearched = inst_odd_samples[inst_odd_samples['search_count'] == 0].shape[0]

inst_contingency = [[inst_even_searched, inst_even_unsearched], [inst_odd_searched, inst_odd_unsearched]]
inst_chi2, p_more_instr_searches, inst_dof, inst_expected = stats.chi2_contingency(inst_contingency)

In [3]:
def main():
    
    OUTPUT_TEMPLATE = (
        '"Did more/less users use the search feature?" p-value: {more_users_p:.3g}\n'
        '"Did users search more/less?" p-value: {more_searches_p:.3g}\n'
        '"Did more/less instructors use the search feature?" p-value: {more_instr_p:.3g}\n'
        '"Did instructors search more/less?" p-value: {more_instr_searches_p:.3g}'
    )
#     searchdata_file = sys.argv[1]

    # ...
    
    filename = sys.argv[1]
#     filename = "searches.json"
    searches = pd.read_json(filename,orient='records', lines=True)
    even_samples = searches[searches['uid'] % 2 == 0]
    odd_samples = searches[searches['uid'] % 2 != 0]
    
    even_searched = even_samples[even_samples['search_count'] != 0].shape[0]
    even_unsearched = even_samples[even_samples['search_count'] == 0].shape[0]
    odd_searched = odd_samples[odd_samples['search_count'] != 0].shape[0]
    odd_unsearched = odd_samples[odd_samples['search_count'] == 0].shape[0]

    p_more_searches = stats.mannwhitneyu(even_samples['search_count'],odd_samples['search_count']).pvalue

    contingency = [[even_searched, even_unsearched], [odd_searched, odd_unsearched]]
    chi2, p_more_users, dof, expected = stats.chi2_contingency(contingency)
    
    inst_samples = searches[searches['is_instructor']]
    inst_even_samples = inst_samples[inst_samples['uid'] % 2 == 0]
    inst_odd_samples = inst_samples[inst_samples['uid'] % 2 != 0]
    
    p_more_instr_searches = stats.mannwhitneyu(inst_even_samples['search_count'],inst_odd_samples['search_count']).pvalue
    

    inst_even_searched = inst_even_samples[inst_even_samples['search_count'] != 0].shape[0]
    inst_even_unsearched = inst_even_samples[inst_even_samples['search_count'] == 0].shape[0]
    inst_odd_searched = inst_odd_samples[inst_odd_samples['search_count'] != 0].shape[0]
    inst_odd_unsearched = inst_odd_samples[inst_odd_samples['search_count'] == 0].shape[0]

    inst_contingency = [[inst_even_searched, inst_even_unsearched], [inst_odd_searched, inst_odd_unsearched]]
    inst_chi2, p_more_instr, inst_dof, inst_expected = stats.chi2_contingency(inst_contingency)
    
    # Output
    print(OUTPUT_TEMPLATE.format(
        more_users_p=p_more_users,
        more_searches_p=p_more_searches,
        more_instr_p=p_more_instr,
        more_instr_searches_p=p_more_instr_searches,
    ))
    
if __name__ == '__main__':
    main()


"Did more/less users use the search feature?" p-value: 0.168
"Did users search more/less?" p-value: 0.0706
"Did more/less instructors use the search feature?" p-value: 0.052
"Did instructors search more/less?" p-value: 0.0225