notebook.community

Edit and run



In [1]:

    
import sys
import pandas as pd
import numpy as np
import difflib
import gzip
from scipy import stats



In [4]:

    
filename = sys.argv[1]
filename = "searches.json"
searches = pd.read_json(filename,orient='records', lines=True)
even_samples = searches[searches['uid'] % 2 == 0]
odd_samples = searches[searches['uid'] % 2 != 0]
odd_samples









    Out[4]:






  
    
      
      is_instructor
      login_count
      search_count
      uid
    
  
  
    
      0
      True
      1
      2
      6061521
    
    
      1
      False
      4
      0
      11986457
    
    
      2
      False
      1
      0
      15995765
    
    
      4
      False
      1
      0
      9882383
    
    
      6
      False
      2
      0
      3583107
    
    
      7
      True
      1
      0
      11760157
    
    
      8
      False
      3
      0
      13150423
    
    
      9
      False
      4
      2
      8004609
    
    
      10
      False
      1
      3
      13536345
    
    
      12
      False
      6
      0
      9613243
    
    
      13
      True
      14
      0
      12986377
    
    
      14
      True
      1
      0
      9792541
    
    
      16
      False
      7
      0
      13320527
    
    
      18
      False
      1
      0
      12260537
    
    
      19
      True
      7
      0
      14915463
    
    
      20
      True
      7
      1
      8228927
    
    
      22
      False
      1
      0
      5936973
    
    
      23
      True
      4
      0
      8373533
    
    
      25
      False
      4
      2
      9185533
    
    
      28
      False
      6
      0
      14133079
    
    
      29
      False
      1
      0
      4716739
    
    
      30
      True
      4
      4
      12339949
    
    
      32
      True
      2
      0
      14485631
    
    
      36
      True
      6
      0
      11858437
    
    
      40
      False
      8
      0
      15701977
    
    
      42
      False
      1
      0
      13578367
    
    
      43
      False
      1
      3
      8473959
    
    
      44
      True
      6
      0
      4952803
    
    
      45
      False
      2
      0
      10476521
    
    
      46
      False
      3
      0
      6625293
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      615
      False
      2
      1
      12709365
    
    
      616
      False
      1
      0
      5543603
    
    
      617
      True
      5
      0
      351701
    
    
      619
      True
      2
      1
      5454807
    
    
      620
      True
      4
      0
      6312797
    
    
      621
      True
      2
      0
      9036705
    
    
      626
      False
      6
      6
      4068379
    
    
      631
      False
      2
      1
      16164763
    
    
      633
      True
      3
      0
      8026425
    
    
      636
      False
      1
      0
      14240595
    
    
      637
      True
      5
      0
      3013413
    
    
      638
      False
      2
      0
      1668679
    
    
      640
      True
      1
      0
      16619457
    
    
      648
      False
      3
      4
      9220815
    
    
      652
      False
      1
      0
      5864293
    
    
      653
      False
      1
      1
      9164081
    
    
      654
      False
      2
      3
      14780717
    
    
      659
      False
      2
      2
      5696405
    
    
      663
      False
      4
      0
      5958561
    
    
      666
      False
      2
      0
      1475209
    
    
      668
      True
      3
      0
      8698521
    
    
      669
      False
      1
      0
      10141047
    
    
      670
      False
      1
      2
      1572895
    
    
      671
      True
      2
      0
      6954951
    
    
      672
      False
      1
      0
      12231197
    
    
      673
      False
      4
      0
      2849545
    
    
      674
      False
      2
      0
      6810415
    
    
      677
      True
      1
      0
      7643715
    
    
      678
      False
      1
      0
      14838641
    
    
      679
      False
      2
      0
      6454817
    
  

348 rows × 4 columns



In [ ]:

    
p_more_users = stats.mannwhitneyu(even_samples['search_count'],odd_samples['search_count']).pvalue
p_more_users



In [ ]:

    
even_searched = even_samples[even_samples['search_count'] != 0].shape[0]
even_unsearched = even_samples[even_samples['search_count'] == 0].shape[0]
odd_searched = odd_samples[odd_samples['search_count'] != 0].shape[0]
odd_unsearched = odd_samples[odd_samples['search_count'] == 0].shape[0]

contingency = [[even_searched, even_unsearched], [odd_searched, odd_unsearched]]
chi2, p_more_searches, dof, expected = stats.chi2_contingency(contingency)

p_more_searches



In [ ]:

    
inst_samples = searches[searches['is_instructor']]
inst_even_samples = inst_samples[inst_samples['uid'] % 2 == 0]
inst_odd_samples = inst_samples[inst_samples['uid'] % 2 != 0]
inst_odd_samples



In [ ]:

    
p_more_instr = stats.mannwhitneyu(inst_even_samples['search_count'],inst_odd_samples['search_count']).pvalue
p_more_instr



In [ ]:

    
inst_even_searched = inst_even_samples[inst_even_samples['search_count'] != 0].shape[0]
inst_even_unsearched = inst_even_samples[inst_even_samples['search_count'] == 0].shape[0]
inst_odd_searched = inst_odd_samples[inst_odd_samples['search_count'] != 0].shape[0]
inst_odd_unsearched = inst_odd_samples[inst_odd_samples['search_count'] == 0].shape[0]

inst_contingency = [[inst_even_searched, inst_even_unsearched], [inst_odd_searched, inst_odd_unsearched]]
inst_chi2, p_more_instr_searches, inst_dof, inst_expected = stats.chi2_contingency(inst_contingency)



In [3]:

    
def main():
    
    OUTPUT_TEMPLATE = (
        '"Did more/less users use the search feature?" p-value: {more_users_p:.3g}\n'
        '"Did users search more/less?" p-value: {more_searches_p:.3g}\n'
        '"Did more/less instructors use the search feature?" p-value: {more_instr_p:.3g}\n'
        '"Did instructors search more/less?" p-value: {more_instr_searches_p:.3g}'
    )
#     searchdata_file = sys.argv[1]

    # ...
    
    filename = sys.argv[1]
#     filename = "searches.json"
    searches = pd.read_json(filename,orient='records', lines=True)
    even_samples = searches[searches['uid'] % 2 == 0]
    odd_samples = searches[searches['uid'] % 2 != 0]
    
    even_searched = even_samples[even_samples['search_count'] != 0].shape[0]
    even_unsearched = even_samples[even_samples['search_count'] == 0].shape[0]
    odd_searched = odd_samples[odd_samples['search_count'] != 0].shape[0]
    odd_unsearched = odd_samples[odd_samples['search_count'] == 0].shape[0]

    p_more_searches = stats.mannwhitneyu(even_samples['search_count'],odd_samples['search_count']).pvalue

    contingency = [[even_searched, even_unsearched], [odd_searched, odd_unsearched]]
    chi2, p_more_users, dof, expected = stats.chi2_contingency(contingency)
    
    inst_samples = searches[searches['is_instructor']]
    inst_even_samples = inst_samples[inst_samples['uid'] % 2 == 0]
    inst_odd_samples = inst_samples[inst_samples['uid'] % 2 != 0]
    
    p_more_instr_searches = stats.mannwhitneyu(inst_even_samples['search_count'],inst_odd_samples['search_count']).pvalue
    

    inst_even_searched = inst_even_samples[inst_even_samples['search_count'] != 0].shape[0]
    inst_even_unsearched = inst_even_samples[inst_even_samples['search_count'] == 0].shape[0]
    inst_odd_searched = inst_odd_samples[inst_odd_samples['search_count'] != 0].shape[0]
    inst_odd_unsearched = inst_odd_samples[inst_odd_samples['search_count'] == 0].shape[0]

    inst_contingency = [[inst_even_searched, inst_even_unsearched], [inst_odd_searched, inst_odd_unsearched]]
    inst_chi2, p_more_instr, inst_dof, inst_expected = stats.chi2_contingency(inst_contingency)
    
    # Output
    print(OUTPUT_TEMPLATE.format(
        more_users_p=p_more_users,
        more_searches_p=p_more_searches,
        more_instr_p=p_more_instr,
        more_instr_searches_p=p_more_instr_searches,
    ))
    
if __name__ == '__main__':
    main()









    



"Did more/less users use the search feature?" p-value: 0.168
"Did users search more/less?" p-value: 0.0706
"Did more/less instructors use the search feature?" p-value: 0.052
"Did instructors search more/less?" p-value: 0.0225

	is_instructor	login_count	search_count	uid
0	True	1	2	6061521
1	False	4	0	11986457
2	False	1	0	15995765
4	False	1	0	9882383
6	False	2	0	3583107
7	True	1	0	11760157
8	False	3	0	13150423
9	False	4	2	8004609
10	False	1	3	13536345
12	False	6	0	9613243
13	True	14	0	12986377
14	True	1	0	9792541
16	False	7	0	13320527
18	False	1	0	12260537
19	True	7	0	14915463
20	True	7	1	8228927
22	False	1	0	5936973
23	True	4	0	8373533
25	False	4	2	9185533
28	False	6	0	14133079
29	False	1	0	4716739
30	True	4	4	12339949
32	True	2	0	14485631
36	True	6	0	11858437
40	False	8	0	15701977
42	False	1	0	13578367
43	False	1	3	8473959
44	True	6	0	4952803
45	False	2	0	10476521
46	False	3	0	6625293
...	...	...	...	...
615	False	2	1	12709365
616	False	1	0	5543603
617	True	5	0	351701
619	True	2	1	5454807
620	True	4	0	6312797
621	True	2	0	9036705
626	False	6	6	4068379
631	False	2	1	16164763
633	True	3	0	8026425
636	False	1	0	14240595
637	True	5	0	3013413
638	False	2	0	1668679
640	True	1	0	16619457
648	False	3	4	9220815
652	False	1	0	5864293
653	False	1	1	9164081
654	False	2	3	14780717
659	False	2	2	5696405
663	False	4	0	5958561
666	False	2	0	1475209
668	True	3	0	8698521
669	False	1	0	10141047
670	False	1	2	1572895
671	True	2	0	6954951
672	False	1	0	12231197
673	False	4	0	2849545
674	False	2	0	6810415
677	True	1	0	7643715
678	False	1	0	14838641
679	False	2	0	6454817