notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3 as sql
import scipy.stats as stats

# Per-student engagement metrics; the CSV is expected next to the notebook.
DataPath = "data_rate_data_avg_watched.csv"
df = pd.read_csv(DataPath)



In [3]:

    
# Preview the first five rows to check the column layout.
df.head(5)









    Out[3]:






  
    
      
      course
      student
      course_complete
      clickrate
      avgpctvideowatched
      rw
      ff
    
  
  
    
      0
      genomics
      0000d488dca916ef0d5efdac10e052b2
      0
      0.033203
      0.054831
      0.000000
      0.000000
    
    
      1
      genomics
      00057f037dc72ff77d5ca29de69d0671
      0
      0.019955
      0.106250
      0.000000
      2.171084
    
    
      2
      genomics
      0015b68bd020c6caf6ab4cc0159ac2f0
      1
      0.015138
      0.240743
      0.000000
      0.012065
    
    
      3
      genomics
      00296e85f055b03c0882626c80a24306
      0
      0.234637
      0.000000
      0.000000
      0.000000
    
    
      4
      genomics
      0044739ebeb7116d34cc8796ea784c5a
      0
      0.010198
      0.126693
      1.822567
      0.000000



In [4]:

    
# Summary statistics for the numeric columns (quartiles spelled out; these are
# the pandas defaults). Compare each column's max against its quartiles to
# spot extreme values before running mean-based tests.
df.describe(percentiles=[0.25, 0.5, 0.75])









    Out[4]:






  
    
      
      course_complete
      clickrate
      avgpctvideowatched
      rw
      ff
    
  
  
    
      count
      19668.000000
      19668.000000
      19668.000000
      19668.000000
      19668.000000
    
    
      mean
      0.121619
      0.180099
      0.142266
      1.342241
      0.607770
    
    
      std
      0.326853
      0.232294
      0.133769
      40.886485
      3.557908
    
    
      min
      0.000000
      0.002157
      -0.000033
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.028020
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      0.085932
      0.122945
      0.000000
      0.000000
    
    
      75%
      0.000000
      0.213239
      0.248148
      0.000000
      0.000000
    
    
      max
      1.000000
      1.200000
      0.861459
      5525.100419
      156.368678



In [5]:

    
# Split the students into two groups on the course_complete flag.
# Each group is selected by its own equality test (1 vs 0) rather than by
# negating a single mask, so rows with any other value land in neither group.
Completed = df.loc[df['course_complete'].eq(1)]
NotComplete = df.loc[df['course_complete'].eq(0)]



In [6]:

    
# Preview the first five rows of the completed group.
Completed.head(5)









    Out[6]:






  
    
      
      course
      student
      course_complete
      clickrate
      avgpctvideowatched
      rw
      ff
    
  
  
    
      2
      genomics
      0015b68bd020c6caf6ab4cc0159ac2f0
      1
      0.015138
      0.240743
      0.000000
      0.012065
    
    
      9
      genomics
      006d949a7713901bc778028e605bacc5
      1
      0.013639
      0.218629
      0.000000
      2.991179
    
    
      13
      genomics
      00959c868c027ce70d18304e1f7b4d84
      1
      0.011750
      0.016324
      10.275888
      0.000000
    
    
      25
      genomics
      00ec9281233d1bcfa97f2129d010d895
      1
      0.016534
      0.209335
      0.000000
      2.360829
    
    
      26
      genomics
      00fca1dc037815e36b50459eef8c498d
      1
      0.009738
      0.116762
      101.751894
      0.000000



In [7]:

    
# Number of students in the completed group (row count).
Completed.shape[0]









    Out[7]:





2392



In [8]:

    
# Number of students in the not-completed group (row count).
NotComplete.shape[0]









    Out[8]:





17276



In [15]:

    
# Compare the "ff" column between students who completed the course and
# those who did not, using two-sample t-tests on the group means.
TestVariable = "ff"

# Compute each group mean once and reuse it for the difference and the report.
CompletedMean = np.mean(Completed[TestVariable])
NotCompleteMean = np.mean(NotComplete[TestVariable])
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1: Student's t-test (scipy's default, assumes equal population variances).
# P2: Welch's t-test (equal_var=False), which drops that assumption.
# ttest_ind accepts array-likes, so the columns are passed without list copies.
P1 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable])
P2 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable], equal_var=False)

# Single-argument print(...) with %-formatting behaves the same on Python 2
# and Python 3. P1/P2 are wrapped in a 1-tuple so the whole result is
# formatted as one value instead of being unpacked by the % operator.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))









    



Population mean:  0.607769651553
Mean rate of Complete:  1.64495621337
Mean rate of NotComplete:  0.464162899072
Differnce in means (Completed - NotComplete):  1.1807933143
Equal population variance (t-statistic, two-tailed p-value):  (15.302434108547361, 1.4793844467299373e-52)
Unequal population variance (t-statistic, two-tailed p-value):  (10.916686214685534, 3.5823761707804019e-27)



In [14]:

    
# Compare the "rw" column between students who completed the course and
# those who did not, using two-sample t-tests on the group means.
TestVariable = "rw"

# Compute each group mean once and reuse it for the difference and the report.
CompletedMean = np.mean(Completed[TestVariable])
NotCompleteMean = np.mean(NotComplete[TestVariable])
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1: Student's t-test (scipy's default, assumes equal population variances).
# P2: Welch's t-test (equal_var=False), which drops that assumption.
# ttest_ind accepts array-likes, so the columns are passed without list copies.
P1 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable])
P2 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable], equal_var=False)

# Single-argument print(...) with %-formatting behaves the same on Python 2
# and Python 3. P1/P2 are wrapped in a 1-tuple so the whole result is
# formatted as one value instead of being unpacked by the % operator.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))









    



Population mean:  1.34224091266
Mean rate of Complete:  2.38266652718
Mean rate of NotComplete:  1.19818568749
Differnce in means (Completed - NotComplete):  1.18448083969
Equal population variance (t-statistic, two-tailed p-value):  (1.3279410099045779, 0.18421300752955727)
Unequal population variance (t-statistic, two-tailed p-value):  (2.3238826024905692, 0.020162513118256047)



In [13]:

    
# Compare the "clickrate" column between students who completed the course
# and those who did not, using two-sample t-tests on the group means.
TestVariable = "clickrate"

# Compute each group mean once and reuse it for the difference and the report.
CompletedMean = np.mean(Completed[TestVariable])
NotCompleteMean = np.mean(NotComplete[TestVariable])
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1: Student's t-test (scipy's default, assumes equal population variances).
# P2: Welch's t-test (equal_var=False), which drops that assumption.
# ttest_ind accepts array-likes, so the columns are passed without list copies.
P1 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable])
P2 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable], equal_var=False)

# Single-argument print(...) with %-formatting behaves the same on Python 2
# and Python 3. P1/P2 are wrapped in a 1-tuple so the whole result is
# formatted as one value instead of being unpacked by the % operator.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))









    



Population mean:  0.180099190655
Mean rate of Complete:  0.016780890275
Mean rate of NotComplete:  0.202711912032
Differnce in means (Completed - NotComplete):  -0.185931021757
Equal population variance (t-statistic, two-tailed p-value):  (-38.011913395661026, 3.7230381069195898e-305)
Unequal population variance (t-statistic, two-tailed p-value):  (-101.92581942634403, 0.0)



In [12]:

    
# Compare the "avgpctvideowatched" column between students who completed the
# course and those who did not, using two-sample t-tests on the group means.
TestVariable = "avgpctvideowatched"

# Compute each group mean once and reuse it for the difference and the report.
CompletedMean = np.mean(Completed[TestVariable])
NotCompleteMean = np.mean(NotComplete[TestVariable])
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1: Student's t-test (scipy's default, assumes equal population variances).
# P2: Welch's t-test (equal_var=False), which drops that assumption.
# ttest_ind accepts array-likes, so the columns are passed without list copies.
P1 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable])
P2 = stats.ttest_ind(Completed[TestVariable], NotComplete[TestVariable], equal_var=False)

# Single-argument print(...) with %-formatting behaves the same on Python 2
# and Python 3. P1/P2 are wrapped in a 1-tuple so the whole result is
# formatted as one value instead of being unpacked by the % operator.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))









    



Population mean:  0.142266455302
Mean rate of Complete:  0.180751859195
Mean rate of NotComplete:  0.136937844159
Differnce in means (Completed - NotComplete):  0.0438140150362
Equal population variance (t-statistic, two-tailed p-value):  (15.099798850993963, 3.1475097187787351e-51)
Unequal population variance (t-statistic, two-tailed p-value):  (18.28677884514892, 1.6735554938927209e-71)



In [ ]:

    
#scipy.stats.ttest_ind calculates the t-test for the means of TWO INDEPENDENT samples of scores.
#This is a two-sided test for the null hypothesis that 2 independent samples
#have identical average (expected) values. By default the test assumes that the populations have identical variances;
#passing equal_var=False performs Welch's t-test, which does not make that assumption.
#Returns (the calculated t-statistic, the two-tailed p-value).
#If we observe a large p-value, for example larger than 0.05 or 0.1,
#then we cannot reject the null hypothesis of identical average scores.
#If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%,
#then we reject the null hypothesis of equal averages.



In [ ]:

	course	student	course_complete	clickrate	avgpctvideowatched	rw	ff
0	genomics	0000d488dca916ef0d5efdac10e052b2	0	0.033203	0.054831	0.000000	0.000000
1	genomics	00057f037dc72ff77d5ca29de69d0671	0	0.019955	0.106250	0.000000	2.171084
2	genomics	0015b68bd020c6caf6ab4cc0159ac2f0	1	0.015138	0.240743	0.000000	0.012065
3	genomics	00296e85f055b03c0882626c80a24306	0	0.234637	0.000000	0.000000	0.000000
4	genomics	0044739ebeb7116d34cc8796ea784c5a	0	0.010198	0.126693	1.822567	0.000000

	course_complete	clickrate	avgpctvideowatched	rw	ff
count	19668.000000	19668.000000	19668.000000	19668.000000	19668.000000
mean	0.121619	0.180099	0.142266	1.342241	0.607770
std	0.326853	0.232294	0.133769	40.886485	3.557908
min	0.000000	0.002157	-0.000033	0.000000	0.000000
25%	0.000000	0.028020	0.000000	0.000000	0.000000
50%	0.000000	0.085932	0.122945	0.000000	0.000000
75%	0.000000	0.213239	0.248148	0.000000	0.000000
max	1.000000	1.200000	0.861459	5525.100419	156.368678