In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3 as sql
import scipy.stats as stats

# Load the CSV of per-student metrics into the DataFrame used by every later cell.
# The path is relative, so the file is looked up in the current working directory.
DataFile = "data_rate_data_avg_watched.csv"
df = pd.read_csv(DataFile)

In [3]:
# Preview the first rows of the loaded data to check the columns and value formats.
df.head()


Out[3]:
course student course_complete clickrate avgpctvideowatched rw ff
0 genomics 0000d488dca916ef0d5efdac10e052b2 0 0.033203 0.054831 0.000000 0.000000
1 genomics 00057f037dc72ff77d5ca29de69d0671 0 0.019955 0.106250 0.000000 2.171084
2 genomics 0015b68bd020c6caf6ab4cc0159ac2f0 1 0.015138 0.240743 0.000000 0.012065
3 genomics 00296e85f055b03c0882626c80a24306 0 0.234637 0.000000 0.000000 0.000000
4 genomics 0044739ebeb7116d34cc8796ea784c5a 0 0.010198 0.126693 1.822567 0.000000

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): the recorded output shows a slightly negative minimum for
# avgpctvideowatched and a maximum for rw that is far above its 75th percentile;
# worth checking for data errors/outliers before relying on mean-based tests.
df.describe()


Out[4]:
course_complete clickrate avgpctvideowatched rw ff
count 19668.000000 19668.000000 19668.000000 19668.000000 19668.000000
mean 0.121619 0.180099 0.142266 1.342241 0.607770
std 0.326853 0.232294 0.133769 40.886485 3.557908
min 0.000000 0.002157 -0.000033 0.000000 0.000000
25% 0.000000 0.028020 0.000000 0.000000 0.000000
50% 0.000000 0.085932 0.122945 0.000000 0.000000
75% 0.000000 0.213239 0.248148 0.000000 0.000000
max 1.000000 1.200000 0.861459 5525.100419 156.368678

In [5]:
# Partition the rows on the course_complete flag:
# 1 goes to Completed, 0 goes to NotComplete.
Completed = df.loc[df['course_complete'] == 1]
NotComplete = df.loc[df['course_complete'] == 0]

In [6]:
# Preview the first rows of the Completed subset to confirm the filter worked.
Completed.head()


Out[6]:
course student course_complete clickrate avgpctvideowatched rw ff
2 genomics 0015b68bd020c6caf6ab4cc0159ac2f0 1 0.015138 0.240743 0.000000 0.012065
9 genomics 006d949a7713901bc778028e605bacc5 1 0.013639 0.218629 0.000000 2.991179
13 genomics 00959c868c027ce70d18304e1f7b4d84 1 0.011750 0.016324 10.275888 0.000000
25 genomics 00ec9281233d1bcfa97f2129d010d895 1 0.016534 0.209335 0.000000 2.360829
26 genomics 00fca1dc037815e36b50459eef8c498d 1 0.009738 0.116762 101.751894 0.000000

In [7]:
# Number of rows in the Completed subset (sample size of the first t-test group).
len(Completed)


Out[7]:
2392

In [8]:
# Number of rows in the NotComplete subset (sample size of the second t-test group).
len(NotComplete)


Out[8]:
17276

In [15]:
# Compare the "ff" column between students who completed the course and those
# who did not, using two-sample t-tests on the Completed / NotComplete subsets.
# NOTE(review): "ff" presumably stands for a fast-forward rate - confirm against
# the data dictionary.
# NOTE(review): this cell is repeated for each metric with only TestVariable
# changed; a shared helper function would remove the duplication.
TestVariable = "ff"

# Select each group's column once and reuse it for the means and the tests.
CompletedValues = Completed[TestVariable]
NotCompleteValues = NotComplete[TestVariable]

CompletedMean = np.mean(CompletedValues)
NotCompleteMean = np.mean(NotCompleteValues)
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1 is the pooled-variance (Student) t-test, scipy's default.
# P2 passes equal_var=False, i.e. Welch's t-test, which does not assume the two
# groups share a variance. Both results hold (t-statistic, two-tailed p-value).
P1 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values)
P2 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values, equal_var=False)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
# Wrapping the result in a 1-tuple formats the whole result as a single value.
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))
# To turn a result into a verdict, compare its p-value (the second element,
# e.g. P1[1]) with the chosen significance level, not the whole result.


Population mean:  0.607769651553
Mean rate of Complete:  1.64495621337
Mean rate of NotComplete:  0.464162899072
Differnce in means (Completed - NotComplete):  1.1807933143
Equal population variance (t-statistic, two-tailed p-value):  (15.302434108547361, 1.4793844467299373e-52)
Unequal population variance (t-statistic, two-tailed p-value):  (10.916686214685534, 3.5823761707804019e-27)

In [14]:
# Compare the "rw" column between students who completed the course and those
# who did not, using two-sample t-tests on the Completed / NotComplete subsets.
# NOTE(review): "rw" presumably stands for a rewind rate - confirm against the
# data dictionary.
# NOTE(review): this cell is repeated for each metric with only TestVariable
# changed; a shared helper function would remove the duplication.
TestVariable = "rw"

# Select each group's column once and reuse it for the means and the tests.
CompletedValues = Completed[TestVariable]
NotCompleteValues = NotComplete[TestVariable]

CompletedMean = np.mean(CompletedValues)
NotCompleteMean = np.mean(NotCompleteValues)
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1 is the pooled-variance (Student) t-test, scipy's default.
# P2 passes equal_var=False, i.e. Welch's t-test, which does not assume the two
# groups share a variance. Both results hold (t-statistic, two-tailed p-value).
P1 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values)
P2 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values, equal_var=False)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
# Wrapping the result in a 1-tuple formats the whole result as a single value.
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))
# To turn a result into a verdict, compare its p-value (the second element,
# e.g. P1[1]) with the chosen significance level, not the whole result.


Population mean:  1.34224091266
Mean rate of Complete:  2.38266652718
Mean rate of NotComplete:  1.19818568749
Differnce in means (Completed - NotComplete):  1.18448083969
Equal population variance (t-statistic, two-tailed p-value):  (1.3279410099045779, 0.18421300752955727)
Unequal population variance (t-statistic, two-tailed p-value):  (2.3238826024905692, 0.020162513118256047)

In [13]:
# Compare the "clickrate" column between students who completed the course and
# those who did not, using two-sample t-tests on the Completed / NotComplete
# subsets.
# NOTE(review): this cell is repeated for each metric with only TestVariable
# changed; a shared helper function would remove the duplication.
TestVariable = "clickrate"

# Select each group's column once and reuse it for the means and the tests.
CompletedValues = Completed[TestVariable]
NotCompleteValues = NotComplete[TestVariable]

CompletedMean = np.mean(CompletedValues)
NotCompleteMean = np.mean(NotCompleteValues)
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1 is the pooled-variance (Student) t-test, scipy's default.
# P2 passes equal_var=False, i.e. Welch's t-test, which does not assume the two
# groups share a variance. Both results hold (t-statistic, two-tailed p-value).
P1 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values)
P2 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values, equal_var=False)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
# Wrapping the result in a 1-tuple formats the whole result as a single value.
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))
# To turn a result into a verdict, compare its p-value (the second element,
# e.g. P1[1]) with the chosen significance level, not the whole result.


Population mean:  0.180099190655
Mean rate of Complete:  0.016780890275
Mean rate of NotComplete:  0.202711912032
Differnce in means (Completed - NotComplete):  -0.185931021757
Equal population variance (t-statistic, two-tailed p-value):  (-38.011913395661026, 3.7230381069195898e-305)
Unequal population variance (t-statistic, two-tailed p-value):  (-101.92581942634403, 0.0)

In [12]:
# Compare the "avgpctvideowatched" column between students who completed the
# course and those who did not, using two-sample t-tests on the
# Completed / NotComplete subsets.
# NOTE(review): this cell is repeated for each metric with only TestVariable
# changed; a shared helper function would remove the duplication.
TestVariable = "avgpctvideowatched"

# Select each group's column once and reuse it for the means and the tests.
CompletedValues = Completed[TestVariable]
NotCompleteValues = NotComplete[TestVariable]

CompletedMean = np.mean(CompletedValues)
NotCompleteMean = np.mean(NotCompleteValues)
MeanDifference = CompletedMean - NotCompleteMean
PopulationMean = np.mean(df[TestVariable])

# P1 is the pooled-variance (Student) t-test, scipy's default.
# P2 passes equal_var=False, i.e. Welch's t-test, which does not assume the two
# groups share a variance. Both results hold (t-statistic, two-tailed p-value).
P1 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values)
P2 = stats.ttest_ind(CompletedValues.values, NotCompleteValues.values, equal_var=False)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Population mean:  %s" % PopulationMean)
print("Mean rate of Complete:  %s" % CompletedMean)
print("Mean rate of NotComplete:  %s" % NotCompleteMean)
print("Difference in means (Completed - NotComplete):  %s" % MeanDifference)
# Wrapping the result in a 1-tuple formats the whole result as a single value.
print("Equal population variance (t-statistic, two-tailed p-value):  %s" % (P1,))
print("Unequal population variance (t-statistic, two-tailed p-value):  %s" % (P2,))
# To turn a result into a verdict, compare its p-value (the second element,
# e.g. P1[1]) with the chosen significance level, not the whole result.


Population mean:  0.142266455302
Mean rate of Complete:  0.180751859195
Mean rate of NotComplete:  0.136937844159
Differnce in means (Completed - NotComplete):  0.0438140150362
Equal population variance (t-statistic, two-tailed p-value):  (15.099798850993963, 3.1475097187787351e-51)
Unequal population variance (t-statistic, two-tailed p-value):  (18.28677884514892, 1.6735554938927209e-71)

In [ ]:
#stats.ttest_ind calculates the t-test for the means of TWO INDEPENDENT samples of scores.
#This is a two-sided test for the null hypothesis that the two independent samples
#have identical average (expected) values. By default, this test assumes that the populations have identical variances;
#passing equal_var=False performs Welch's t-test instead, which does not assume equal population variances.
#It returns (the calculated t-statistic, the two-tailed p-value).
#If we observe a large p-value, for example larger than 0.05 or 0.1,
#then we cannot reject the null hypothesis of identical average scores.
#If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%,
#then we reject the null hypothesis of equal averages.

In [ ]: