In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3 as sql
import scipy.stats as stats
# Load the CSV into `df`, the DataFrame that the cells below preview, filter and summarise.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv("data_rate_data_avg_watched.csv")
In [3]:
# Preview the first rows of the loaded frame to sanity-check the columns.
df.head()
Out[3]:
In [4]:
# Descriptive summary statistics for `df`.
df.describe()
Out[4]:
In [5]:
# Split `df` into two row subsets keyed on the `course_complete` column:
# `Completed` keeps rows where it equals 1, `NotComplete` rows where it equals 0.
# Rows holding any other value (or NaN) land in neither subset.
Completed, NotComplete = (
    df[df['course_complete'] == flag_value] for flag_value in (1, 0)
)
In [6]:
# Preview the first rows of the `Completed` subset.
Completed.head()
Out[6]:
In [7]:
# Number of rows in the `Completed` subset (one of the two sample sizes for the t-tests).
len(Completed)
Out[7]:
In [8]:
# Number of rows in the `NotComplete` subset (the other sample size for the t-tests).
len(NotComplete)
Out[8]:
In [15]:
def report_group_comparison(variable, completed, not_complete, population):
    """Print and return a two-sample comparison of one column between two groups.

    Parameters
    ----------
    variable : str
        Name of the column to compare; it must exist in all three frames.
    completed, not_complete : pandas.DataFrame
        The two independent groups whose means are compared.
    population : pandas.DataFrame
        Frame used for the overall ("population") mean.

    Returns
    -------
    tuple
        ``(mean_difference, population_mean, pooled_test, welch_test)``.
        ``mean_difference`` is ``mean(completed) - mean(not_complete)``;
        ``pooled_test`` and ``welch_test`` are the results of
        ``scipy.stats.ttest_ind`` with and without the equal-variance
        assumption (t-statistic first, two-tailed p-value second).

    Raises
    ------
    KeyError
        If ``variable`` is not a column of one of the frames.
    """
    completed_series = completed[variable]
    not_complete_series = not_complete[variable]

    # Each mean is computed once, on the Series itself, exactly as the
    # original cells did, and reused for both the report and the difference.
    completed_mean = np.mean(completed_series)
    not_complete_mean = np.mean(not_complete_series)
    mean_difference = completed_mean - not_complete_mean
    population_mean = np.mean(population[variable])

    completed_values = np.asarray(completed_series)
    not_complete_values = np.asarray(not_complete_series)
    pooled_test = stats.ttest_ind(completed_values, not_complete_values)
    welch_test = stats.ttest_ind(
        completed_values, not_complete_values, equal_var=False
    )

    report_lines = [
        ("Population mean: ", population_mean),
        ("Mean rate of Complete: ", completed_mean),
        ("Mean rate of NotComplete: ", not_complete_mean),
        ("Difference in means (Completed - NotComplete): ", mean_difference),
        ("Equal population variance (t-statistic, two-tailed p-value): ", pooled_test),
        ("Unequal population variance (t-statistic, two-tailed p-value): ", welch_test),
    ]

    # print() is given a single pre-formatted string so the output is the same
    # whether the kernel treats print as a statement (Python 2) or a function.
    print("Variable: %s" % variable)
    for label, value in report_lines:
        print("%s %s" % (label, value))

    return mean_difference, population_mean, pooled_test, welch_test


# Columns compared between the Completed and NotComplete groups.
TEST_VARIABLES = ["ff", "rw", "clickrate", "avgpctvideowatched"]

# One report per variable. The original global names are still assigned on each
# pass, so after the loop they hold the values for the last variable in the list.
# To print a significance verdict, compare the p-value (P1[1]) with the
# threshold, not the whole result object.
for TestVariable in TEST_VARIABLES:
    MeanDifference, PopulationMean, P1, P2 = report_group_comparison(
        TestVariable, Completed, NotComplete, df
    )
    print("")
In [ ]:
# Notes on scipy.stats.ttest_ind, the test used in the cells above:
# It calculates the t-test for the means of TWO INDEPENDENT samples of scores.
# This is a two-sided test of the null hypothesis that the two independent samples
# have identical average (expected) values. By default the test assumes that the
# populations have identical variances; passing equal_var=False drops that
# assumption and performs Welch's t-test instead.
# It returns (the calculated t-statistic, the two-tailed p-value).
# If we observe a large p-value, for example larger than 0.05 or 0.1,
# then we cannot reject the null hypothesis of identical average scores.
# If the p-value is smaller than the chosen threshold, e.g. 1%, 5% or 10%,
# then we reject the null hypothesis of equal averages.
In [ ]: