You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.


Distributions in Pandas


In [ ]:
import pandas as pd
import numpy as np

In [ ]:
# Simulate one fair coin flip: a single trial with success probability 0.5,
# so the displayed result is either 0 or 1.
np.random.binomial(n=1, p=0.5)

In [ ]:
# Number of successes in 1000 fair trials, divided by the number of trials,
# giving the observed proportion of successes.
np.random.binomial(n=1000, p=0.5) / 1000

In [ ]:
# Per-trial probability of the event: 0.01 divided by 100.
chance_of_tornado = 0.01 / 100
# Number of events observed across 100000 independent trials.
np.random.binomial(n=100000, p=chance_of_tornado)

In [ ]:
def count_back_to_back(events):
    """Count the positions j >= 1 where events[j] and events[j-1] are both 1.

    Parameters
    ----------
    events : array-like of 0/1 values
        One entry per day; 1 marks a day on which the event occurred.

    Returns
    -------
    int
        Number of adjacent (previous day, current day) pairs that are both 1.
        Empty and single-element inputs have no pairs and return 0.
    """
    events = np.asarray(events)
    # Compare every day (from the second onward) with the day before it.
    # Slicing covers the final day as well, which a loop bounded by
    # len(events) - 1 would silently leave out.
    return int(np.sum((events[1:] == 1) & (events[:-1] == 1)))


# Probability of a tornado on any single day.
chance_of_tornado = 0.01

# Number of simulated days.
n_days = 1000000

# One Bernoulli draw per day: 1 means a tornado occurred that day.
tornado_events = np.random.binomial(1, chance_of_tornado, n_days)

two_days_in_a_row = count_back_to_back(tornado_events)

print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, n_days/365))

In [ ]:
# Draw a single value from the uniform distribution on [0, 1).
np.random.uniform(low=0, high=1)

In [ ]:
# Draw a single value from a normal distribution centred at 0.75; the scale
# (standard deviation) is left at numpy's default.
np.random.normal(loc=0.75)

Formula for the population standard deviation (dividing by $N$ rather than $N-1$): $$\sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \overline{x})^2}$$


In [ ]:
# 1000 samples from a normal distribution centred at 0.75.
distribution = np.random.normal(loc=0.75, size=1000)

# Standard deviation computed by hand: square root of the mean squared
# deviation from the sample mean (divides by N, not N - 1).
np.sqrt(np.sum((distribution - np.mean(distribution)) ** 2) / distribution.size)

In [ ]:
# Same quantity via numpy's built-in; ddof=0 (the default) divides by N.
np.std(distribution, ddof=0)

In [ ]:
import scipy.stats as stats

# Kurtosis of the sample using Fisher's definition (the scipy default), under
# which a normal distribution scores 0.
stats.kurtosis(distribution, fisher=True)

In [ ]:
# Sample skewness of the normal draws; bias=True is scipy's default
# (no bias correction applied).
stats.skew(distribution, bias=True)

In [ ]:
# 10000 samples from a chi-squared distribution with 2 degrees of freedom,
# followed by the skewness of that sample.
chi_squared_df2 = np.random.chisquare(df=2, size=10000)
stats.skew(chi_squared_df2, bias=True)

In [ ]:
# 10000 samples from a chi-squared distribution with 5 degrees of freedom,
# followed by the skewness of that sample.
chi_squared_df5 = np.random.chisquare(df=5, size=10000)
stats.skew(chi_squared_df5, bias=True)

In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', 
                  label=['2 degrees of freedom','5 degrees of freedom'])
plt.legend(loc='upper right')

Hypothesis Testing


In [ ]:
# Load the grades table from the CSV file in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='grades.csv')

In [ ]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [ ]:
# Number of rows in the table.
df.shape[0]

In [ ]:
# Split the rows by the assignment 1 submission value relative to the
# '2015-12-31' cutoff. The comparison is made against a string literal, so it
# is lexicographic if the column holds strings — presumably ISO-formatted
# timestamps; verify against the CSV.
early = df.loc[df['assignment1_submission'] <= '2015-12-31']
late = df.loc[df['assignment1_submission'] > '2015-12-31']

In [ ]:
# Column-wise means for the early group. numeric_only=True restricts the
# aggregation to numeric columns: the frame also carries non-numeric data
# ('assignment1_submission' is compared against a date string elsewhere in
# this notebook, so it is presumably stored as text), and pandas 2.0+ raises
# a TypeError when asked to average such columns.
early.mean(numeric_only=True)

In [ ]:
# Column-wise means for the late group. numeric_only=True restricts the
# aggregation to numeric columns: the frame also carries non-numeric data
# ('assignment1_submission' is compared against a date string elsewhere in
# this notebook, so it is presumably stored as text), and pandas 2.0+ raises
# a TypeError when asked to average such columns.
late.mean(numeric_only=True)

In [ ]:
# Binds scipy.stats to the name `stats` (the same name an earlier cell uses
# for `import scipy.stats as stats`).
from scipy import stats
# IPython-only syntax: the trailing `?` displays the docstring of
# stats.ttest_ind instead of calling it. This line is not valid in a plain
# Python script.
stats.ttest_ind?

In [ ]:
# Two-sample t-test on assignment 1 grades, early vs. late group.
# equal_var=True (scipy's default) assumes both groups share the same variance.
stats.ttest_ind(
    early['assignment1_grade'],
    late['assignment1_grade'],
    equal_var=True,
)

In [ ]:
# Two-sample t-test on assignment 2 grades, early vs. late group.
# equal_var=True (scipy's default) assumes both groups share the same variance.
stats.ttest_ind(
    early['assignment2_grade'],
    late['assignment2_grade'],
    equal_var=True,
)

In [ ]:
# Two-sample t-test on assignment 3 grades, early vs. late group.
# equal_var=True (scipy's default) assumes both groups share the same variance.
stats.ttest_ind(
    early['assignment3_grade'],
    late['assignment3_grade'],
    equal_var=True,
)