In [ ]:
In [13]:
from __future__ import print_function, division
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import thinkbayes2
import thinkplot
import statsmodels.formula.api as smf
In [14]:
# Load the HERI table: skip the first two lines of the file and index the rows
# by the 'year' column.
df = pd.read_csv('heri17.csv', skiprows=2, index_col='year')
# Divide every column by 10, in place. Later cells treat the result as
# percentages (presumably the file stores tenths of a percent -- TODO confirm).
df[df.columns] /= 10
# Last expression of the cell: shows the first rows as a sanity check.
df.head()
Out[14]:
In [20]:
# Regressors used by the model formulas: the index offset by 1966
# (presumably the first survey year -- confirm) and its square.
df['time'] = df.index - 1966
df['time2'] = df['time'] ** 2
In [21]:
def MakeErrorModel(df, y, formula, n=100):
    """Makes a model that captures sample error and residual error.

    Fits `formula` to `y` with smf.ols, then refits it `n` times on
    synthetic responses built by adding a random permutation of the
    residuals to the fitted values. The variance of those refits at each
    point is the sampling variance; adding the variance of the residuals
    gives the total variance.

    The caller's DataFrame is left untouched: the response column 'y' is
    written to a private copy, so no scratch column leaks out of this call.

    df: DataFrame holding the explanatory variables named in `formula`
    y: Series; the response, referred to as `y` in `formula`
    formula: string representation of the regression model
    n: number of simulations to run

    returns: (fittedvalues, sample_error, total_error)
    """
    # work on a copy so the scratch column 'y' never modifies the caller's df
    data = df.copy()

    # make the best fit
    data['y'] = y
    results = smf.ols(formula, data=data).fit()
    fittedvalues = results.fittedvalues
    resid = results.resid

    # permute residuals and generate hypothetical fits
    fits = []
    for _ in range(n):
        data['y'] = fittedvalues + np.random.permutation(resid)
        fake_results = smf.ols(formula, data=data).fit()
        fits.append(fake_results.fittedvalues)

    # compute the variance of the fits (across simulations, per point)
    fits = np.array(fits)
    sample_var = fits.var(axis=0)

    # add sample_var and the variance of the residuals
    total_var = sample_var + resid.var()

    # standard errors are square roots of the variances
    return fittedvalues, np.sqrt(sample_var), np.sqrt(total_var)
In [22]:
def FillBetween(fittedvalues, stderr, **options):
    """Shades the band two standard errors either side of the fitted values
    (roughly a 95% confidence interval).

    fittedvalues: series; its index supplies the x coordinates
    stderr: standard error
    options: keyword arguments passed through to thinkplot.FillBetween
    """
    margin = 2 * stderr
    lower, upper = fittedvalues - margin, fittedvalues + margin
    thinkplot.FillBetween(fittedvalues.index, lower, upper, **options)
In [67]:
def PlotModel(y, fittedvalues, sample_error, total_error, **options):
    """Plots the error bands, the fitted line, and the actual data.

    y: series of observed values
    fittedvalues: series of fitted values
    sample_error: standard error due to sampling
    total_error: standard error including the residuals
    options: keyword arguments passed to thinkplot.Plot for the data line
    """
    # the total-error band is drawn first, then the sample-error band
    for stderr, shade in ((total_error, '0.9'), (sample_error, '0.7')):
        FillBetween(fittedvalues, stderr, color=shade)

    thinkplot.Plot(fittedvalues, color='0.5')
    thinkplot.Plot(y, **options)
In [68]:
def Plot(df, y, formula, **options):
    """Fits `formula` to y and plots the data with the model's error bands.

    df: DataFrame with the explanatory variables
    y: series of observed values
    formula: string representation of the regression model
    options: keyword arguments passed to thinkplot.Plot for the data line
    """
    # (fittedvalues, sample_error, total_error)
    model = MakeErrorModel(df, y, formula)
    PlotModel(y, *model, **options)
    thinkplot.Config(xlim=[1965, 2017])
In [90]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import seaborn as sns
# Global plot styling for the figures drawn after this cell.
sns.set_style('whitegrid')
sns.set_context('talk', font_scale=1.3)
# Fetch the active colour palette and display it.
current_palette = sns.color_palette()
sns.palplot(current_palette)
# Name the palette entries so plotting cells can pass them as `color=`.
# NOTE(review): this unpacking only works if the palette holds exactly six
# colours, and the names assume a particular colour order -- confirm against
# the installed seaborn version.
BLUE, GREEN, RED, PURPLE, YELLOW, SKY = current_palette
In [99]:
# Quadratic model in time, fitted to the 'noneall' column.
formula = 'y ~ time + time2'
y = df['noneall']
Plot(df, y, formula, color=BLUE, label='None')
thinkplot.Config(ylabel='Percent', loc='upper left')
In [100]:
# Refit the same model on the log-odds scale: convert the percentages to
# proportions, then to odds p / (1 - p), then take the natural log.
ps = df.noneall / 100
odds = ps / (1 - ps)
log_odds = np.log(odds)

# `formula` is the quadratic model defined in the previous plotting cell.
Plot(df, log_odds, formula, color=BLUE, label='None')
thinkplot.Config(ylabel='Log odds')
In [102]:
# Complement of the 'attendedall' percentage, plotted as "No attendance".
attend = 100 - df['attendedall']
Plot(df, attend, formula, color=GREEN, label='No attendance')
thinkplot.Config(ylabel='Percent')
In [104]:
# Difference between the 'nonemen' and 'nonewomen' columns, from index
# label 1973 onward, fitted with the current `formula`.
diff = (df['nonemen'] - df['nonewomen']).loc[1973:]
Plot(df, diff, formula, color=PURPLE, label='Gender gap')
thinkplot.Config(ylabel='Difference (percentage points)')
In [105]:
# Same difference as the previous cell, but restricted to index label 1986
# onward; still fitted with the current `formula`.
diff = (df['nonemen'] - df['nonewomen']).loc[1986:]
Plot(df, diff, formula, color=PURPLE, label='Gender gap')
thinkplot.Config(ylabel='Difference (percentage points)')
In [106]:
# Difference between the 'nonemen' and 'nonewomen' columns from index label
# 1986 onward, this time fitted with a model that is linear in time.
diff = (df['nonemen'] - df['nonewomen']).loc[1986:]
Plot(df, diff, 'y ~ time', color=PURPLE, label='Gender gap')
thinkplot.Config(ylabel='Difference (percentage points)')
In [ ]:
In [ ]:
In [ ]: