notebook.community

Edit and run



In [ ]:



In [13]:

    
from __future__ import print_function, division

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import thinkbayes2
import thinkplot

import statsmodels.formula.api as smf



In [14]:

    
# Read the survey table: skip the first two lines of the file and use the
# 'year' column as the index.
df = pd.read_csv('heri17.csv', skiprows=2, index_col='year')

# Rescale every column by a factor of ten.
# NOTE(review): presumably the file stores tenths of a percent -- confirm
# against the data source.
df = df / 10
df.head()









    Out[14]:






  
    
      
      noneall
      fatherall
      motherall
      attendedall
      nonemen
      fathermen
      mothermen
      attendedmen
      nonewomen
      fatherwomen
      motherwomen
      attendedwomen
      bornagain
      evangelical
    
    
      year
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1966
      NaN
      NaN
      NaN
      66.1
      NaN
      NaN
      NaN
      59.0
      NaN
      NaN
      NaN
      74.0
      NaN
      NaN
    
    
      1967
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1968
      NaN
      NaN
      NaN
      91.7
      NaN
      NaN
      NaN
      90.0
      NaN
      NaN
      NaN
      93.7
      NaN
      NaN
    
    
      1969
      13.6
      NaN
      NaN
      91.0
      15.7
      NaN
      NaN
      89.2
      2.3
      NaN
      NaN
      93.0
      NaN
      NaN
    
    
      1970
      10.7
      NaN
      3.1
      89.0
      11.9
      NaN
      2.8
      87.4
      9.1
      NaN
      3.3
      90.9
      NaN
      NaN



In [20]:

    
# Regression covariates: years elapsed since 1966 (the first year shown in
# the table) and its square, used as the quadratic term.
df['time'] = df.index - 1966
df['time2'] = df['time'] ** 2



In [21]:

    
def MakeErrorModel(df, y, formula, n=100, seed=None):
    """Makes a model that captures sample error and residual error.

    Fits `formula` by ordinary least squares, then refits it `n` times on
    synthetic responses built by adding a random permutation of the
    residuals to the fitted values. The variance of those refits at each
    point estimates the sampling error of the fitted curve.

    The caller's DataFrame is not modified: the response is attached to a
    private copy under the column name 'y', so `formula` must refer to the
    response as `y`.

    df: DataFrame holding the explanatory variables named in `formula`
    y: Series of responses, aligned to `df` by index
    formula: string representation of the regression model
    n: number of simulations to run
    seed: optional seed for the permutations; if None, NumPy's global
          random state is used

    returns: (fittedvalues, sample_error, total_error)
    """
    # attach the response to a copy so the caller's frame is left untouched
    data = df.assign(y=y)

    # make the best fit
    results = smf.ols(formula, data=data).fit()
    fittedvalues = results.fittedvalues
    resid = results.resid

    # a seeded generator makes the simulation repeatable on request
    rng = np.random if seed is None else np.random.RandomState(seed)

    # permute residuals and generate hypothetical fits
    fits = []
    for _ in range(n):
        data['y'] = fittedvalues + rng.permutation(resid)
        fake_results = smf.ols(formula, data=data).fit()
        fits.append(fake_results.fittedvalues)

    # compute the variance of the fits at each point
    sample_var = np.array(fits).var(axis=0)

    # add sample_var and the variance of the residuals
    total_var = sample_var + resid.var()

    # standard errors are square roots of the variances
    return fittedvalues, np.sqrt(sample_var), np.sqrt(total_var)



In [22]:

    
def FillBetween(fittedvalues, stderr, **options):
    """Shades the band within two standard errors of the fitted values.

    Two standard errors on either side gives an approximate 95%
    confidence interval.

    fittedvalues: series; its index supplies the x coordinates
    stderr: standard error
    options: keyword arguments passed on to thinkplot.FillBetween
    """
    half_width = 2 * stderr
    lower, upper = fittedvalues - half_width, fittedvalues + half_width
    thinkplot.FillBetween(fittedvalues.index, lower, upper, **options)



In [67]:

    
def PlotModel(y, fittedvalues, sample_error, total_error, **options):
    """Draws the two error bands, the fitted line, and the observed data.

    y: observed values, plotted last with `options`
    fittedvalues: series of fitted values, plotted with color '0.5'
    sample_error: standard error for the band filled with color '0.7'
    total_error: standard error for the band filled with color '0.9'
    options: keyword arguments passed on to thinkplot.Plot for the data
    """
    # the total-error band is filled first, then the sample-error band
    bands = [(total_error, '0.9'), (sample_error, '0.7')]
    for stderr, shade in bands:
        FillBetween(fittedvalues, stderr, color=shade)

    thinkplot.Plot(fittedvalues, color='0.5')
    thinkplot.Plot(y, **options)



In [68]:

    
def Plot(df, y, formula, **options):
    """Fits the model, plots it with its error bands, and sets the x range.

    df: DataFrame of explanatory variables, passed to MakeErrorModel
    y: Series of observed values to fit and plot
    formula: string representation of the regression model
    options: keyword arguments passed on to PlotModel
    """
    # model is the (fittedvalues, sample_error, total_error) triple
    model = MakeErrorModel(df, y, formula)
    PlotModel(y, *model, **options)

    # every figure uses the same fixed x-axis range
    thinkplot.Config(xlim=[1965, 2017])



In [90]:

    
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('talk', font_scale=1.3)

# Show the active color cycle so the names below can be checked by eye.
current_palette = sns.color_palette()
sns.palplot(current_palette)

# Name the first six colors. Slicing keeps the unpacking from raising
# ValueError when the active palette has more than six entries (the default
# palettes have ten colors in seaborn 0.9 and later).
# NOTE(review): the names assume the ordering of the legacy six-color
# palette; on a longer palette the first six colors may not match these
# names -- confirm against the swatch displayed above.
BLUE, GREEN, RED, PURPLE, YELLOW, SKY = current_palette[:6]



In [99]:

    
# Quadratic time trend fitted to the 'noneall' column.
formula = 'y ~ time + time2'
y = df['noneall']
Plot(df, y, formula, label='None', color=BLUE)
thinkplot.Config(loc='upper left', ylabel='Percent')



In [100]:

    
# Refit the same model on the log-odds scale: convert the percentages to
# proportions, then to odds p / (1 - p), then take the natural log.
ps = df.noneall / 100
odds = ps / (1-ps)
log_odds = np.log(odds)
Plot(df, log_odds, formula, color=BLUE, label='None')
thinkplot.Config(ylabel='Log odds')



In [102]:

    
# Complement of the 'attendedall' percentage, i.e. the share that did not attend.
attend = 100 - df['attendedall']
Plot(df, attend, formula, label='No attendance', color=GREEN)
thinkplot.Config(ylabel='Percent')



In [104]:

    
# Difference between the 'nonemen' and 'nonewomen' columns, keeping the rows
# from 1973 onward, fitted with the model in `formula`.
diff = (df['nonemen'] - df['nonewomen']).loc[1973:]
Plot(df, diff, formula, label='Gender gap', color=PURPLE)
thinkplot.Config(ylabel='Difference (percentage points)')



In [105]:

    
# Same gap as above but restricted to the rows from 1986 onward, still
# fitted with the model in `formula`.
diff = (df['nonemen'] - df['nonewomen']).loc[1986:]
Plot(df, diff, formula, label='Gender gap', color=PURPLE)
thinkplot.Config(ylabel='Difference (percentage points)')



In [106]:

    
# Gap from 1986 onward again, this time fitted with a linear model in time
# only (no quadratic term).
diff = (df['nonemen'] - df['nonewomen']).loc[1986:]
Plot(df, diff, 'y ~ time', label='Gender gap', color=PURPLE)
thinkplot.Config(ylabel='Difference (percentage points)')



In [ ]:



In [ ]:



In [ ]:

	noneall	fatherall	motherall	attendedall	nonemen	fathermen	mothermen	attendedmen	nonewomen	fatherwomen	motherwomen	attendedwomen	bornagain	evangelical
year
1966	NaN	NaN	NaN	66.1	NaN	NaN	NaN	59.0	NaN	NaN	NaN	74.0	NaN	NaN
1967	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1968	NaN	NaN	NaN	91.7	NaN	NaN	NaN	90.0	NaN	NaN	NaN	93.7	NaN	NaN
1969	13.6	NaN	NaN	91.0	15.7	NaN	NaN	89.2	2.3	NaN	NaN	93.0	NaN	NaN
1970	10.7	NaN	3.1	89.0	11.9	NaN	2.8	87.4	9.1	NaN	3.3	90.9	NaN	NaN