This notebook presents a second-pass analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org).
Copyright 2015 Allen Downey
MIT License: http://opensource.org/licenses/MIT
In [1]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import thinkstats2
import thinkplot
import statsmodels.formula.api as smf
from iso_country_codes import COUNTRY
%matplotlib inline
The following function selects the columns I need.
In [2]:
def read_cycle(filename):
    """Reads a file containing ESS data and selects columns.

    filename: string path to a Stata (.dta) file

    returns: DataFrame with one row per respondent
    """
    data = pd.read_stata(filename, convert_categoricals=False)

    # early cycles use different names for income and interview year
    if 'hinctnta' not in data.columns:
        data['hinctnta'] = data.hinctnt
    if 'inwyr' not in data.columns:
        data['inwyr'] = data.inwyye

    # keep only the variables used in the analysis
    keep = ['cntry', 'inwyr', 'tvtot', 'tvpol', 'rdtot', 'rdpol',
            'nwsptot', 'nwsppol', 'netuse',
            'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn',
            'eisced', 'pspwght', 'pweight']
    return data[keep]
Read data from Cycle 1.
In [3]:
# Load Cycle 1 and preview the selected columns.
df1 = read_cycle('ESS1e06_4.dta')
df1.head()
Out[3]:
Read data from Cycle 2.
In [4]:
# Load Cycle 2 and preview the selected columns.
df2 = read_cycle('ESS2e03_4.dta')
df2.head()
Out[4]:
Read data from Cycle 3.
In [5]:
# Load Cycle 3 and preview the selected columns.
df3 = read_cycle('ESS3e03_5.dta')
df3.head()
Out[5]:
Read data from Cycle 4.
In [6]:
# Load Cycle 4 and preview the selected columns.
df4 = read_cycle('ESS4e04_3.dta')
df4.head()
Out[6]:
Read data from Cycle 5.
In [7]:
# Load Cycle 5 and preview the selected columns.
df5 = read_cycle('ESS5e03_2.dta')
df5.head()
Out[7]:
In [8]:
def clean_cycle(df):
    """Cleans data from one cycle, in place.

    Replaces the survey's special codes for missing answers with NaN
    and adds derived columns:

    hasrelig: 1 if the respondent belongs to a religion, 0 otherwise,
              NaN if rlgblg is missing
    yrbrn60: year born shifted so the mean is near 0
    inwyr07: interview year shifted by 2007 and jittered to break ties

    df: DataFrame (modified in place)
    """
    # NOTE: the original used chained attribute access with
    # Series.replace(inplace=True), which is deprecated chained
    # assignment in modern pandas; explicit column assignment is safe.

    # these variables code missing answers as 77/88/99
    for var in ['tvtot', 'rdtot', 'nwsptot', 'netuse',
                'eduyrs', 'rlgdgr', 'hinctnta']:
        df[var] = df[var].replace([77, 88, 99], np.nan)

    # the political-content variables also use 66 as a missing code
    for var in ['tvpol', 'rdpol', 'nwsppol']:
        df[var] = df[var].replace([66, 77, 88, 99], np.nan)

    df['rlgblg'] = df.rlgblg.replace([7, 8, 9], np.nan)
    df['yrbrn'] = df.yrbrn.replace([7777, 8888, 9999], np.nan)
    df['inwyr'] = df.inwyr.replace([9999], np.nan)

    # binary religion indicator; NaN propagates from rlgblg
    # (np.where yields a float column up front, avoiding the
    # deprecated int -> float upcast when assigning NaN afterwards)
    df['hasrelig'] = np.where(df.rlgblg.isnull(), np.nan,
                              (df.rlgblg == 1).astype(int))

    df['yrbrn60'] = df.yrbrn - 1960
    # jitter interview year so it behaves like a continuous variable
    df['inwyr07'] = df.inwyr - 2007 + np.random.uniform(-0.5, 0.5, len(df))
In [9]:
# Apply missing-value cleaning and derived columns to every cycle.
cycles = [df1, df2, df3, df4, df5]
for cycle in cycles:
    clean_cycle(cycle)
In [10]:
def resample(df):
"""Resample data by country.
df: DataFrame
returns: map from country code to DataFrame
"""
res = {}
grouped = df.groupby('cntry')
for name, group in grouped:
sample = group.sample(len(group), weights=group.pspwght, replace=True)
sample.index = range(len(group))
res[name] = sample
return res
# each cycle_map is a map from country code to DataFrame
cycle_maps = [resample(cycle) for cycle in cycles]

# sanity check: number of countries in each cycle
for cycle_map in cycle_maps:
    print(len(cycle_map), 'countries')
In each cycle, a few questions were not asked in some countries.
In [11]:
def check_variables(name, group):
    """Print variables with fewer than 100 valid responses in a group.

    name: group name (country code)
    group: DataFrame
    """
    varnames = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol',
                'nwsptot', 'nwsppol', 'netuse', 'inwyr07',
                'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta',
                'yrbrn', 'pspwght', 'pweight']
    for var in varnames:
        # count non-missing responses once and reuse it in the report
        # (the original recomputed the dropna inside the print call)
        n = len(group[var].dropna())
        if n < 100:
            print(name, var, n)
# Report sparse variables for every country in every cycle.
for i, cycle_map in enumerate(cycle_maps):
    print('Cycle', i+1)
    for name, group in cycle_map.items():
        check_variables(name, group)
I'll remove the cycle/country groups that are missing netuse or rlgblg.
The ones that are missing income information undermine the ability to control for income, but it's a pretty weak effect, so I don't think it's a problem.
inwyr07 is the interview year shifted by 2007, the approximate mean, and jittered.
In [12]:
# Drop the country/cycle groups that are missing netuse or rlgblg.
del cycle_maps[0]['FR']
del cycle_maps[0]['DE']
del cycle_maps[1]['FR']
del cycle_maps[1]['FI']

# EE is missing interview year in cycle 5; fill with 3
# (i.e., 2010, since inwyr07 is inwyr - 2007) plus jitter.
ee = cycle_maps[4]['EE']
ee.inwyr07 = 3 + np.random.uniform(-0.5, 0.5, len(ee))
Income is reported on a different scale in different cycles, and differs substantially across countries. So I'm replacing it with rank on a per-country basis: that is, each respondent is mapped to their income rank, from 0 to 1, within their country.
I do the same thing with years of education, in part because there are some suspiciously high values. At the very least, mapping to rank reduces the effect of outliers.
In [13]:
def replace_var_with_rank(name, df, old, new):
    """Replaces a scale variable with a rank from 0-1.

    Creates a new column.

    name: country code (currently unused; kept for interface stability)
    df: DataFrame
    old: old variable name
    new: new variable name
    """
    # jitter to break ties before ranking
    jittered = df[old] + np.random.uniform(-0.25, 0.25, len(df))

    # with (almost) no data, fall back to uniform random ranks
    if len(jittered.dropna()) < 10:
        df[new] = np.random.random(len(df))
        return

    # map each value to its percentile rank via the empirical CDF
    cdf = thinkstats2.Cdf(jittered)
    df[new] = cdf.Probs(jittered)

    # make sure NaN maps to NaN
    df.loc[df[old].isnull(), new] = np.nan
def replace_with_ranks(cycle_map):
    """Replace variables within countries.

    cycle_map: map from country code to DataFrame
    """
    rank_vars = [('hinctnta', 'hincrank'), ('eduyrs', 'edurank')]
    for code, group in cycle_map.items():
        for old, new in rank_vars:
            replace_var_with_rank(code, group, old, new)
# Convert income and education to within-country ranks in every cycle.
for cycle_map in cycle_maps:
    replace_with_ranks(cycle_map)
To fill missing values, I am drawing random samples from the available values, on a per-country basis.
Since I am not modeling relationships between variables, I am losing some information with this replacement policy; the net effect is to understate the actual strength of all relationships.
In [14]:
def fill_var(df, old, new):
    """Fills missing values.

    Creates a new column; missing entries are filled with random
    draws (with replacement) from the observed values.

    df: DataFrame
    old: old variable name
    new: new variable name
    """
    # rows where the variable is missing
    missing = df[df[old].isnull()]

    # draw replacements from the non-missing values
    replacements = df[old].dropna().sample(len(missing), replace=True)
    replacements.index = missing.index

    # fill the gaps with the random draws
    df[new] = df[old].fillna(replacements)
def fill_all_vars(df):
    """Fills missing values in the variables we need.

    Adds a '_f' (filled) column for each variable.

    df: DataFrame
    """
    varnames = ['hasrelig', 'rlgdgr', 'yrbrn60', 'edurank', 'hincrank',
                'tvtot', 'rdtot', 'nwsptot', 'netuse', 'inwyr07']
    for varname in varnames:
        fill_var(df, varname, varname + '_f')
In [15]:
def fill_vars_by_country(cycle_map):
    """Fills missing values in each country group.

    cycle_map: map from country code to DataFrame
    """
    for name, group in cycle_map.items():
        fill_all_vars(group)
# Fill missing values, per country, in every cycle.
for cycle_map in cycle_maps:
    fill_vars_by_country(cycle_map)
Concatenate the cycles.
In [16]:
def concat_groups(cycle_map):
    """Concat all countries in a cycle.

    cycle_map: map from country code to DataFrame

    returns: DataFrame
    """
    groups = list(cycle_map.values())
    return pd.concat(groups, ignore_index=True)
# Build one DataFrame per cycle and report the number of respondents.
dfs = [concat_groups(cycle_map) for cycle_map in cycle_maps]
for df in dfs:
    print(len(df))
In [17]:
# Combine all cycles into a single DataFrame.
df = pd.concat(dfs, ignore_index=True)
print(df.shape)
df.head()
Out[17]:
TV watching time on average weekday
In [18]:
# Tabulate TV-watching responses.
df.tvtot.value_counts().sort_index()
Out[18]:
Radio listening, total time on average weekday.
In [19]:
# Tabulate radio-listening responses.
df.rdtot.value_counts().sort_index()
Out[19]:
Newspaper reading, total time on average weekday.
In [20]:
# Tabulate newspaper-reading responses.
df.nwsptot.value_counts().sort_index()
Out[20]:
Personal use of Internet, email, www
In [21]:
# Tabulate Internet-use responses.
df.netuse.value_counts().sort_index()
Out[21]:
Belong to a particular religion or denomination
In [22]:
# Tabulate religious-belonging responses.
df.rlgblg.value_counts().sort_index()
Out[22]:
How religious
In [23]:
# Tabulate degree-of-religiosity responses.
df.rlgdgr.value_counts().sort_index()
Out[23]:
Total household net income, all sources
In [24]:
# Summary statistics of the income rank (0-1).
df.hincrank.describe()
Out[24]:
Year born
In [25]:
# Summary statistics of year born.
df.yrbrn.describe()
Out[25]:
In [26]:
# Plot the CDF of year born.
cdf = thinkstats2.Cdf(df.yrbrn)
thinkplot.PrePlot(1)
thinkplot.Cdf(cdf)
# fixed typo in the displayed title ('Disrtribution')
thinkplot.Config(xlabel='Year born', ylabel='CDF',
                 title='Distribution of year born', legend=False)
Shifted to mean near 0
In [27]:
# Summary statistics of the shifted year born.
df.yrbrn60.describe()
Out[27]:
Number of years of education, converted to ranks.
In [28]:
# Summary statistics of the education rank (0-1).
df.edurank.describe()
Out[28]:
Country codes
In [29]:
# Number of respondents per country.
df.cntry.value_counts().sort_index()
Out[29]:
In [30]:
# Tabulate degree-of-religiosity responses (repeated view).
df.rlgdgr.value_counts().sort_index()
Out[30]:
In [31]:
# Summary statistics of the shifted, jittered interview year.
df.inwyr07.describe()
Out[31]:
Run the model
In [32]:
def run_model(df, formula):
    """Fits a logistic regression model.

    df: DataFrame
    formula: Patsy formula string

    returns: fitted results object
    """
    # disp=False suppresses the optimizer's convergence output
    return smf.logit(formula, data=df).fit(disp=False)
Here's the model with all control variables and all media variables:
In [33]:
# Logit of religious belonging on controls plus all media variables
# (unfilled versions of the variables).
formula = ('hasrelig ~ yrbrn60 + edurank + hincrank +'
           'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()
Out[33]:
Now using the filled variables
In [34]:
# Same model using the filled ('_f') versions of the variables.
formula = ('hasrelig_f ~ yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()
Out[34]:
Now adding inwyr07
In [35]:
# Same model with interview year added as a control.
formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()
Out[35]:
In [36]:
def extract_res(res, var='netuse_f'):
    """Extracts sample size, coefficient, and significance stars.

    res: fitted regression results
    var: name of the variable of interest

    returns: (nobs, parameter, stars) tuple, where stars is
             '**' for p < 0.01, '*' for p < 0.05, '' otherwise
    """
    coef = res.params[var]
    p = res.pvalues[var]
    if p < 0.01:
        stars = '**'
    elif p < 0.05:
        stars = '*'
    else:
        stars = ''
    return res.nobs, coef, stars
# Example: extract the netuse_f coefficient from the last model.
extract_res(res)
Out[36]:
Group by country:
In [37]:
# Group the combined data by country for per-country models.
grouped = df.groupby('cntry')
Run a sample country
In [38]:
# Example: fit the model for one country ('IS').
group = grouped.get_group('IS')
run_model(group, formula).summary()
Out[38]:
Run all countries
In [39]:
def run_logits(grouped, formula, var):
    """Fits a logit model per country and prints one coefficient each.

    Prints the country name, sample size, coefficient, significance
    stars, and an arrow flagging significant positive associations.

    grouped: GroupBy of per-country DataFrames
    formula: Patsy formula string
    var: name of the variable whose coefficient is reported
    """
    for code, country_df in grouped:
        # pad/truncate the readable country name to 14 characters
        label = '%14.14s' % COUNTRY[code]
        fit = smf.logit(formula, data=country_df).fit(disp=False)
        nobs, param, stars = extract_res(fit, var=var)
        flag = '<--' if stars and param > 0 else ''
        print(label, nobs, '%0.3g' % param, stars, flag, sep='\t')
In [40]:
# Per-country logits: coefficient of Internet use.
formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
run_logits(grouped, formula, 'netuse_f')
In [41]:
# Per-country logits: coefficient of income rank.
run_logits(grouped, formula, 'hincrank_f')
In [42]:
# Per-country logits: coefficient of education rank.
run_logits(grouped, formula, 'edurank_f')
Run OLS model with rlgdgr
: "Regardless of whether you belong to a particular religion, how religious would you say you are?"
In [43]:
# Pooled OLS of degree of religiosity on the same predictors.
formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
model = smf.ols(formula, data=df)
results = model.fit(disp=False)
results.summary()
Out[43]:
In [44]:
def run_ols(grouped, formula, var):
    """Fits an OLS model per country and prints one coefficient each.

    Output format matches run_logits: country name, sample size,
    coefficient, significance stars, and an arrow flagging
    significant positive associations.

    grouped: GroupBy of per-country DataFrames
    formula: Patsy formula string
    var: name of the variable whose coefficient is reported
    """
    for name, group in grouped:
        # use the readable country name and nobs, consistent with
        # run_logits (the original printed the raw code and len(group))
        country = '%14.14s' % COUNTRY[name]
        model = smf.ols(formula, data=group)
        results = model.fit(disp=False)
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(country, nobs, '%0.3g' % param, stars, arrow, sep='\t')
In [45]:
# Per-country OLS: coefficient of Internet use.
run_ols(grouped, formula, 'netuse_f')
In [46]:
# Per-country OLS: coefficient of education rank.
run_ols(grouped, formula, 'edurank_f')
In [47]:
# Per-country OLS: coefficient of income rank.
run_ols(grouped, formula, 'hincrank_f')
Let's see what happens if we add quadratic terms for edurank and hincrank:
In [48]:
# Quadratic terms for education and income ranks.
df['edurank_f2'] = df.edurank_f**2
df['hincrank_f2'] = df.hincrank_f**2
In [49]:
# Model with the education quadratic term added.
formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
In [50]:
# Per-country OLS: linear education coefficient (with quadratic in model).
run_ols(grouped, formula, 'edurank_f')
In [51]:
# Per-country OLS: quadratic education coefficient.
run_ols(grouped, formula, 'edurank_f2')
In [52]:
# Model with both quadratic terms added.
formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + '
           'hincrank_f + hincrank_f2 + '
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
In [53]:
# Per-country OLS: linear income coefficient (with both quadratics).
run_ols(grouped, formula, 'hincrank_f')
In [54]:
# Per-country OLS: linear education coefficient (with both quadratics).
run_ols(grouped, formula, 'edurank_f')
In [55]:
# Per-country OLS: Internet-use coefficient (with both quadratics).
run_ols(grouped, formula, 'netuse_f')
In [ ]: