This notebook presents explorations of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org).
Copyright 2015 Allen Downey
MIT License: http://opensource.org/licenses/MIT
In [100]:
from __future__ import print_function, division
import string
import random
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import thinkstats2
import thinkplot
import matplotlib.pyplot as plt
import ess
%matplotlib inline
Read all data
In [29]:
%time cycles = ess.read_all_cycles()
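The ess module is not reproduced in this notebook. As a rough sketch, read_all_cycles presumably loads one DataFrame per ESS round; the file names and format below (Stata files) are assumptions, not the actual implementation:

def read_all_cycles_sketch(filenames=None):
    """Reads one DataFrame per ESS cycle (sketch; file names are hypothetical)."""
    if filenames is None:
        filenames = ['ESS1.dta', 'ESS2.dta', 'ESS3.dta', 'ESS4.dta', 'ESS5.dta']
    # convert_categoricals=False keeps the numeric codes for the cleaning step
    return [pd.read_stata(filename, convert_categoricals=False)
            for filename in filenames]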
Clean the data
In [32]:
for cycle in cycles:
    %time ess.clean_cycle(cycle)
Resample
In [33]:
%time cycle_maps = [ess.resample(cycle) for cycle in cycles]
for cycle_map in cycle_maps:
    print(len(cycle_map), 'countries')
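ess.resample returns a map from country code to a resampled DataFrame. A minimal sketch of weighted resampling within each country, assuming a post-stratification weight column named pspwght (the actual column name is an assumption):

def resample_sketch(cycle, weight_col='pspwght'):
    """Resamples rows with replacement within each country (sketch).

    cycle: DataFrame for one ESS cycle
    weight_col: assumed name of the sampling-weight column
    returns: map from country code to resampled DataFrame
    """
    cycle_map = {}
    for code, group in cycle.groupby('cntry'):
        cycle_map[code] = group.sample(n=len(group), replace=True,
                                       weights=group[weight_col])
    return cycle_map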
Remove a few countries from a few cycles where they are missing data.
In [34]:
%time ess.remove_missing(cycle_maps)
Replace income and education with ranks
In [35]:
for cycle_map in cycle_maps:
    %time ess.replace_with_ranks(cycle_map)
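replace_with_ranks presumably converts raw income and education into within-country percentile ranks, which makes them comparable across countries with different scales. A sketch under that assumption (the raw column names hinctnta and eduyrs are guesses):

def replace_with_ranks_sketch(cycle_map):
    """Replaces income and education with within-country percentile ranks (sketch)."""
    for code, group in cycle_map.items():
        # rank(pct=True) maps each value to its percentile rank in [0, 1]
        group['hincrank'] = group['hinctnta'].rank(pct=True)
        group['edurank'] = group['eduyrs'].rank(pct=True)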
Fill missing values
In [36]:
for cycle_map in cycle_maps:
    %time ess.fill_vars_by_country(cycle_map)
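fill_vars_by_country apparently produces the _f variables used below by filling missing values within each country. A plausible sketch fills each variable's NaNs by drawing at random from the observed values in the same country (the _f naming convention and filling strategy are assumptions):

def fill_var_by_country_sketch(cycle_map, varname):
    """Creates varname + '_f', filling NaNs with values resampled from the
    same country (sketch).
    """
    for code, group in cycle_map.items():
        series = group[varname].copy()
        null = series.isnull()
        if null.any():
            valid = series[~null]
            series[null] = valid.sample(n=null.sum(), replace=True).values
        group[varname + '_f'] = series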
Concatenate the countries within each cycle
In [38]:
%time dfs = [ess.concat_groups(cycle_map) for cycle_map in cycle_maps]
for df in dfs:
    print(len(df))
Concatenate the cycles
In [39]:
%time df = pd.concat(dfs, ignore_index=True)
print(df.shape)
Group by country
In [15]:
grouped = df.groupby('cntry')
len(grouped)
Out[15]:
The previous steps are now in two functions. The first reads and cleans the data:
In [40]:
%time cycles = ess.read_and_clean()
The second resamples the data and fills missing values:
In [44]:
%time df = ess.resample_and_fill(cycles)
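Assuming these two wrappers simply chain the steps shown above, they might be composed roughly like this (a sketch, not the actual ess source):

def read_and_clean_sketch():
    cycles = ess.read_all_cycles()
    for cycle in cycles:
        ess.clean_cycle(cycle)
    return cycles

def resample_and_fill_sketch(cycles):
    cycle_maps = [ess.resample(cycle) for cycle in cycles]
    ess.remove_missing(cycle_maps)
    for cycle_map in cycle_maps:
        ess.replace_with_ranks(cycle_map)
        ess.fill_vars_by_country(cycle_map)
    dfs = [ess.concat_groups(cycle_map) for cycle_map in cycle_maps]
    return pd.concat(dfs, ignore_index=True)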
I run the resampling process a few hundred times and store the results in an HDF5 store.
In [45]:
store = pd.HDFStore('ess.resamples.h5')
It's pretty fast, considering the size of these DataFrames.
In [46]:
%time store.put('df', df)
In [48]:
%time df = store.get('df')
I store the resampled dataframes under random keys.
In [91]:
def random_name():
    t = [random.choice(string.ascii_letters) for i in range(6)]
    return ''.join(t)

random_name()
Out[91]:
Add dataframes to the store
In [378]:
def add_frames(n):
    for i in range(n):
        name = random_name()
        print(name)
        df = ess.resample_and_fill(cycles)
        store.put(name, df)
How many dataframes in the store?
In [379]:
keys = store.keys()
len(keys)
Out[379]:
Pick one at random
In [199]:
key = random.choice(keys)
df = store.get(key)
And run the logit model
In [201]:
formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = ess.run_model(df, formula)
res.summary()
Out[201]:
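ess.run_model is not shown; given the summary it returns, it presumably fits a logistic regression with statsmodels, roughly:

def run_model_sketch(df, formula):
    # a guess at ess.run_model: fit a logit model and return the results object
    model = smf.logit(formula, data=df)
    return model.fit(disp=False)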
I use Country objects to collect data associated with each country.
In [318]:
class Country:
    def __init__(self, code, nobs):
        self.code = code
        self.name = ess.country_name(code)
        self.nobs = nobs
        # one entry per resampled dataframe
        self.mean_seq = []
        self.param_seq = []
        self.param2_seq = []

    def add_mean(self, means):
        self.mean_seq.append(means)

    def add_params(self, params):
        self.param_seq.append(params)

    def add_params2(self, params):
        self.param2_seq.append(params)

    def get_means(self, varname):
        t = [mean[varname] for mean in self.mean_seq]
        return np.array(t)

    def get_params(self, varname):
        t = [params[varname] for params in self.param_seq]
        return np.array(t)

    def get_params2(self, varname):
        t = [params[varname] for params in self.param2_seq]
        return np.array(t)
Make the country objects
In [290]:
keys = store.keys()
key = random.choice(keys)
df = store.get(key)
grouped = df.groupby('cntry')
country_map = {}
for code, group in grouped:
    country_map[code] = Country(code, len(group))

print(country_map[code].name)
For each resampled frame, run both models and store the results in the Country objects
In [291]:
formula1 = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + '
            'edurank_f + hincrank_f +'
            'tvtot_f + rdtot_f + nwsptot_f + netuse_f')

formula2 = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + '
            'edurank_f + hincrank_f +'
            'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
def process_frame(df):
    grouped = df.groupby('cntry')
    for code, group in grouped:
        country = country_map[code]
        country.add_mean(group.mean())

        model = smf.logit(formula1, data=group)
        results = model.fit(disp=False)
        country.add_params(results.params)

        model = smf.ols(formula2, data=group)
        results = model.fit(disp=False)
        country.add_params2(results.params)
Process the dataframes in the store
In [292]:
for key in store.keys():
    print(key)
    df = store.get(key)
    process_frame(df)
Check a country
In [294]:
varname = 'netuse_f'
country = country_map['BE']
params = country.get_params2(varname)
len(params)
Out[294]:
Extract the given parameters for each country
In [380]:
def extract_params(country_map, param_func, varname):
    """Extracts parameters.

    country_map: map from country code to Country
    param_func: function that takes a Country and returns a param list
    varname: name of the variable to get the mean of

    returns: list of (code, name, param, low, high, mean) tuples
    """
    t = []
    for code, country in sorted(country_map.items()):
        name = country.name
        params = param_func(country)
        param = np.median(params)
        low = np.percentile(params, 2.5)
        high = np.percentile(params, 97.5)
        means = country.get_means(varname)
        mean = np.median(means)
        t.append((code, name, param, low, high, mean))

    t.sort(key=lambda x: x[2])
    return t
In [384]:
def extract_vars(country_map, exp_var, dep_var):
    def param_func(country):
        return country.get_params(exp_var)

    t = extract_params(country_map, param_func, dep_var)
    return t

def extract_vars2(country_map, exp_var, dep_var):
    def param_func(country):
        return country.get_params2(exp_var)

    t = extract_params(country_map, param_func, dep_var)
    return t
Plot confidence intervals for estimated parameters
In [320]:
def plot_cis(t):
    plt.figure(figsize=(8, 8))
    n = len(t)
    ys = n - np.arange(n)

    codes, names, params, lows, highs, means = zip(*t)

    plt.hlines(ys, lows, highs, color='blue', linewidth=2, alpha=0.5)
    plt.plot(params, ys, 'ws', markeredgewidth=0, markersize=15)

    for param, y, code in zip(params, ys, codes):
        plt.text(param, y, code, fontsize=10, color='blue',
                 horizontalalignment='center',
                 verticalalignment='center')

    plt.vlines(0, 0, n+1, color='gray', alpha=0.5)
    plt.yticks(ys, names)
In [469]:
plot_counter = 1

def save_plot(flag=False):
    global plot_counter
    if flag:
        root = 'ess3.%2.2d' % plot_counter
        thinkplot.Save(root=root, formats=['png'])
        plot_counter += 1
Make plots showing confidence intervals for the given parameters
In [470]:
t = extract_vars(country_map, 'inwyr07_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Interview year',
xlabel='log odds ratio interview year')
save_plot()
In [471]:
t = extract_vars(country_map, 'yrbrn60_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Year born',
xlabel='log odds ratio year born')
save_plot()
In [472]:
t = extract_vars(country_map, 'edurank_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Education rank',
xlabel='log odds ratio education rank')
save_plot()
In [473]:
t = extract_vars(country_map, 'hincrank_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Income rank',
xlabel='log odds ratio income rank')
save_plot()
In [474]:
t = extract_vars(country_map, 'tvtot_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Television',
xlabel='log odds ratio television')
save_plot()
In [475]:
t = extract_vars(country_map, 'rdtot_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Radio',
xlabel='log odds ratio radio')
save_plot()
In [476]:
t = extract_vars(country_map, 'nwsptot_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Hours of newspaper per week',
xlabel='log odds ratio per hour newspaper')
save_plot()
In [477]:
t = extract_vars(country_map, 'netuse_f', 'hasrelig_f')
plot_cis(t)
thinkplot.Config(title='Internet',
xlabel='log odds ratio Internet',
xlim=[-0.15, 0.1])
save_plot()
Make a scatter plot of the fraction with religious affiliation versus the log odds ratio of netuse.
In [478]:
def plot_scatter(t):
    plt.figure(figsize=(8, 8))
    codes, names, params, lows, highs, means = zip(*t)

    for param, mean, code in zip(params, means, codes):
        plt.text(param, mean, code, fontsize=10, color='blue',
                 horizontalalignment='center',
                 verticalalignment='center')

    corr = np.corrcoef(params, means)[0][1]
    print(corr)
In [479]:
t = extract_vars(country_map, 'netuse_f', 'hasrelig_f')
plot_scatter(t)
thinkplot.Config(title='',
xlabel='log odds ratio Internet',
ylabel='fraction with affiliation',
xlim=[-0.15, 0.05])
save_plot()
Make similar figures for the second model, with degree of religiosity as the dependent variable.
In [480]:
t = extract_vars2(country_map, 'inwyr07_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Interview year',
xlabel='linear coefficient interview year')
save_plot()
In [481]:
t = extract_vars2(country_map, 'yrbrn60_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Year born',
xlabel='linear coefficient year born')
save_plot()
In [482]:
t = extract_vars2(country_map, 'edurank_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Education rank',
xlabel='linear coefficient educational rank')
save_plot()
In [483]:
t = extract_vars2(country_map, 'hincrank_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Income rank',
xlabel='linear coefficient income rank')
save_plot()
In [484]:
t = extract_vars2(country_map, 'tvtot_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Television',
xlabel='linear coefficient television')
save_plot()
In [485]:
t = extract_vars2(country_map, 'rdtot_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Radio',
xlabel='linear coefficient radio')
save_plot()
In [486]:
t = extract_vars2(country_map, 'nwsptot_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Newspaper',
xlabel='linear coefficient newspaper')
save_plot()
In [487]:
t = extract_vars2(country_map, 'netuse_f', 'rlgdgr_f')
plot_cis(t)
thinkplot.Config(title='Internet',
xlabel='linear coefficient Internet')
save_plot()
Here's the scatter plot of the effect size on rlgdgr versus the mean value of rlgdgr.
rlgdgr is on a 0 to 10 scale, so it is mildly astonishing that national means vary as much as they do, from 2.5 to 7.
In [488]:
t = extract_vars2(country_map, 'netuse_f', 'rlgdgr_f')
plot_scatter(t)
thinkplot.Config(title='',
                 xlabel='linear coefficient Internet',
                 ylabel='degree of religiosity',
                 xlim=[-0.35, 0.1], ylim=[1, 8])
save_plot()
The correlation is dragged down by the outliers; without them, it is more like what we saw above.
In [489]:
codes, names, params, lows, highs, means = zip(*t[2:])
corr = np.corrcoef(params, means)[0][1]
print(corr)
The following is preliminary work on the next step, showing effect sizes in terms of probability of religious affiliation.
In [490]:
df.head()
Out[490]:
In [491]:
grouped = df.groupby('cntry')
group = grouped.get_group('NL')
In [492]:
formula1 = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + '
            'edurank_f + hincrank_f +'
            'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
model = smf.logit(formula1, data=group)
results = model.fit(disp=False)

mean = group.mean()
low_net = np.percentile(group['netuse_f'], 25)
high_net = np.percentile(group['netuse_f'], 75)

def prob_hasrelig(results, df):
    # predicted probability of affiliation for a single row of predictors
    return results.predict(df)[0]

print(mean['hasrelig_f'])
print(low_net, high_net)
print(prob_hasrelig(results, mean))

mean.netuse_f = low_net
print(prob_hasrelig(results, mean))

mean.netuse_f = high_net
print(prob_hasrelig(results, mean))
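A natural way to summarize this as an effect size (an assumption about the intended next step, not shown above) is the difference between the last two predicted probabilities:

# hypothetical summary: change in predicted probability of affiliation as
# netuse_f moves from the 25th to the 75th percentile, other predictors at their means
mean.netuse_f = low_net
p_low = prob_hasrelig(results, mean)
mean.netuse_f = high_net
p_high = prob_hasrelig(results, mean)
print('change in probability of affiliation:', p_high - p_low)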
In [ ]: