In [273]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 13 12:35:39 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
import scipy.stats

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)

# Load from CSV
data1 = pd.read_csv('gapminder.csv', skip_blank_lines=True,
                    usecols=['country','alcconsumption', 'lifeexpectancy'])
 
# Rename columns for clarity                                    
data1.columns = ['country','alcohol','life']

# Variables Descriptions
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# converting to numeric values and parsing (numeric invalids=NaN)
for dt in ('alcohol','life') :
   data1[dt] = pd.to_numeric(data1[dt], 'errors=coerce') 

# Remove rows with nan values
data1 = data1.dropna(axis=0, how='any')

# Copy dataframe for univariate categorical variables
data2 = data1.copy()

hypothesis

Test the hypothesis about alcohol consumption and life expectancy Specifically, is how quantity litters alcohol consumption per year in a country is related a life expectancy, or in hypothesis testing terms, is the quantity of alcohol consumption and life expectancy is independent or dependent? For this analysis, I'm going to use a categorical explanatory variable with five levels, with the following categorical values: Alcohol consumption (per year, in liters) from 0 to 5, from 5 to 10, from 10 to 15, from 15 to 20 and from 20 to 25

My response variable is categorical with 2 levels. That is, life expectancy greater than or less than the mean of all countries in gapmind data set.


In [274]:
''' Categoriacal explanatory variable with five levels '''
alcohol_map = {1: '>=0 <5', 2: '>=5 <10', 3: '>=10 <15', 4: '>=15 <20', 5: '>=20 <25'}
data2['alcohol'] = pd.cut(data1.alcohol,[0,5,10,15,20,25], 
                          labels=[i for i in alcohol_map.values()])
data2["alcohol"] = data2["alcohol"].astype('category')

In [275]:
# Mean, Min and Max of life expectancy
meal = data2.life.mean()
minl = data2.life.min() 
maxl = data2.life.max()
print (tabulate([[np.floor(minl), meal, np.ceil(maxl)]], 
                tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))


╒═══════╤═════════╤═══════╕
│   Min │    Mean │   Max │
╞═══════╪═════════╪═══════╡
│    47 │ 69.1437 │    84 │
╘═══════╧═════════╧═══════╛

In [276]:
# Create categorical response variable life (Two levels based on mean)
life_map = {1: '<=69', 2: '>69'}
data2['life'] = pd.cut(data1.life,[np.floor(minl),meal,np.ceil(maxl)], labels=[i for i in life_map.values()])
data2["life"] = data2["life"].astype('category')

In [277]:
# contingency table of observed counts
ct1=pd.crosstab(data2['life'], data2['alcohol'])
headers_alcohol = [i for i in ct1.keys()]
headers_alcohol.insert(0,'life/alcool')
print (tabulate(ct1,tablefmt="fancy_grid",headers=headers_alcohol))


╒═══════════════╤══════════╤═══════════╤════════════╤════════════╤════════════╕
│ life/alcool   │   >=0 <5 │   >=5 <10 │   >=10 <15 │   >=15 <20 │   >=20 <25 │
╞═══════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╡
│ <=69          │       44 │        20 │          4 │          3 │          0 │
├───────────────┼──────────┼───────────┼────────────┼────────────┼────────────┤
│ >69           │       33 │        37 │         27 │          7 │          1 │
╘═══════════════╧══════════╧═══════════╧════════════╧════════════╧════════════╛

Examining these column percents for those with life expectancy (greater or less than mean) we see that as alcohol consumption (from 5) until 15 liters per year increase, the life expectancy also increase.


In [278]:
''' 
Generate the column percentages wich show the percent of individuals with level of
live expectancy within each alcohol consumption level.
'''
colsum = ct1.sum(axis=0)
colpct = ct1/colsum
headers = [i for i in colpct.keys()]
headers.insert(0,'Life/Alcohol')
print (tabulate(colpct, tablefmt="fancy_grid", headers=headers, floatfmt=".3f"))


╒════════════════╤══════════╤═══════════╤════════════╤════════════╤════════════╕
│ Life/Alcohol   │   >=0 <5 │   >=5 <10 │   >=10 <15 │   >=15 <20 │   >=20 <25 │
╞════════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╡
│ <=69           │    0.571 │     0.351 │      0.129 │      0.300 │      0.000 │
├────────────────┼──────────┼───────────┼────────────┼────────────┼────────────┤
│ >69            │    0.429 │     0.649 │      0.871 │      0.700 │      1.000 │
╘════════════════╧══════════╧═══════════╧════════════╧════════════╧════════════╛

In [279]:
'''
compute the expected frequencies for the table based on the marginal sums
under the assumption that the groups associated with each dimension are independent.
'''
print ('expected frequencies.')
print  (tabulate(scipy.stats.contingency.expected_freq(ct1), 
                 tablefmt="fancy_grid", floatfmt=".3f"))


expected frequencies.
╒════════╤════════╤════════╤═══════╤═══════╕
│ 31.062 │ 22.994 │ 12.506 │ 4.034 │ 0.403 │
├────────┼────────┼────────┼───────┼───────┤
│ 45.938 │ 34.006 │ 18.494 │ 5.966 │ 0.597 │
╘════════╧════════╧════════╧═══════╧═══════╛

In [280]:
'''
Graph the percent of population with life expectancy greather 
than mean (69.14) within each consumption alcohol category
'''

# Create categorical response variable life (numeric) (Two levels based on mean)
data2['life_n'] = data2.life
life_map = {1: 0, 2: 1}
data2['life_n'] = pd.cut(data1.life,[np.floor(minl),meal,np.ceil(maxl)], 
                         labels=[i for i in life_map.values()])
#data2["life_n"] = data2["life_n"].astype('category')

data2.life_n = pd.to_numeric(data2.life_n)

seaborn.factorplot(x='alcohol', y='life_n', data=data2, kind='bar', ci=None)
plt.xlabel('Alcohol consumption')
plt.ylabel('Life Expectancy')


Out[280]:
<matplotlib.text.Text at 0x7f530e1be198>

The analysis of this graph without other analysis (like the frequencies), maybe can lead an error. It seems to show that the most country with the life expectancy greater than the mean are those that alcohol consumption is in the range between 20 and 25 liters. To help solve this issue, I used the countplot seaborn function, that is "A special case for the bar plot is when you want to show the number of observations in each category rather than computing a statistic for a second variable. This is similar to a histogram over a categorical, rather than quantitative, variable" Se here.

On this graph is easy to see that only 1 observation was realized in the column >=20 <=25.


In [281]:
seaborn.countplot(x='alcohol', data=data2, palette='Greens_d')


Out[281]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f530e2b4c88>

In [282]:
'''Chi-square calculations, wich include the chi-square value, the associated
p-vale, and a table of expected counts that ares used in these calculations.'''

cs1 = scipy.stats.chi2_contingency(ct1)
results = OrderedDict()
results['chi-square'] = cs1[0] 
results['p-value'] = cs1[1]
results['df'] = cs1[2]
print (tabulate([results.values()], tablefmt="fancy_grid", 
                headers=[i for i in results.keys()]))

print ('\nThe expected frequencies, based on the marginal sums of the table.')
print (tabulate(cs1[3]))


╒══════════════╤═════════════╤══════╕
│   chi-square │     p-value │   df │
╞══════════════╪═════════════╪══════╡
│      20.5031 │ 0.000397211 │    4 │
╘══════════════╧═════════════╧══════╛

The expected frequencies, based on the marginal sums of the table.
-------  -------  -------  -------  --------
31.0625  22.9943  12.5057  4.03409  0.403409
45.9375  34.0057  18.4943  5.96591  0.596591
-------  -------  -------  -------  --------

Post hoc Bonferroni Adjustment

Looking at the significant P value, we will accept the alternate Hypothesis, where not all life expectancy rates are equal across alcohol consumption categories. If my explanatory variable had only two levels, I could interpret the two corresponding column percentages and be able to say wich group had a significantly higher rate of life expectancy. But my explanatory variable has five categories. So I know that not all are equal. But I don't know wich are different and wich are not.


In [283]:
''' 
Post hoc Bonferroni Adjustment
On Bonferroni Adjustment p value had adjusted dividing p 0.05 by 
the number of comparisions that we plan to make
'''
p_for_reject_h0 = .05/10

pairs = []
pairs.append((('>=0 <5','>=5 <10'),[0,5,5,10]))
pairs.append((('>=0 <5','>=10 <15'),[0,5,10,15]))
pairs.append((('>=0 <5','>=15 <20'),[0,5,15,20]))
pairs.append((('>=0 <5','>=20 <25'),[0,5,20,25]))
pairs.append((('>=5 <10','>=10 <15'),[5,10,10,15]))
pairs.append((('>=5 <10','>=15 <20'),[5,10,15,20]))
pairs.append((('>=5 <10','>=20 <25'),[5,10,20,25]))
pairs.append((('>=10 <15','>=15 <20'),[10,15,15,20]))
pairs.append((('>=10 <15','>=20 <25'),[10,15,20,25]))
pairs.append((('>=15 <20','>=20 <25'),[15,20,20,25]))

data_pairs = []
results=[]
for pair in pairs:
    data_pair = data2[ (((data1.alcohol>pair[1][0]) & (data1.alcohol<=pair[1][1])) | 
                  ((data1.alcohol>pair[1][2]) & (data1.alcohol<=pair[1][3])))]
        
    ct0=pd.crosstab( data_pair['life'], data_pair['alcohol'])
    ct1 = ct0[ [pair[0][0],pair[0][1]] ]
    cs0 = scipy.stats.chi2_contingency(ct1)
    #print (ct0)
    
    # chi-square, p=value and degree of freedom
    reject = 'yes' if (cs0[1] < .05/10) else 'no' 
    results.append((pair[0],cs0[0],cs0[1],cs0[2], reject))
 
print (tabulate(results, tablefmt="fancy_grid", 
                headers=['groups', 'chi-square', 'p-value', 'df', 'h0 reject'] ))


╒══════════════════════════╤══════════════╤═════════════╤══════╤═════════════╕
│ groups                   │   chi-square │     p-value │   df │ h0 reject   │
╞══════════════════════════╪══════════════╪═════════════╪══════╪═════════════╡
│ ('>=0 <5', '>=5 <10')    │   5.53236    │ 0.0186679   │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=0 <5', '>=10 <15')   │  15.773      │ 7.14144e-05 │    1 │ yes         │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=0 <5', '>=15 <20')   │   1.64614    │ 0.199485    │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=0 <5', '>=20 <25')   │   0.0169283  │ 0.89648     │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=5 <10', '>=10 <15')  │   3.92657    │ 0.0475288   │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=5 <10', '>=15 <20')  │   0.00235204 │ 0.96132     │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=5 <10', '>=20 <25')  │   0.108449   │ 0.741917    │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=10 <15', '>=15 <20') │   0.586965   │ 0.443595    │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=10 <15', '>=20 <25') │   1.32719    │ 0.249306    │    1 │ no          │
├──────────────────────────┼──────────────┼─────────────┼──────┼─────────────┤
│ ('>=15 <20', '>=20 <25') │   0.286458   │ 0.592499    │    1 │ no          │
╘══════════════════════════╧══════════════╧═════════════╧══════╧═════════════╛

Conclusion

Although in the first chi-square calculus the p-value indicated that the null hypothesis could be rejected (p-value < .05), the table above, resulting of Post hoc Bonferroni Adjustment, where the chi2 is calculated for all group each against each other (two by two), show us that the only comparison that we can reject the null hypothesis is >=0 < 5 and >=10 < 15 suggesting that if exists correlation between life expectancy and alcohol consumption is among the countries that have the levels of alcohol consumption between 0 TO 5 and 10 and 15 liters.


In [ ]: