In [37]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 13 12:35:39 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)

# Load from CSV
data1 = pd.read_csv('gapminder.csv', skip_blank_lines=True,
                    usecols=['country','alcconsumption', 'lifeexpectancy'])
 
# Rename columns for clarity                                    
data1.columns = ['country','alcohol','life']

# Variables Descriptions
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# converting to numeric values and parsing (numeric invalids=NaN)
for dt in ('alcohol','life') :
   data1[dt] = pd.to_numeric(data1[dt], 'errors=coerce') 

# Remove rows with nan values
data1 = data1.dropna(axis=0, how='any')

# Copy dataframe for univariate categorical variables
data2 = data1.copy()

In [38]:
# create categorical variable for life expctancy divided in five ranges 

alcohol_map = {1: '>=0 <5', 2: '>=5 <10', 3: '>=10 <15', 4: '>=15 <20', 5: '>=20 <25'}
data2['alcohol'] = pd.cut(data1.alcohol,[0,5,10,15,20,25], labels=['1','2','3','4','5'])

data2["alcohol"] = data2["alcohol"].astype('category')
data2["alcohol"] = data2["alcohol"].cat.rename_categories([i for i in alcohol_map.values()])

In [39]:
# using ols function for calculating the F-statistic and associated p value
model1 = smf.ols(formula='life ~ C(alcohol)', data=data2)
results1 = model1.fit()
print (results1.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   life   R-squared:                       0.125
Model:                            OLS   Adj. R-squared:                  0.105
Method:                 Least Squares   F-statistic:                     6.112
Date:                Sat, 13 Aug 2016   Prob (F-statistic):           0.000128
Time:                        20:41:12   Log-Likelihood:                -639.68
No. Observations:                 176   AIC:                             1289.
Df Residuals:                     171   BIC:                             1305.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------
Intercept                 65.9337      1.060     62.212      0.000        63.842    68.026
C(alcohol)[T.>=5 <10]      3.6651      1.625      2.255      0.025         0.457     6.873
C(alcohol)[T.>=10 <15]     9.5628      1.978      4.834      0.000         5.658    13.468
C(alcohol)[T.>=15 <20]     5.6221      3.126      1.798      0.074        -0.548    11.793
C(alcohol)[T.>=20 <25]     3.3833      9.360      0.361      0.718       -15.093    21.860
==============================================================================
Omnibus:                       16.670   Durbin-Watson:                   1.863
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.421
Skew:                          -0.804   Prob(JB):                     6.06e-05
Kurtosis:                       2.746   Cond. No.                         14.4
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [40]:
means = [data2[data2.alcohol=='>=0 <5'].mean(),
          data2[data2.alcohol=='>=5 <10'].mean(),
          data2[data2.alcohol=='>=10 <15'].mean(), 
          data2[data2.alcohol=='>=15 <20'].mean(),
          data2[data2.alcohol=='>=20 <25'].mean() ]

print (tabulate([means], tablefmt="fancy_grid", headers=[i for i in alcohol_map.values()]))


╒══════════╤═══════════╤════════════╤════════════╤════════════╕
│   >=0 <5 │   >=5 <10 │   >=10 <15 │   >=15 <20 │   >=20 <25 │
╞══════════╪═══════════╪════════════╪════════════╪════════════╡
│  65.9337 │   69.5987 │    75.4965 │    71.5558 │     69.317 │
╘══════════╧═══════════╧════════════╧════════════╧════════════╛

In [41]:
stds = [data2[data2.alcohol=='>=0 <5'].std(),
         data2[data2.alcohol=='>=5 <10'].std(),
         data2[data2.alcohol=='>=10 <15'].std(), 
         data2[data2.alcohol=='>=15 <20'].std(),
         data2[data2.alcohol=='>=20 <25'].std() ]

print (tabulate([stds], tablefmt="fancy_grid", headers=[i for i in alcohol_map.values()]))


╒══════════╤═══════════╤════════════╤════════════╤════════════╕
│   >=0 <5 │   >=5 <10 │   >=10 <15 │   >=15 <20 │   >=20 <25 │
╞══════════╪═══════════╪════════════╪════════════╪════════════╡
│  9.16795 │   10.4934 │    7.67605 │    7.20923 │        nan │
╘══════════╧═══════════╧════════════╧════════════╧════════════╛

In [42]:
mc1 = multi.MultiComparison(data2['life'], data2['alcohol'])
res1 = mc1.tukeyhsd()
print(res1.summary())


Multiple Comparison of Means - Tukey HSD,FWER=0.05
==================================================
 group1   group2  meandiff  lower    upper  reject
--------------------------------------------------
 >=0 <5  >=10 <15  9.5628   4.1087   15.017  True 
 >=0 <5  >=15 <20  5.6221  -2.9969  14.2411 False 
 >=0 <5  >=20 <25  3.3833  -22.4241 29.1908 False 
 >=0 <5  >=5 <10   3.6651  -0.8153   8.1454 False 
>=10 <15 >=15 <20 -3.9407  -13.2658  5.3844 False 
>=10 <15 >=20 <25 -6.1795  -32.2313 19.8723 False 
>=10 <15 >=5 <10  -5.8978   -11.62  -0.1755  True 
>=15 <20 >=20 <25 -2.2388  -29.1318 24.6542 False 
>=15 <20 >=5 <10  -1.9571  -10.7482  6.834  False 
>=20 <25 >=5 <10   0.2817  -25.5837 26.1472 False 
--------------------------------------------------