In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 21 14:35:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 
import statsmodels.api as sm

# Variables Descriptions
INCOME = "2010 Gross Domestic Product per capita in constant 2000 US$"
ALCOHOL = "2008 alcohol consumption (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)

# Load from CSV
data0 = pd.read_csv('~/dev/coursera/gapminder.csv', skip_blank_lines=True,
                     usecols=['country','incomeperperson',
                              'alcconsumption','lifeexpectancy', 'urbanrate'])

In [2]:
def to_num(columns, data):
    # Convert each listed column to numeric; unparseable values become NaN
    for col in columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    return data
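
A quick illustration of the coercion behavior (hypothetical values, not from the gapminder data): with errors='coerce', anything pd.to_numeric cannot parse becomes NaN, which is what lets dropna clean the dataset below.

# Sketch: unparseable strings become NaN ('3.5' -> 3.5, 'abc' -> NaN, '' -> NaN)
pd.to_numeric(pd.Series(['3.5', 'abc', '']), errors='coerce')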

In [3]:
# Rename columns for clarity                                    
data0.columns = ['country','income','alcohol','life','urban_rate']

# Convert columns to numeric (invalid values become NaN)
data0 = to_num(('income','alcohol','life','urban_rate'), data0)

# Remove rows with NaN values
data0 = data0.dropna(axis=0, how='any')

# Copy the dataframe to preserve the original
data1 = data0.copy()

In [4]:
# Mean, Min and Max of life expectancy
meal = data1.life.mean()
minl = data1.life.min() 
maxl = data1.life.max()
print (tabulate([[np.floor(minl), meal, np.ceil(maxl)]], 
                tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))


╒═══════╤═════════╤═══════╕
│   Min │    Mean │   Max │
╞═══════╪═════════╪═══════╡
│    47 │ 69.3857 │    84 │
╘═══════╧═════════╧═══════╛

In [5]:
# Create categorical response variable life (two levels based on the mean)
data1['life'] = pd.cut(data0.life,[np.floor(minl),meal,np.ceil(maxl)], labels=[0,1])
data1['life'] = data1['life'].astype('category')
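
As a quick check (an addition of mine, not executed in the original notebook), the split around the mean can be inspected directly; level 0 collects countries below the mean life expectancy and level 1 those above it.

# Sketch: counts per level of the dichotomized response
print(data1['life'].value_counts())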

In [6]:
# Mean, Min and Max of alcohol
meaa = data1.alcohol.mean()
mina = data1.alcohol.min() 
maxa = data1.alcohol.max()
print (tabulate([[np.floor(mina), meaa, np.ceil(maxa)]], 
                tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))


╒═══════╤═════════╤═══════╕
│   Min │    Mean │   Max │
╞═══════╪═════════╪═══════╡
│     0 │ 6.78409 │    24 │
╘═══════╧═════════╧═══════╛

In [7]:
# Categorical explanatory variable alcohol (two levels based on the mean)
data1['alcohol'] = pd.cut(data0.alcohol,[np.floor(mina),meaa,np.ceil(maxa)], 
                          labels=[0,1])
data1["alcohol"] = data1["alcohol"].astype('category')

In [8]:
data1 = to_num( ('income','alcohol','life', 'urban_rate'), data1 )

In [9]:
lreg1 = smf.logit(formula = 'life ~ alcohol',data=data1).fit()
print (lreg1.summary())


Optimization terminated successfully.
         Current function value: 0.620828
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   life   No. Observations:                  171
Model:                          Logit   Df Residuals:                      169
Method:                           MLE   Df Model:                            1
Date:                Sat, 08 Oct 2016   Pseudo R-squ.:                 0.08246
Time:                        14:19:59   Log-Likelihood:                -106.16
converged:                       True   LL-Null:                       -115.70
                                        LLR p-value:                 1.252e-05
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.2091      0.205     -1.019      0.308        -0.611     0.193
alcohol        1.4363      0.344      4.178      0.000         0.763     2.110
==============================================================================
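
On the probability scale, the logistic link gives P(life = 1) = 1 / (1 + exp(-(b0 + b1*alcohol))). Plugging in the estimates above is a back-of-the-envelope calculation, not part of the original output:

# Sketch: predicted probabilities from the model 1 estimates
b0, b1 = -0.2091, 1.4363                 # intercept and alcohol coefficient from the summary
p_low  = 1 / (1 + np.exp(-b0))           # alcohol below its mean: about 0.45
p_high = 1 / (1 + np.exp(-(b0 + b1)))    # alcohol above its mean: about 0.77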

In [10]:
print ("Odds Ratios")
print (np.exp(lreg1.params))


Odds Ratios
Intercept   0.811321
alcohol     4.205198
dtype: float64

In [11]:
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']

print (np.exp(conf))


           Lower CI  Upper CI       OR
Intercept  0.542615  1.213092 0.811321
alcohol    2.143658  8.249307 4.205198

In [12]:
# Mean, Min and Max of income
meai = data1.income.mean()
mini = data1.income.min() 
maxi = data1.income.max()
print (tabulate([[np.floor(mini), meai, np.ceil(maxi)]], 
                tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))


╒═══════╤═════════╤═══════╕
│   Min │    Mean │   Max │
╞═══════╪═════════╪═══════╡
│   103 │ 7006.36 │ 52302 │
╘═══════╧═════════╧═══════╛

In [13]:
lreg2 = smf.logit(formula = 'life ~ alcohol + income',data=data1).fit()
print (lreg2.summary())


Optimization terminated successfully.
         Current function value: 0.344413
         Iterations 10
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   life   No. Observations:                  171
Model:                          Logit   Df Residuals:                      168
Method:                           MLE   Df Model:                            2
Date:                Sat, 08 Oct 2016   Pseudo R-squ.:                  0.4910
Time:                        14:19:59   Log-Likelihood:                -58.895
converged:                       True   LL-Null:                       -115.70
                                        LLR p-value:                 2.131e-25
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -2.1615      0.399     -5.421      0.000        -2.943    -1.380
alcohol        0.1801      0.501      0.360      0.719        -0.802     1.162
income         0.0010      0.000      5.150      0.000         0.001     0.001
==============================================================================

Possibly complete quasi-separation: A fraction 0.21 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
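
The warning above means that, once income enters the model, some countries' outcomes are fitted with probability essentially 0 or 1. One rough way to see how many observations are involved (my own check, with an arbitrary threshold):

# Sketch: fraction of fitted probabilities pinned at 0 or 1
probs = lreg2.predict(data1)
print(((probs < 1e-8) | (probs > 1 - 1e-8)).mean())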

In [14]:
params = lreg2.params
conf = lreg2.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']

print (np.exp(conf))


           Lower CI  Upper CI       OR
Intercept  0.052706  0.251564 0.115147
alcohol    0.448563  3.196217 1.197373
income     1.000648  1.001444 1.001046
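
The income odds ratio is per single US dollar, which is why it sits so close to 1. A clearer reading comes from rescaling income before refitting; the sketch below uses an assumed column name income_k and is not part of the original analysis.

# Sketch: express income in thousands of US$ so the odds ratio is per $1,000
data1['income_k'] = data1['income'] / 1000.0
lreg3 = smf.logit(formula='life ~ alcohol + income_k', data=data1).fit()
print(np.exp(lreg3.params))   # income_k odds ratio is per $1,000 increase in income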