In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 21 14:35:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import statsmodels.api as sm
# Human-readable descriptions of the Gapminder variables
INCOME = "2010 Gross Domestic Product per capita in constant 2000 US$"
ALCOHOL = "2008 alcohol consumption (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# Show floats in fixed-point notation (workaround for display-format
# run-time errors)
pd.set_option('display.float_format', lambda value: '%f' % value)

# Read only the five columns this analysis needs from the CSV
data0 = pd.read_csv(
    '~/dev/coursera/gapminder.csv',
    usecols=['country', 'incomeperperson', 'alcconsumption',
             'lifeexpectancy', 'urbanrate'],
    skip_blank_lines=True,
)
In [2]:
def to_num(list, data):
    """Convert the named columns of ``data`` to numeric dtype, in place.

    Values that cannot be parsed as numbers are replaced by NaN.

    Parameters
    ----------
    list : iterable of str
        Labels of the columns to convert. (The name shadows the builtin
        ``list``; it is kept so existing callers keep working.)
    data : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for column in list:
        # `errors` must be the keyword value 'coerce'. The previous positional
        # string 'errors=coerce' is not a valid option and is rejected by
        # current pandas with a ValueError.
        data[column] = pd.to_numeric(data[column], errors='coerce')
    return data
In [3]:
# Give the columns short names. Map old -> new explicitly instead of assigning
# a positional list: read_csv returns columns in file order regardless of the
# order given in `usecols`, so a positional assignment could mislabel them.
data0 = data0.rename(columns={
    'incomeperperson': 'income',
    'alcconsumption': 'alcohol',
    'lifeexpectancy': 'life',
    'urbanrate': 'urban_rate',
})
# Convert the measurement columns to numeric (invalid entries become NaN)
data0 = to_num(('income', 'alcohol', 'life', 'urban_rate'), data0)
# Remove every row that has a NaN in any column
data0 = data0.dropna(axis=0, how='any')
# Work on a copy so the cleaned numeric frame stays available as data0
data1 = data0.copy()
In [4]:
# Mean, min and max of life expectancy.
# Read from data0 (the cleaned numeric frame): data1['life'] is later
# overwritten with a 0/1 category, so taking the statistics from data1 would
# give different values (or fail) if this cell were re-run afterwards.
meal = data0.life.mean()
minl = data0.life.min()
maxl = data0.life.max()
print(tabulate([[np.floor(minl), meal, np.ceil(maxl)]],
               tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))
In [5]:
# Categorical response variable life, two levels split at the mean:
# 0 = at or below the mean, 1 = above it.
# include_lowest=True closes the first bin on the left, so a value equal to the
# lower edge (a minimum that is already a whole number) is not turned into NaN.
data1['life'] = pd.cut(
    data0.life,
    bins=[np.floor(minl), meal, np.ceil(maxl)],
    labels=[0, 1],
    include_lowest=True,
).astype('category')
In [6]:
# Mean, min and max of alcohol consumption.
# Read from data0 (the cleaned numeric frame): data1['alcohol'] is later
# overwritten with a 0/1 category, so taking the statistics from data1 would
# give different values (or fail) if this cell were re-run afterwards.
meaa = data0.alcohol.mean()
mina = data0.alcohol.min()
maxa = data0.alcohol.max()
print(tabulate([[np.floor(mina), meaa, np.ceil(maxa)]],
               tablefmt="fancy_grid", headers=['Min', 'Mean', 'Max']))
In [7]:
# Categorical explanatory variable alcohol, two levels split at the mean:
# 0 = at or below the mean, 1 = above it.
# include_lowest=True closes the first bin on the left, so a value equal to the
# lower edge (a minimum that is already a whole number) is not turned into NaN.
data1['alcohol'] = pd.cut(
    data0.alcohol,
    bins=[np.floor(mina), meaa, np.ceil(maxa)],
    labels=[0, 1],
    include_lowest=True,
).astype('category')
In [8]:
# Turn the 0/1 category columns (life, alcohol) back into plain numbers for
# the models below; income and urban_rate go through the same conversion.
data1 = to_num(
    ('income', 'alcohol', 'life', 'urban_rate'),
    data1,
)
In [9]:
# Logistic regression of the binary life response on the alcohol level alone
lreg1 = (
    smf.logit(formula='life ~ alcohol', data=data1)
    .fit()
)
print(lreg1.summary())
In [10]:
# Exponentiate the fitted coefficients of the first model to get odds ratios
print("Odds Ratios", np.exp(lreg1.params), sep="\n")
In [11]:
# Odds ratios with confidence bounds for the first model: put the coefficient
# bounds and the estimates side by side, then exponentiate the whole table.
params = lreg1.params
conf = lreg1.conf_int().assign(OR=params)
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))
In [12]:
# Summary of income: minimum (rounded down), mean, maximum (rounded up)
mini = data1['income'].min()
maxi = data1['income'].max()
meai = data1['income'].mean()
print(tabulate(
    [[np.floor(mini), meai, np.ceil(maxi)]],
    headers=['Min', 'Mean', 'Max'],
    tablefmt="fancy_grid",
))
In [13]:
# Second model: add income as an additional explanatory variable
lreg2 = (
    smf.logit(formula='life ~ alcohol + income', data=data1)
    .fit()
)
print(lreg2.summary())
In [14]:
# Odds ratios with confidence bounds for the second model: put the coefficient
# bounds and the estimates side by side, then exponentiate the whole table.
params = lreg2.params
conf = lreg2.conf_int().assign(OR=params)
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))