In [37]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 13 12:35:39 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
# Show floats in plain fixed-point notation; the original note on this setting
# says it works around run time errors in display formatting.
def _fixed_point(value):
    """Format a float for display using fixed-point notation."""
    return '%f' % value


pd.set_option('display.float_format', _fixed_point)
# Load only the three columns this analysis needs from the CSV file.
data1 = pd.read_csv('gapminder.csv', skip_blank_lines=True,
                    usecols=['country', 'alcconsumption', 'lifeexpectancy'])
# Rename by explicit mapping instead of assigning a positional list to
# `.columns`: read_csv keeps the file's own column order and ignores the order
# given in `usecols`, so a positional rename could silently mislabel columns.
data1 = data1.rename(columns={'alcconsumption': 'alcohol',
                              'lifeexpectancy': 'life'})
# Fix the column order so it no longer depends on the layout of the file.
data1 = data1[['country', 'alcohol', 'life']]
# Human-readable descriptions of the two analysis variables.
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"
# Convert the measurement columns to numeric dtype. errors='coerce' turns
# entries that cannot be parsed into NaN; it has to be passed as a keyword
# argument, because the single string 'errors=coerce' is not a valid value
# for `errors` and makes current pandas raise a ValueError.
for column in ('alcohol', 'life'):
    data1[column] = pd.to_numeric(data1[column], errors='coerce')
# Drop every row that has a NaN in any column.
data1 = data1.dropna(axis=0, how='any')
# Independent copy used for the categorical (banded) version of the data.
data2 = data1.copy()
In [38]:
# Create a categorical variable for alcohol consumption, split into five
# bands of width 5. An OrderedDict keeps the band order fixed on every Python
# version, so the labels and any table headers derived from it always line up.
alcohol_map = OrderedDict([(1, '>=0 <5'), (2, '>=5 <10'), (3, '>=10 <15'),
                           (4, '>=15 <20'), (5, '>=20 <25')])
# right=False makes each bin closed on the left and open on the right
# ([0, 5), [5, 10), ...), which is what the '>=a <b' labels state. The pd.cut
# default is (a, b], which contradicts the labels and maps an exact 0 to NaN.
# The labels are passed directly, so no intermediate relabelling is needed.
data2['alcohol'] = pd.cut(data1.alcohol, [0, 5, 10, 15, 20, 25], right=False,
                          labels=list(alcohol_map.values()))
In [39]:
# Fit an OLS model of life expectancy on the categorical alcohol band; the
# printed summary is used to read off the F-statistic and its p-value.
model1 = smf.ols('life ~ C(alcohol)', data=data2)
results1 = model1.fit()
anova_summary = results1.summary()
print(anova_summary)
In [40]:
# Mean life expectancy within each alcohol-consumption band. Only the numeric
# 'life' column is averaged: a frame-wide .mean() also visits the text and
# categorical columns, which pandas >= 2.0 rejects with a TypeError and which
# otherwise yields a Series per band instead of a single number.
# Iterating over alcohol_map keeps the values aligned with the table headers.
means = [data2.loc[data2.alcohol == band, 'life'].mean()
         for band in alcohol_map.values()]
print(tabulate([means], tablefmt="fancy_grid",
               headers=list(alcohol_map.values())))
In [41]:
# Standard deviation of life expectancy within each alcohol-consumption band.
# Only the numeric 'life' column is used: a frame-wide .std() also visits the
# text and categorical columns, which pandas >= 2.0 rejects with a TypeError
# and which otherwise yields a Series per band instead of a single number.
# Iterating over alcohol_map keeps the values aligned with the table headers.
stds = [data2.loc[data2.alcohol == band, 'life'].std()
        for band in alcohol_map.values()]
print(tabulate([stds], tablefmt="fancy_grid",
               headers=list(alcohol_map.values())))
In [42]:
# Post hoc Tukey HSD comparison of life expectancy between every pair of
# alcohol-consumption bands.
mc1 = multi.MultiComparison(data=data2['life'], groups=data2['alcohol'])
res1 = mc1.tukeyhsd()
tukey_table = res1.summary()
print(tukey_table)