In [65]:
# -*- coding: utf-8 -*-
"""
Created on Sun July 17 10:55:29 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)
# Load from CSV
data1 = pd.read_csv('gapminder.csv', skip_blank_lines=True,
usecols=['country','alcconsumption', 'lifeexpectancy'])
# Rename columns for clarity
data1.columns = ['country','alcohol','life']
# Variables Descriptions
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"
# converting to numeric values and parsing (numeric invalids=NaN)
for dt in ('alcohol','life') :
data1[dt] = pd.to_numeric(data1[dt], 'errors=coerce')
# Remove rows with nan values
data1 = data1.dropna(axis=0, how='any')
# Copy dataframe for univariate categorical variables
data2 = data1.copy()
In [66]:
#Univariate histogram for alcohol consumption:
seaborn.distplot(data1["alcohol"].dropna(), kde=False);
plt.xlabel('alcohol consumption (liters)')
plt.title(ALCOHOL)
plt.show()
In [67]:
desc1 = data1["alcohol"].describe()
print (tabulate( [desc1], tablefmt="fancy_grid", headers=[i for i in desc1.index]))
In [68]:
# Univariate histogram for Life Expectancy:
seaborn.distplot(data1["life"].dropna(), kde=False);
plt.xlabel('life expectancy')
plt.title(LIFE)
plt.show()
In [69]:
desc1 = data1["life"].describe()
print (tabulate( [desc1], tablefmt="fancy_grid", headers=[i for i in desc1.index]))
In [70]:
# basic scatterplot Q->Q
scat1 = seaborn.regplot(x="alcohol", y="life", fit_reg=True, data=data1)
plt.xlabel('Alcohol Consumption')
plt.ylabel('Life Expectancy')
plt.title('Scatterplot for the Association Between Alcohol Consumption and Life Expectancy')
plt.show()
In [71]:
# univariate bar graph for categorical variable life
life_map = {1: '>=40 <50', 2: '>=50 <60', 3: '>=60 <70', 4: '>=70 <80', 5: '>=80 <90'}
data2['life'] = pd.cut(data1.life,[40,50,60,70,80,90], labels=['1','2','3','4','5'])
data2["life"] = data2["life"].astype('category')
data2["life"] = data2["life"].cat.rename_categories([i for i in life_map.values()])
seaborn.countplot(x="life", data=data2)
plt.xlabel('Life Expectancy')
plt.title(LIFE)
plt.show()
In [72]:
# univariate bar graph for categorical variable alcohol
alcohol_map = {1: '>=0 <5', 2: '>=5 <10', 3: '>=10 <15', 4: '>=15 <20', 5: '>=20 <25'}
data2['alcohol'] = pd.cut(data1.alcohol,[0,5,10,15,20,25], labels=['1','2','3','4','5'])
data2["alcohol"] = data2["alcohol"].astype('category')
data2["alcohol"] = data2["alcohol"].cat.rename_categories([i for i in alcohol_map.values()])
seaborn.countplot(x="alcohol", data=data2)
plt.xlabel('Alcohol Consumption')
plt.title(ALCOHOL)
plt.show()
In [73]:
# Just for test
gt0lt5 = data2[(data2['alcohol']==">=0 <5")].count()
print (tabulate([gt0lt5], tablefmt="fancy_grid", headers=([i for i in gt0lt5.index])))
In [74]:
# bivariate bar graph C->C
data2['life'] = data1['life']
seaborn.factorplot(x='alcohol', y='life', data=data2, kind="bar", ci=None)
plt.xlabel('Alcohol Consumption')
plt.ylabel('Life Expectancy')
plt.title(ALCOHOL)
plt.show()
In [ ]: