In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 21 14:35:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats 
import statistics as st
import statsmodels.formula.api as smf

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%f'%x)

# Load from CSV
data1 = pd.read_csv('~/dev/coursera/gapminder.csv', skip_blank_lines=True,
                    usecols=['country','alcconsumption','lifeexpectancy'])
 
# Rename columns for clarity                                    
data1.columns = ['country','alcohol','life']

# Variables Descriptions
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# converting to numeric values and parsing (numeric invalids=NaN)
for dt in ('income','life') :
   data1[dt] = pd.to_numeric(data1[dt], 'errors=coerce') 

# Remove rows with nan values
data1 = data1.dropna(axis=0, how='any')

# Copy dataframe for univariate categorical variables
data2 = data1.copy()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1944             try:
-> 1945                 return self._engine.get_loc(key)
   1946             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'income'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-1-863968792dad> in <module>()
     31 # converting to numeric values and parsing (numeric invalids=NaN)
     32 for dt in ('income','life') :
---> 33    data1[dt] = pd.to_numeric(data1[dt], 'errors=coerce')
     34 
     35 # Remove rows with nan values

/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/sidon/opt/anaconda3/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: 'income'

In [ ]:
print (data1.alcohol.min(), data1.alcohol.max())
print ('\n', data1.life.min(), data1.life.max())

In [ ]:
sn.jointplot("alcohol", "life", data=data1, kind="reg",
                  xlim=(-2, 25), ylim=(45, 85), color="r", size=7)

In [ ]:
mean = data1.alcohol.mean()
data1['alcohol_center'] = data1.alcohol-mean
measures = [data1.alcohol.mean(), data1.alcohol_center.mean(), 
            data1.alcohol_center.min(), data1.alcohol_center.max(),
            data1.life.min(), data1.life.max()]
headers = ['Mean','Center','cMin', 'cMax', 'Min life', 'Max life']

print (tabulate([measures], tablefmt='grid', headers=headers))

In [ ]:
sn.jointplot("alcohol_center", "life", data=data1, kind="reg",
                  xlim=(-10,17), ylim=(45, 85), color="b", size=7)

In [ ]:
print ("OLS regression model for the association between life expectancy and income level")
reg1 = smf.ols('life ~ alcohol', data=data1).fit()
print (reg1.summary())

In [ ]: