In [53]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 21 14:35:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
import scipy.stats
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# Load only the three columns used in this analysis from the CSV
data1 = pd.read_csv('gapminder.csv', skip_blank_lines=True,
                    usecols=['country', 'alcconsumption', 'lifeexpectancy'])

# Rename columns for clarity.
# Renaming by name (not by position) because read_csv returns the selected
# columns in file order, not in the order listed in `usecols`; a positional
# assignment could silently swap the 'alcohol' and 'life' labels.
data1 = data1.rename(columns={'alcconsumption': 'alcohol',
                              'lifeexpectancy': 'life'})

# Variables Descriptions
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"

# Convert to numeric values; unparseable entries become NaN.
# `errors` must be passed as a keyword with the value 'coerce' -- the
# literal string 'errors=coerce' is not an accepted value for that argument.
for dt in ('alcohol', 'life'):
    data1[dt] = pd.to_numeric(data1[dt], errors='coerce')

# Remove rows with nan values (any column missing drops the row)
data1 = data1.dropna(axis=0, how='any')

# Copy dataframe for univariate categorical variables
data2 = data1.copy()
The third assignment deals with correlation coefficient. A correlation coefficient assesses the degree of linear relationship between two variables. It ranges from +1 to -1. A correlation of +1 means that there is a perfect, positive, linear relationship between the two variables. A correlation of -1 means there is a perfect, negative linear relationship between the two variables. In both cases, knowing the value of one variable, we can perfectly predict the value of the second.
Details of my project can be seen here; to make it easier, I made a summary below:
Variable Name | Description |
---|---|
Life | Explanatory Variable: Life Expectancy (1) |
Alcohol | Response Variable: Alcohol Consumption (2) |
(1) 2011 life expectancy at birth (years) (2) 2008 alcohol consumption per adult (liters, age 15+)
In [65]:
# Pearson correlation between life expectancy and alcohol consumption
corr_coef, p_value = scipy.stats.pearsonr(data1['life'], data1['alcohol'])

# Row layout follows the table headers: coefficient, p-value, squared coefficient
r1 = [corr_coef, p_value, corr_coef * corr_coef]
table_headers = ['Correlation coefficient', 'P-value', 'r²']
print(tabulate([r1], headers=table_headers, tablefmt="fancy_grid"))
The correlation is approximately 0.31 with a very small p-value; this indicates that the relationship is statistically significant.
In [55]:
# Quantitative -> quantitative scatterplot with a fitted regression line:
# alcohol consumption on the x axis, life expectancy on the y axis.
scat1 = seaborn.regplot(data=data1, x="alcohol", y="life", fit_reg=True)

# Label the figure through the Axes object that regplot returned
scat1.set_xlabel('Alcohol Consumption')
scat1.set_ylabel('Life Expectancy')
scat1.set_title('Scatterplot for the Association Between Life Expectancy and Alcohol Consumption')
plt.show()