Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani, available at www.StatLearning.com.
For Tables reference see http://data8.org/datascience/tables.html
In [1]:
# HIDDEN
# For Tables reference see http://data8.org/datascience/tables.html
# This useful nonsense should just go at the top of your notebook.
# Notebook setup: datascience tables, inline plotting, sklearn, widgets,
# and the local ML_Table helper class used throughout.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
# (`version` comes from the `datascience` wildcard import above)
version.__version__
import sys
# Make the parent directory importable so the local ml_table module is found.
sys.path.append("..")
from ml_table import ML_Table
import locale
# NOTE(review): assumes the en_US.UTF-8 locale is installed on this machine;
# setlocale raises locale.Error otherwise.
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )
Out[1]:
In [2]:
# Load the ISLR Credit data set and drop the CSV's unnamed leading index column.
raw_credit = ML_Table.read_table("data/Credit.csv")
credit = raw_credit.drop('Unnamed: 0')
credit
Out[2]:
In [3]:
# Encode Gender as a 0/1 dummy variable: 1 for 'Female', 0 otherwise.
def _is_female(gender):
    """Indicator for the 'Female' level of the Gender column."""
    return int(gender == 'Female')

credit['Female'] = credit.apply(_is_female, 'Gender')
credit
Out[3]:
In [4]:
# Parameters of the simple regression of Balance on the Female indicator.
credit.regression_1d_params('Balance', 'Female')
Out[4]:
In [5]:
# Histograms of Balance, one per Gender group.
credit.pivot_hist('Gender', 'Balance')
In [6]:
# Fit Balance ~ Female and overlay the fitted line on the data.
credit.plot_fit_1d('Balance', 'Female', credit.linear_regression('Balance', 'Female').model)
Out[6]:
In [7]:
# Summary statistics (coefficients etc. -- see ml_table) for Balance ~ Female.
credit.lm_summary_1d('Balance', 'Female')
Out[7]:
In [8]:
# Alternative sum-to-zero contrast coding for Gender: +1 Female, -1 otherwise.
def _gender_contrast(gender):
    """+1 for the 'Female' level, -1 for any other value."""
    return 1 if gender == 'Female' else -1

credit['GenderDif'] = credit.apply(_gender_contrast, 'Gender')
credit
Out[8]:
In [9]:
# Regression summary of Balance on the +/-1 coded gender variable.
credit.lm_summary_1d('Balance', 'GenderDif')
Out[9]:
In [10]:
# Histograms of Balance split by Ethnicity group.
credit.pivot_hist('Ethnicity', 'Balance')
In [11]:
# Same histograms, normalized (normed=True) so group sizes don't dominate.
credit.pivot_hist('Ethnicity', 'Balance', normed=True)
In [12]:
# One-hot encode Ethnicity; the omitted level serves as the baseline.
for level in ('Asian', 'Caucasian'):
    credit['Ethnicity[{}]'.format(level)] = credit.apply(
        lambda x, level=level: int(x == level), 'Ethnicity')
eth_credit_balance = credit.select(['Balance', 'Ethnicity[Asian]', 'Ethnicity[Caucasian]'])
eth_credit_balance.linear_regression('Balance').summary()
Out[12]:
In [13]:
# Fit quality (lm_fit -- see ml_table) of the ethnicity-dummy model.
eth_model = eth_credit_balance.linear_regression('Balance').model
eth_credit_balance.lm_fit('Balance', eth_model)
Out[13]:
In [14]:
# Load the ISLR Advertising data set.
advertising = ML_Table.read_table("./data/Advertising.csv")
# drop(0) removes the first column -- presumably the unnamed CSV index;
# NOTE(review): the Credit load above drops it by name instead. Confirm same intent.
advertising = advertising.drop(0)
# Interaction term between the TV and Radio advertising budgets.
advertising['TVxRadio'] = advertising['TV']*advertising['Radio']
advertising
Out[14]:
In [15]:
# Multiple regression of Sales on the remaining columns, including TVxRadio.
advertising.linear_regression('Sales').summary()
Out[15]:
In [16]:
# Goodness-of-fit statistics for the full Sales model.
advertising.lm_fit('Sales', advertising.linear_regression('Sales').model)
Out[16]:
The R² for the model is 96.8%, compared to only 89.7% for the model that predicts sales using TV and radio without an interaction term. This means that (96.8 − 89.7)/(100 − 89.7) = 69% of the variability in sales that remains after fitting the additive model has been explained by the interaction term.
The hierarchical principle states that if we include an interaction in a model, we should also include the main effects, even if the p-values associated with their coefficients are not significant.
In [17]:
# Encode Student as a 0/1 dummy: 1 if 'Yes', 0 otherwise.
def _is_student(flag):
    """Indicator for the 'Yes' level of the Student column."""
    return int(flag == 'Yes')

credit['StudentFac'] = credit.apply(_is_student, 'Student')
credit
Out[17]:
In [18]:
# Scatter of Balance against Income.
credit.scatter('Income', select='Balance')
In [19]:
# Simple regression of Balance on Income, plotted over the data.
credit.plot_fit_1d('Balance', 'Income', credit.regression_1d('Balance', 'Income'), height=6)
Out[19]:
In [20]:
# Summary of Balance ~ Income over all customers.
credit.lm_summary_1d('Balance', 'Income')
Out[20]:
In [21]:
# Restrict to students and refit Balance ~ Income on that subset.
student_credit = credit.where('Student', 'Yes')
student_credit.plot_fit_1d('Balance', 'Income', student_credit.regression_1d('Balance', 'Income'), height=6)
student_credit.lm_summary_1d('Balance', 'Income')
Out[21]:
In [22]:
# Same fit restricted to non-students, for comparison with the student subset.
nostudent_credit = credit.where('Student', 'No')
nostudent_credit.plot_fit_1d('Balance', 'Income', nostudent_credit.regression_1d('Balance', 'Income'), height=6)
nostudent_credit.lm_summary_1d('Balance', 'Income')
Out[22]:
In [23]:
# Additive model: Balance ~ Income + StudentFac (no interaction term).
credit_stu_fac = credit.select(['Balance', 'Income', 'StudentFac'])
credit_stu_fac.linear_regression('Balance').summary()
Out[23]:
In [24]:
# Add the StudentFac x Income interaction term and refit the model.
student_flag = credit_stu_fac['StudentFac']
income = credit_stu_fac['Income']
credit_stu_fac['StudentIncome'] = student_flag * income
credit_stu_fac.linear_regression('Balance').summary()
Out[24]:
"We note that the slope for students is lower than the slope for non-students. This suggests that increases in income are associated with smaller increases in credit card balance among students as compared to non-students."
But, that conclusion comes from only looking at the coefficients. There really isn't statistical evidence of this in the data set.
In [25]:
# Load the Auto data set; horsepower contains '?' for missing values,
# so filter those rows out and convert the remaining strings to int.
raw_auto = ML_Table.read_table("data/Auto.csv")
auto = raw_auto.where(raw_auto['horsepower'] != '?')
auto['horsepower'] = auto.apply(int, 'horsepower')
In [26]:
# Simple linear model of mpg on horsepower, plotted with the data.
auto_model_1 = auto.regression_1d('mpg', 'horsepower')
auto.plot_fit_1d('mpg', 'horsepower', auto_model_1)
Out[26]:
In [27]:
# Quadratic term for a degree-2 polynomial fit of mpg on horsepower.
auto['HP^2'] = auto['horsepower']**2
In [28]:
# Show the table with the new HP^2 column.
auto
Out[28]:
In [29]:
# Degree-2 model: mpg ~ horsepower + horsepower^2.
auto_mpg = auto.select(['mpg', 'horsepower', 'HP^2'])
auto_mpg.linear_regression('mpg').summary()
Out[29]:
In [30]:
# Plot the quadratic fit over the data.
auto_model_2 = auto_mpg.linear_regression('mpg').model
auto_mpg.plot_fit('mpg', auto_model_2, width=8, height=6)
Out[30]:
In [31]:
# Compute fitted values and residuals for both models (linear and quadratic).
for fit_col, res_col, model, inputs in [
    ('Fit1', 'Residual 1', auto_model_1, 'horsepower'),
    ('Fit2', 'Residual 2', auto_model_2, ['horsepower', 'HP^2']),
]:
    auto_mpg[fit_col] = auto_mpg.apply(model, inputs)
    auto_mpg[res_col] = auto_mpg['mpg'] - auto_mpg[fit_col]
auto_mpg
Out[31]:
In [32]:
# Residuals of the linear and quadratic fits vs horsepower, as separate panels.
auto_mpg.scatter('horsepower', ['Residual 1', 'Residual 2'], overlay=False)
In [33]:
# Residual-vs-fitted plot for the linear model, with a degree-2 trend curve.
auto_mpg.plot_fit_1d('Residual 1', 'Fit1', auto_mpg.poly('Residual 1', 'Fit1', 2), connect=False)
Out[33]:
In [34]:
# Same diagnostic for the quadratic model's residuals.
auto_mpg.plot_fit_1d('Residual 2', 'Fit2', auto_mpg.poly('Residual 2', 'Fit2', 2), connect=False)
Out[34]:
In [35]:
# Leverage of each observation with respect to Age, plotted against Age.
credit['Age Leverage'] = credit.leverage_1d('Age')
credit.scatter('Age', 'Age Leverage')
In [36]:
# Scatter plot: Limit (x) vs Age.
credit.scatter('Limit', 'Age')
In [37]:
# Scatter plot: Limit (x) vs Rating -- presumably the collinearity example; confirm.
credit.scatter('Limit', 'Rating')
In [38]:
# Model Balance on the Limit/Age predictor pair.
credit_al = credit.select(['Balance', 'Limit', 'Age'])
credit_al_model = credit_al.linear_regression('Balance').model
credit_al.linear_regression('Balance').summary()
Out[38]:
In [39]:
# Residual sum of squares of the Limit+Age model.
credit_al.RSS_model('Balance', credit_al_model)
Out[39]:
In [40]:
# Contour plot of RSS over the coefficient plane, scaled to millions
# (scale=1/1000000); sensitivities control the range explored around the fit.
credit_al.RSS_contour2('Balance', x_sensitivity=0.15, y_sensitivity=1, scale=1/1000000, levels=[21.25, 21.5, 21.8])
Out[40]:
In [41]:
# Repeat the analysis with the Limit/Rating predictor pair.
credit_rl = credit.select(['Balance', 'Limit', 'Rating'])
credit_rl_model = credit_rl.linear_regression('Balance').model
credit_rl.linear_regression('Balance').summary()
Out[41]:
In [42]:
# Fit quality for the Limit+Rating model.
credit_rl.lm_fit('Balance', credit_rl_model)
Out[42]:
In [43]:
# Residual sum of squares of the Limit+Rating model.
credit_rl.RSS_model('Balance', credit_rl_model)
Out[43]:
In [44]:
# RSS contours over the (Limit, Rating) coefficient plane, scaled to millions.
credit_rl.RSS_contour2('Balance', x_sensitivity=10, y_sensitivity=1.5, scale=1/1000000, levels=[22, 25, 30, 50])
Out[44]:
In [45]:
# Pairwise correlation matrix of the credit table (Cor -- see ml_table).
credit.Cor()
Out[45]:
In [ ]: