Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013), with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani, available at www.StatLearning.com.
For Tables reference see http://data8.org/datascience/tables.html
In [1]:
    
# HIDDEN
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
# (printed explicitly, since a bare expression mid-cell is not displayed)
print(version.__version__)
import sys
sys.path.append("..")
from ml_table import ML_Table
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    
    Out[1]:
In [2]:
    
raw_credit = ML_Table.read_table("data/Credit.csv")
credit = raw_credit.drop('Unnamed: 0')  # drop the unnamed row-index column from the CSV
credit
    
    Out[2]:
In [3]:
    
# Encode the Gender factor as a 0/1 dummy variable (1 = Female, 0 = Male).
credit['Female'] = credit.apply(lambda x: 1 if x == 'Female' else 0, 'Gender')
credit
    
    Out[3]:
In [4]:
    
credit.regression_1d_params('Balance', 'Female')
    
    Out[4]:
In [5]:
    
credit.pivot_hist('Gender', 'Balance')
    
    
In [6]:
    
credit.plot_fit_1d('Balance', 'Female', credit.linear_regression('Balance', 'Female').model)
    
    Out[6]:
    
In [7]:
    
credit.lm_summary_1d('Balance', 'Female')
    
    Out[7]:
In [8]:
    
# An alternative +1/-1 coding of Gender. The fit is unchanged, but the
# interpretation shifts: the intercept is the average of the two group means,
# and the coefficient is each group's deviation from it.
credit['GenderDif'] = credit.apply(lambda x: 1 if x == 'Female' else -1, 'Gender')
credit
    
    Out[8]:
In [9]:
    
credit.lm_summary_1d('Balance', 'GenderDif')
    
    Out[9]:
In [10]:
    
credit.pivot_hist('Ethnicity', 'Balance')
    
    
In [11]:
    
credit.pivot_hist('Ethnicity', 'Balance', normed=True)
    
    
In [12]:
    
# Two dummy variables encode the three-level Ethnicity factor;
# African American is the baseline (both dummies 0).
credit['Ethnicity[Asian]'] = credit.apply(lambda x: 1 if x == 'Asian' else 0, 'Ethnicity')
credit['Ethnicity[Caucasian]'] = credit.apply(lambda x: 1 if x == 'Caucasian' else 0, 'Ethnicity')
eth_credit_balance = credit.select(['Balance', 'Ethnicity[Asian]', 'Ethnicity[Caucasian]'])
eth_credit_balance.linear_regression('Balance').summary()
    
    Out[12]:
In [13]:
    
eth_model = eth_credit_balance.linear_regression('Balance').model
eth_credit_balance.lm_fit('Balance', eth_model)
    
    Out[13]:
In [14]:
    
advertising = ML_Table.read_table("./data/Advertising.csv")
advertising = advertising.drop(0)  # drop the unnamed row-index column
# Interaction term between the TV and Radio advertising budgets.
advertising['TVxRadio'] = advertising['TV']*advertising['Radio']
advertising
    
    Out[14]:
In [15]:
    
advertising.linear_regression('Sales').summary()
    
    Out[15]:
In [16]:
    
advertising.lm_fit('Sales', advertising.linear_regression('Sales').model)
    
    Out[16]:
The R² for this model is 96.8%, compared to only 89.7% for the model that predicts sales using TV and radio without an interaction term. This means that (96.8 − 89.7)/(100 − 89.7) = 69% of the variability in sales that remains after fitting the additive model has been explained by the interaction term.
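That arithmetic can be checked directly; a minimal sketch, with the two R² values read off the regression summaries above:

r2_additive = 0.897     # R^2 of the TV + Radio model
r2_interaction = 0.968  # R^2 once TVxRadio is added
(r2_interaction - r2_additive) / (1 - r2_additive)  # ≈ 0.69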
The hierarchical principle states that if we include an interaction in a model, we should also include the main effects, even if the p-values associated with their coefficients are not significant.
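Applied to the advertising fit above, this means keeping TV and Radio in the model alongside TVxRadio. A minimal sketch of that model using sklearn (imported at the top) rather than ML_Table:

# Interaction model that retains both main effects, per the hierarchical principle.
X = np.column_stack([advertising['TV'],
                     advertising['Radio'],
                     advertising['TV'] * advertising['Radio']])
lm = linear_model.LinearRegression().fit(X, advertising['Sales'])
lm.intercept_, lm.coef_  # intercept, then coefficients for TV, Radio, TVxRadio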
In [17]:
    
# Encode the Student factor as a 0/1 dummy variable (1 = Yes).
credit['StudentFac'] = credit.apply(lambda x: 1 if x == 'Yes' else 0, 'Student')
credit
    
    Out[17]:
In [18]:
    
credit.scatter('Income', select='Balance')
    
    
In [19]:
    
credit.plot_fit_1d('Balance', 'Income', credit.regression_1d('Balance', 'Income'), height=6)
    
    Out[19]:
    
In [20]:
    
credit.lm_summary_1d('Balance', 'Income')
    
    Out[20]:
In [21]:
    
student_credit = credit.where('Student', 'Yes')
student_credit.plot_fit_1d('Balance', 'Income', student_credit.regression_1d('Balance', 'Income'), height=6)
student_credit.lm_summary_1d('Balance', 'Income')
    
    Out[21]:
    
In [22]:
    
nostudent_credit = credit.where('Student', 'No')
nostudent_credit.plot_fit_1d('Balance', 'Income', nostudent_credit.regression_1d('Balance', 'Income'), height=6)
nostudent_credit.lm_summary_1d('Balance', 'Income')
    
    Out[22]:
    
In [23]:
    
credit_stu_fac = credit.select(['Balance', 'Income', 'StudentFac'])
credit_stu_fac.linear_regression('Balance').summary()
    
    Out[23]:
In [24]:
    
# Interaction between student status and income lets the two groups
# have different slopes as well as different intercepts.
credit_stu_fac['StudentIncome'] = credit_stu_fac['StudentFac']*credit_stu_fac['Income']
credit_stu_fac.linear_regression('Balance').summary()
    
    Out[24]:
"We note that the slope for students is lower than the slope for non-students. This suggests that increases in income are associated with smaller increases in credit card balance among students as compared to non-students."
But that conclusion comes from looking only at the point estimates of the coefficients; there isn't real statistical evidence for it in this data set.
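One way to see the two implied slopes directly is to fit each group separately; a minimal sketch using np.polyfit in place of ML_Table's regression methods:

# Slope of Balance on Income within each group, fit separately.
for group in ['Yes', 'No']:
    sub = credit.where('Student', group)
    slope, intercept = np.polyfit(sub['Income'], sub['Balance'], 1)
    print('Student =', group, 'slope =', slope)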
In [25]:
    
raw_auto = ML_Table.read_table("data/Auto.csv")
# Remove rows with missing horsepower (recorded as '?') and convert to int.
auto = raw_auto.where(raw_auto['horsepower'] != '?')
auto['horsepower'] = auto.apply(int, 'horsepower')
    
In [26]:
    
auto_model_1 = auto.regression_1d('mpg', 'horsepower')
auto.plot_fit_1d('mpg', 'horsepower', auto_model_1)
    
    Out[26]:
    
In [27]:
    
# Quadratic term for a degree-2 polynomial fit of mpg on horsepower.
auto['HP^2'] = auto['horsepower']**2
    
In [28]:
    
auto
    
    Out[28]:
In [29]:
    
auto_mpg = auto.select(['mpg', 'horsepower', 'HP^2'])
auto_mpg.linear_regression('mpg').summary()
    
    Out[29]:
In [30]:
    
auto_model_2 = auto_mpg.linear_regression('mpg').model
auto_mpg.plot_fit('mpg', auto_model_2, width=8, height=6)
    
    Out[30]:
    
In [31]:
    
# Fitted values and residuals for the linear (1) and quadratic (2) models.
auto_mpg['Fit1'] = auto_mpg.apply(auto_model_1, 'horsepower')
auto_mpg['Residual 1'] = auto_mpg['mpg'] - auto_mpg['Fit1']
auto_mpg['Fit2'] = auto_mpg.apply(auto_model_2, ['horsepower', 'HP^2'])
auto_mpg['Residual 2'] = auto_mpg['mpg'] - auto_mpg['Fit2']
auto_mpg
    
    Out[31]:
In [32]:
    
auto_mpg.scatter('horsepower', ['Residual 1', 'Residual 2'], overlay=False)
    
    
In [33]:
    
auto_mpg.plot_fit_1d('Residual 1', 'Fit1', auto_mpg.poly('Residual 1', 'Fit1', 2), connect=False)
    
    Out[33]:
    
In [34]:
    
auto_mpg.plot_fit_1d('Residual 2', 'Fit2', auto_mpg.poly('Residual 2', 'Fit2', 2), connect=False)
    
    Out[34]:
    
In [35]:
    
# Leverage of each observation in the one-variable regression on Age.
credit['Age Leverage'] = credit.leverage_1d('Age')
credit.scatter('Age', 'Age Leverage')
    
    
In [36]:
    
credit.scatter('Limit', 'Age')
    
    
In [37]:
    
credit.scatter('Limit', 'Rating')
    
    
In [38]:
    
credit_al = credit.select(['Balance', 'Limit', 'Age'])
credit_al_model = credit_al.linear_regression('Balance').model
credit_al.linear_regression('Balance').summary()
    
    Out[38]:
In [39]:
    
credit_al.RSS_model('Balance', credit_al_model)
    
    Out[39]:
In [40]:
    
credit_al.RSS_contour2('Balance', x_sensitivity=0.15, y_sensitivity=1, scale=1/1000000, levels=[21.25, 21.5, 21.8])
    
    
    Out[40]:
    
In [41]:
    
credit_rl = credit.select(['Balance', 'Limit', 'Rating'])
credit_rl_model = credit_rl.linear_regression('Balance').model
credit_rl.linear_regression('Balance').summary()
    
    Out[41]:
In [42]:
    
credit_rl.lm_fit('Balance', credit_rl_model)
    
    Out[42]:
In [43]:
    
credit_rl.RSS_model('Balance', credit_rl_model)
    
    Out[43]:
In [44]:
    
credit_rl.RSS_contour2('Balance', x_sensitivity=10, y_sensitivity=1.5, scale=1/1000000, levels=[22, 25, 30, 50])
    
    
    Out[44]:
    
In [45]:
    
# Correlation matrix; note the near-perfect correlation between Limit and
# Rating (collinearity), visible in the elongated RSS contour above.
credit.Cor()
    
    Out[45]:
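The near-perfect Limit-Rating correlation noted in the comment can also be checked directly with numpy:

np.corrcoef(credit['Limit'], credit['Rating'])[0, 1]  # very close to 1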