Concepts and data from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013), with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani. Available at www.StatLearning.com.
For Tables reference see http://data8.org/datascience/tables.html
In [63]:
# HIDDEN
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from sklearn import linear_model
plots.style.use('fivethirtyeight')
plots.rc('lines', linewidth=1, color='r')
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
# datascience version number of last run of this notebook
version.__version__
import sys
sys.path.append("..")
from ml_table import ML_Table
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
Out[63]:
In logistic regression, we use the logistic function
$p(X) = \frac{e^{\beta_0 + \beta_1 X}}{1 + e^{\beta_0 + \beta_1 X}}$
and we seek estimates for $\beta_0$ and $\beta_1$ such that the predicted probability $\hat{p}(x_i)$ for each input corresponds as closely as possible to the observed class for that input.
The estimates $\hat{\beta}_0$ and $\hat{\beta}_1$ are chosen to maximize the likelihood function.
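Concretely, the likelihood of the observed labels is
$\ell(\beta_0, \beta_1) = \prod_{i:\, y_i = 1} p(x_i) \prod_{i':\, y_{i'} = 0} \left(1 - p(x_{i'})\right)$
and $\hat{\beta}_0$ and $\hat{\beta}_1$ are the values that maximize it.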
In [64]:
# Simulation of a process that produces a 1d training set
n1 = 100
eps1 = 0.2
test1 = ML_Table.runiform('ix', n1)
# Categories
test1['Cat'] = test1.apply(lambda x: 'A' if x < 0.2 else 'B', 'ix')
# Noise in the relationship of input to category
test1['x'] = test1['ix'] + eps1*np.random.normal(size=n1)
test1['Class'] = test1.apply(lambda x: 0 if x == 'A' else 1, 'Cat')
test1 = test1.drop('ix')
test1.scatter('x', 'Class')
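The true category is determined by the latent input ix, but the classifier only sees the noisy x, so the two classes overlap along x.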
We could look at this in terms of the distribution of each category over the input, but that does not capture the likelihood of each category given the input.
In [65]:
test1.pivot_hist('Cat', 'x')
Here is the density of each category over x.
In [66]:
cc = test1.density('Cat', 'x').scatter('x')
Logistic regression to predict 'Class', given input 'x'
In [67]:
logit1d = test1.logit_regression('Class', 'x')
In [68]:
# Visualize the accuracy of the classifier on the training set (training error)
test1.plot_fit_1d('Class', 'x', logit1d.model)
Out[68]:
In [69]:
test1.classification_error_model('Class', logit1d.model, 'x')
Out[69]:
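For reference, this training error is presumably just the fraction of misclassified rows; a minimal sketch computing it by hand, assuming logit1d.model maps a single input to a 0/1 prediction:
In [ ]:
# Hand-rolled training error: fraction of rows where the model's
# 0/1 prediction disagrees with the observed class.
preds = test1.apply(logit1d.model, 'x')
np.mean(preds != test1['Class'])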
In [70]:
test1.plot_fit_1d('Class', 'x', logit1d.likelihood)
Out[70]:
In [71]:
logit1d.params
Out[71]:
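Since $p(x) = 0.5$ exactly when the log-odds $\beta_0 + \beta_1 x = 0$, the classifier's cutting point is $x = -\beta_0/\beta_1$.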
In [72]:
# Compute the cutting "plane", i.e., the point for the classifier
# p(x) = 0.5 where x = -b0/b1
p50 = -logit1d.params[0]/logit1d.params[1][0]
p50
Out[72]:
In [73]:
logit1d.likelihood(-1), logit1d.likelihood(p50), logit1d.likelihood(1)
Out[73]:
In [12]:
n = 200
eps = 0.1
test2 = ML_Table.runiform('ix', n)
test2['iy'] = np.random.rand(n)
test2['Cat'] = test2.apply(lambda x, y: 'A' if x + y < 0 else 'B', ['ix', 'iy'])
test2['Class A'] = test2.apply(lambda x: 1 if x == 'A' else 0, 'Cat')
test2['x'] = test2['ix'] + eps*np.random.normal(size=n)
test2['y'] = test2['iy'] + eps*np.random.normal(size=n)
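By construction, the cut between the categories is the line ix + iy = 0 in the latent inputs, so the ideal decision boundary in the noisy $(x, y)$ plane is approximately linear.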
In [13]:
test2.pivot_scatter('Cat', 'x', 'y')
In [14]:
logit2d = test2.logit_regression('Class A', ['x', 'y'])
model_2d = logit2d.model
In [15]:
test2.plot_fit_2d('Class A', 'x', 'y', model_2d)
Out[15]:
In [74]:
test2.plot_cut_2d('Class A', 'x', 'y', model_2d, n_grid=50)
Out[74]:
In [17]:
# error rate
test2.classification_error_model('Class A', model_2d, ['x', 'y'])
Out[17]:
In [18]:
test2.plot_cut_2d('Class A', 'x', 'y', logit2d.likelihood, n_grid=50)
Out[18]:
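The likelihood contours are parallel straight lines: the log-odds $\beta_0 + \beta_1 x + \beta_2 y$ is linear in the inputs, so every level set of $\hat{p}$ is a line.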
In [19]:
knn_reg = test2.knn_regression('Class A', ['x', 'y'], n_neighbors=3)
test2.plot_cut_2d('Class A', 'x', 'y', knn_reg.model, n_grid=50, levels=[0,1])
test2.classification_error_model('Class A', knn_reg.model, ['x', 'y'])
Out[19]:
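For comparison, the same 3-nearest-neighbor fit done directly with sklearn (a sketch; ML_Table's knn_regression presumably wraps the analogous estimator):
In [ ]:
# Direct sklearn 3-NN classifier on the same features.
from sklearn.neighbors import KNeighborsClassifier
X2 = np.column_stack((test2['x'], test2['y']))
knn = KNeighborsClassifier(n_neighbors=3).fit(X2, test2['Class A'])
1 - knn.score(X2, test2['Class A'])  # training error rate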
In [ ]:
In [20]:
raw_default = ML_Table.read_table("data/Default.csv")
raw_default
Out[20]:
In [21]:
default = raw_default.drop('Unnamed: 0')
default['Default'] = np.where(default['default']=='Yes', 1, 0)
default['Student'] = np.where(default['student']=='Yes', 1, 0)
default
Out[21]:
In [22]:
# Look at the trend in the default rate with balance
default.density('default', 'balance').scatter('balance')
In [23]:
# Look at the trend in the default rate with income
default.density('default', 'income').scatter('income')
In [24]:
# Predict default based on balance
default_balance = default.logit_regression('Default', 'balance')
In [25]:
default_balance.summary()
Out[25]:
In [26]:
# Default when balance gets too high
default.plot_fit_1d('Default', 'balance', default_balance.model, connect=False)
Out[26]:
In [27]:
default.classification_error_model('Default', default_balance.model, 'balance')
Out[27]:
In [28]:
# How impressive is this error rate?
default.where('Default').num_rows/default.num_rows
Out[28]:
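A classifier that always predicts 'no default' would achieve an error rate equal to this overall default rate, so that is the baseline any useful classifier must beat.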
In [29]:
default.plot_fit_1d('Default', 'balance', default_balance.likelihood, connect=False)
Out[29]:
In [30]:
default_balance.likelihood(1000), default_balance.likelihood(2000)
Out[30]:
In [31]:
default_balance.obj.decision_function([[1000], [2000]])
Out[31]:
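The decision_function values are the fitted log-odds $\hat{\beta}_0 + \hat{\beta}_1 x$; pushing them through the logistic function should recover the likelihood values above.
In [ ]:
# Logistic of the log-odds recovers the predicted probabilities.
scores = default_balance.obj.decision_function([[1000], [2000]])
1 / (1 + np.exp(-scores))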
In [32]:
default.pivot_scatter('Default', 'income', 'balance')
In [33]:
# Predict default based on balance and income
default_BI = default.logit_regression('Default', ['balance', 'income'])
In [34]:
default_BI.summary()
Out[34]:
In [35]:
default_BI.obj.decision_function([[10000, 1000]])
Out[35]:
In [36]:
# This classifier does not seem to work at all on this data
default.plot_cut_2d('Default', 'balance','income', default_BI.model, levels=[0,1])
Out[36]:
In [37]:
# The error rate looks low only because the classifier has basically discarded all the defaults
default.classification_error_model('Default', default_BI.model, ['balance', 'income'])
Out[37]:
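With so few defaults in the data, minimizing overall error favors the majority class: predicting 'No' almost everywhere keeps the error rate near the base default rate while missing nearly all of the actual defaults.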
In [38]:
default.logit_regression('Default', ['balance', 'income', 'Student']).summary()
Out[38]:
In [39]:
default_knn_BI = default.knn_regression('Default', ['balance', 'income'], n_neighbors=5)
default.plot_cut_2d('Default', 'balance', 'income', default_knn_BI.model, levels=[0,1])
default.classification_error_model('Default', default_knn_BI.model, ['balance', 'income'])
Out[39]:
In [40]:
default.where('default', 'Yes').pivot_scatter('default', 'balance', 'income')
The book claims to be able to do some classification with balance and income, but it is very hard to find a cut that pulls out the defaults without just as many false positives.
In [41]:
default.where('default', 'No').pivot_scatter('default', 'balance', 'income')
In [42]:
default_sample = default.sample(1000)
In [43]:
default_sample.where('default', 'Yes').num_rows, default_sample.num_rows
Out[43]:
In [44]:
ds_lr = default_sample.logit_regression('Default', ['income', 'balance'])
default_sample.plot_cut_2d('Default', 'income', 'balance', ds_lr.likelihood, levels=[0,1])
Out[44]:
In [45]:
ds_lr.summary()
Out[45]:
In [46]:
default_student = default.logit_regression('Default', 'Student')
In [47]:
default_student.summary()
Out[47]:
In [48]:
default_student.likelihood(0), default_student.likelihood(1)
Out[48]:
In [49]:
default_BIS = default.logit_regression('Default', ['balance', 'income', 'Student'])
default_BIS.summary()
Out[49]:
In [50]:
default.summary()
Out[50]:
In [51]:
default_rates_stu = default.where('student', 'Yes').density('default', 'balance', bins=np.arange(0, 3000, 100)).drop('No')
default_rates_stu.relabel('Yes', 'Student')
default_rates_no = default.where('student', 'No').density('default', 'balance', bins=np.arange(0, 3000, 100)).drop('No')
default_rates_no.relabel('Yes', 'Non-Student')
default_rates = default_rates_stu.join('balance', default_rates_no)
default_rates.plot('balance')
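This is the confounding the book points out: marginally, students default more often (hence the positive coefficient when Student is the only predictor), but at any given balance a student is less likely to default than a non-student; students simply tend to carry higher balances.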
In [52]:
raw_credit = ML_Table.read_table("data/Credit.csv")
credit = raw_credit.drop('Unnamed: 0')
# Strip stray whitespace from the Gender labels
credit['Gender'] = credit.apply(lambda x: x.strip(), 'Gender')
credit
Out[52]:
In [53]:
credit.where('Gender', 'Female').num_rows
Out[53]:
In [54]:
credit.where('Gender', 'Male').num_rows
Out[54]:
In [55]:
credit['Female'] = credit.apply(lambda x: 1 if x=='Female' else 0, 'Gender')
In [56]:
credit.Cor()
Out[56]:
In [57]:
credit.pivot_scatter('Student', 'Balance', select=['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education'])
In [58]:
credit['Student Class'] = credit.apply(lambda x: 1 if x=='Yes' else 0, 'Student')
credit
Out[58]:
In [59]:
cr = credit.logit_regression('Student Class', ['Balance', 'Limit'])
credit.plot_cut_2d('Student', 'Balance', 'Limit', cr.model, levels=[0,1])
print('error', credit.classification_error_model('Student Class', cr.model, ['Balance', 'Limit']))
print('density', credit.where('Student Class').num_rows/credit.num_rows)
In [60]:
credit.plot_cut_2d('Student', 'Balance', 'Limit', cr.likelihood)
Out[60]:
In [61]:
lda = credit.LDA('Student Class', ['Balance', 'Limit'])
credit.plot_cut_2d('Student', 'Balance', 'Limit', lda.model, levels=[0,1])
print('error', credit.classification_error_model('Student Class', lda.model, ['Balance', 'Limit']))
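For comparison, the same fit done directly with sklearn's LinearDiscriminantAnalysis (a sketch; ML_Table.LDA presumably wraps something similar):
In [ ]:
# Direct sklearn LDA on the same features.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
Xc = np.column_stack((credit['Balance'], credit['Limit']))
sk_lda = LinearDiscriminantAnalysis().fit(Xc, credit['Student Class'])
1 - sk_lda.score(Xc, credit['Student Class'])  # training error rate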
In [62]:
default_lda_BI = default.LDA('Default', ['balance', 'income'])
default.plot_cut_2d('Default', 'balance', 'income', default_lda_BI.model, levels=[0,1])
default.classification_error_model('Default', default_lda_BI.model, ['balance', 'income'])
Out[62]:
In [ ]: