From the GitHub repo https://github.com/JWarmenhoven/ISLR-python, which is based on the book An Introduction to Statistical Learning by James et al. There are no exercises, but it should serve as a great reference.
Running exercise: for the classification problems below, compare ROC and PR curves, and also compare the classifiers against SVMs.
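The exercise is not worked out in this notebook; as a starting point, here is a minimal, self-contained sketch (on a synthetic dataset generated with make_classification, purely an illustrative assumption) comparing ROC and precision-recall curves for a logistic regression and an SVM. The same pattern applies to the Default, Smarket, and Caravan problems below.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
# Synthetic, imbalanced binary problem purely for illustration
X, y = make_classification(n_samples=2000, n_features=5, weights=[0.9, 0.1], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=0)
models = {'Logistic regression': LogisticRegression().fit(X_tr, y_tr),
          'SVM': SVC(probability=True).fit(X_tr, y_tr)}
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))
for name, model in models.items():
    scores = model.predict_proba(X_te)[:, 1]
    fpr, tpr, _ = roc_curve(y_te, scores)
    prec, rec, _ = precision_recall_curve(y_te, scores)
    ax_roc.plot(fpr, tpr, label='{} (AUC = {:.2f})'.format(name, auc(fpr, tpr)))
    ax_pr.plot(rec, prec, label=name)
ax_roc.set_xlabel('False positive rate'); ax_roc.set_ylabel('True positive rate'); ax_roc.legend()
ax_pr.set_xlabel('Recall'); ax_pr.set_ylabel('Precision'); ax_pr.legend()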
In [12]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.set_option('display.notebook_repr_html', False)
%matplotlib inline
plt.style.use('seaborn-white')
In [6]:
# In R, I exported the dataset from package 'ISLR' to an Excel file
df = pd.read_excel('../data/Default.xlsx')
# Note: factorize() returns two objects: a label array and an array with the unique values.
# We are only interested in the first object.
df['default2'] = df.default.factorize()[0]
df['student2'] = df.student.factorize()[0]
df.head(3)
Out[6]:
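For reference, a tiny example of what factorize() returns (integer codes first, unique values second):
codes, uniques = pd.factorize(pd.Series(['No', 'No', 'Yes']))
print(codes)    # [0 0 1] -- the first value seen maps to 0
print(uniques)  # Index(['No', 'Yes'], dtype='object')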
In [7]:
fig = plt.figure(figsize=(12,5))
gs = mpl.gridspec.GridSpec(1, 4)
ax1 = plt.subplot(gs[0,:-2])
ax2 = plt.subplot(gs[0,-2])
ax3 = plt.subplot(gs[0,-1])
# Take a fraction of the samples where target value (default) is 'no'
df_no = df[df.default2 == 0].sample(frac=0.15)
# Take all samples where target value is 'yes'
df_yes = df[df.default2 == 1]
df_ = pd.concat([df_no, df_yes])  # DataFrame.append is deprecated; concat does the same here
ax1.scatter(df_[df_.default == 'Yes'].balance, df_[df_.default == 'Yes'].income, s=40, c='orange', marker='+',
linewidths=1)
ax1.scatter(df_[df_.default == 'No'].balance, df_[df_.default == 'No'].income, s=40, marker='o', linewidths=1,
            edgecolors='lightblue', facecolors='none')
ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')
c_palette = {'No':'lightblue', 'Yes':'orange'}
sns.boxplot(x='default', y='balance', data=df, orient='v', ax=ax2, palette=c_palette)
sns.boxplot(x='default', y='income', data=df, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(plt.gcf())
In [8]:
X_train = df.balance.values.reshape(-1,1)
y = df.default2
# Create array of test data. Calculate the classification probability
# and predicted classification.
X_test = np.arange(df.balance.min(), df.balance.max()).reshape(-1,1)
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train,y)
prob = clf.predict_proba(X_test)
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))
# Left plot
sns.regplot(x=df.balance, y=df.default2, order=1, ci=None,
            scatter_kws={'color':'orange'},
            line_kws={'color':'lightblue', 'lw':2}, ax=ax1)
# Right plot
ax2.scatter(X_train, y, color='orange')
ax2.plot(X_test, prob[:,1], color='lightblue')
for ax in fig.axes:
    ax.hlines(1, xmin=ax.xaxis.get_data_interval()[0],
              xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
    ax.hlines(0, xmin=ax.xaxis.get_data_interval()[0],
              xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
    ax.set_ylabel('Probability of default')
    ax.set_xlabel('Balance')
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
    ax.set_xlim(xmin=-100)
In [9]:
y = df.default2
In [10]:
# With the 'newton-cg' solver the coefficients come closest to the ones in the book;
# the solvers differ only in the optimization method used to fit the same model.
clf = skl_lm.LogisticRegression(solver='newton-cg')
X_train = df.balance.values.reshape(-1,1)
clf.fit(X_train,y)
print(clf)
print('classes: ',clf.classes_)
print('coefficients: ',clf.coef_)
print('intercept :', clf.intercept_)
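Note that scikit-learn's LogisticRegression applies L2 regularization by default (C=1.0), while the book's R glm() fit is unpenalized; that, more than the choice of solver, is the likely source of small differences in the coefficients. A quick check, reusing X_train and y from this cell with a large C to effectively switch the penalty off:
clf_unreg = skl_lm.LogisticRegression(solver='newton-cg', C=1e9)  # large C ~ (almost) no regularization
clf_unreg.fit(X_train, y)
print('coefficients: ', clf_unreg.coef_)
print('intercept   : ', clf_unreg.intercept_)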
In [13]:
X_train = sm.add_constant(df.balance)
est = sm.Logit(y, X_train).fit()
est.summary().tables[1]
Out[13]:
In [14]:
X_train = sm.add_constant(df.student2)
y = df.default2
est = sm.Logit(y, X_train).fit()
est.summary().tables[1]
Out[14]:
In [15]:
X_train = sm.add_constant(df[['balance', 'income', 'student2']])
est = sm.Logit(y, X_train).fit()
est.summary().tables[1]
Out[15]:
In [16]:
# balance and default vectors for students
X_train = df[df.student == 'Yes'].balance.values.reshape(-1,1)
y = df[df.student == 'Yes'].default2
# balance and default vectors for non-students
X_train2 = df[df.student == 'No'].balance.values.reshape(-1,1)
y2 = df[df.student == 'No'].default2
# Vector with balance values for plotting
X_test = np.arange(df.balance.min(), df.balance.max()).reshape(-1,1)
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf2 = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train,y)
clf2.fit(X_train2,y2)
prob = clf.predict_proba(X_test)
prob2 = clf2.predict_proba(X_test)
In [17]:
df.groupby(['student','default']).size().unstack('default')
Out[17]:
In [18]:
# creating plot
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))
# Left plot
ax1.plot(X_test, pd.DataFrame(prob)[1], color='orange', label='Student')
ax1.plot(X_test, pd.DataFrame(prob2)[1], color='lightblue', label='Non-student')
# Overall default rates: 127 of 2944 students and 206 of 7056 non-students default
ax1.hlines(127/2944, colors='orange', label='Overall Student',
           xmin=ax1.xaxis.get_data_interval()[0],
           xmax=ax1.xaxis.get_data_interval()[1], linestyles='dashed')
ax1.hlines(206/7056, colors='lightblue', label='Overall Non-Student',
           xmin=ax1.xaxis.get_data_interval()[0],
           xmax=ax1.xaxis.get_data_interval()[1], linestyles='dashed')
ax1.set_ylabel('Default Rate')
ax1.set_xlabel('Credit Card Balance')
ax1.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])
ax1.set_xlim(450,2500)
ax1.legend(loc=2)
# Right plot
sns.boxplot(x='student', y='balance', data=df, orient='v', ax=ax2, palette=c_palette);
In [19]:
X = df[['balance', 'income', 'student2']].values
y = df.default2.values
lda = LinearDiscriminantAnalysis(solver='svd')
y_pred = lda.fit(X, y).predict(X)
df_ = pd.DataFrame({'True default status': y,
'Predicted default status': y_pred})
df_.replace(to_replace={0:'No', 1:'Yes'}, inplace=True)
df_.groupby(['Predicted default status','True default status']).size().unstack('True default status')
Out[19]:
In [20]:
print(classification_report(y, y_pred, target_names=['No', 'Yes']))
In [21]:
decision_prob = 0.2
y_prob = lda.fit(X, y).predict_proba(X)
df_ = pd.DataFrame({'True default status': y,
'Predicted default status': y_prob[:,1] > decision_prob})
# Map 0/1 and False/True to 'No'/'Yes' (integer and boolean keys cannot share one replace dict)
df_['True default status'] = df_['True default status'].replace({0:'No', 1:'Yes'})
df_['Predicted default status'] = df_['Predicted default status'].replace({False:'No', True:'Yes'})
df_.groupby(['Predicted default status','True default status']).size().unstack('True default status')
Out[21]:
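To see how the choice of threshold trades off the two kinds of error (compare the book's plot of error rate versus threshold for the Default data), a small sketch sweeping the threshold, reusing X, y, and the fitted lda from the cells above:
thresholds = np.linspace(0.01, 0.5, 50)
y_prob = lda.predict_proba(X)[:, 1]
overall_err = [np.mean((y_prob > t).astype(int) != y) for t in thresholds]
default_err = [np.mean(y_prob[y == 1] <= t) for t in thresholds]  # missed defaulters
plt.plot(thresholds, overall_err, label='Overall error')
plt.plot(thresholds, default_err, label='Error among defaulters')
plt.xlabel('Threshold'); plt.ylabel('Error rate'); plt.legend();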
In [24]:
df = pd.read_csv('../data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)
In [25]:
X_train = df[:'2004'][['Lag1','Lag2']]
y_train = df[:'2004']['Direction']
X_test = df['2005':][['Lag1','Lag2']]
y_test = df['2005':]['Direction']
lda = LinearDiscriminantAnalysis()
pred = lda.fit(X_train, y_train).predict(X_test)
In [26]:
lda.priors_
Out[26]:
In [27]:
lda.means_
Out[27]:
In [28]:
# These do not seem to correspond to the values from the R output in the book?
lda.coef_
Out[28]:
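The mismatch is likely because scikit-learn's coef_ parameterizes the decision function, whereas R's lda() reports the discriminant directions; the scalings_ attribute is the closer analogue of the book's "Coefficients of linear discriminants" (possibly differing in sign or scale):
# Coefficients of the linear discriminant direction (closer to R's lda() output)
lda.scalings_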
In [29]:
confusion_matrix(y_test, pred).T
Out[29]:
In [30]:
print(classification_report(y_test, pred, digits=3))
In [31]:
pred_p = lda.predict_proba(X_test)
In [32]:
np.unique(pred_p[:,1]>0.5, return_counts=True)
Out[32]:
In [33]:
np.unique(pred_p[:,1]>0.9, return_counts=True)
Out[33]:
In [34]:
qda = QuadraticDiscriminantAnalysis()
pred = qda.fit(X_train, y_train).predict(X_test)
In [35]:
qda.priors_
Out[35]:
In [36]:
qda.means_
Out[36]:
In [37]:
confusion_matrix(y_test, pred).T
Out[37]:
In [38]:
print(classification_report(y_test, pred, digits=3))
In [39]:
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred, digits=3))
In [40]:
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
pred = knn.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred, digits=3))
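A small sketch to compare several values of K in one pass, reusing the Smarket train/test split from above:
for k in [1, 3, 5, 10, 50]:
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    acc = knn.fit(X_train, y_train).score(X_test, y_test)
    print('K = {:3d}: test accuracy = {:.3f}'.format(k, acc))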
In [46]:
# In R, I exported the dataset from package 'ISLR' to a csv file
df = pd.read_csv('../data/Caravan.csv')
y = df.Purchase
X = df.drop('Purchase', axis=1).astype('float64')
X_scaled = preprocessing.scale(X)
X_train = X_scaled[1000:,:]
y_train = y[1000:]
X_test = X_scaled[:1000,:]
y_test = y[:1000]
def KNN(n_neighbors=1, weights='uniform'):
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    return (pred, score, clf.classes_)
def plot_confusion_matrix(cm, classes, n_neighbors, title='Confusion matrix (Normalized)',
                          cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Normalized confusion matrix: KNN-{}'.format(n_neighbors))
    plt.colorbar()
    plt.xticks(np.arange(2), classes)
    plt.yticks(np.arange(2), classes)
    plt.tight_layout()
    plt.xlabel('True label', rotation='horizontal', ha='right')
    plt.ylabel('Predicted label')
    plt.show()
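One caveat on the scaling step in the cell above: preprocessing.scale standardizes using statistics of the full dataset, including the 1000 observations held out as a test set (this mirrors the book's R lab). A stricter alternative, sketched here with hypothetical names X_train_alt/X_test_alt, fits the scaler on the training rows only:
scaler = preprocessing.StandardScaler().fit(X.iloc[1000:])
X_train_alt = scaler.transform(X.iloc[1000:])  # scaler fitted on training rows only
X_test_alt = scaler.transform(X.iloc[:1000])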
In [47]:
for i in [1,3,5]:
    pred, score, classes = KNN(i)
    cm = confusion_matrix(y_test, pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plot_confusion_matrix(cm_normalized.T, classes, n_neighbors=i)
    cm_df = pd.DataFrame(cm.T, index=classes, columns=classes)
    cm_df.index.name = 'Predicted'
    cm_df.columns.name = 'True'
    print(cm_df)
    print(pd.DataFrame(precision_score(y_test, pred, average=None),
                       index=classes, columns=['Precision']))
In [48]:
regr = skl_lm.LogisticRegression()
regr.fit(X_train, y_train)
Out[48]:
In [49]:
pred = regr.predict(X_test)
cm_df = pd.DataFrame(confusion_matrix(y_test, pred).T, index=regr.classes_,
columns=regr.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
print(classification_report(y_test, pred))
In [50]:
pred_p = regr.predict_proba(X_test)
cm_df = pd.DataFrame({'True': y_test, 'Pred': pred_p[:,1] > .25})
cm_df['Pred'] = cm_df['Pred'].replace({True:'Yes', False:'No'})
print(cm_df.groupby(['True', 'Pred']).size().unstack('True').T)
print(classification_report(y_test, cm_df.Pred))