In [ ]:
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA
plt.style.use('bmh')
%matplotlib inline
In [ ]:
# Load the scikit-learn digits dataset (a small MNIST-like subset of
# 8x8 handwritten-digit images) and unpack features and labels.
digits = load_digits()
X, y = digits.data, digits.target
print(digits.DESCR)
In [ ]:
# Display the first five digit images in one row of grayscale panels.
fig, axes = plt.subplots(1, 5, figsize=(16, 16))
for idx, panel in enumerate(axes):
    panel.imshow(digits.images[idx], cmap='gray')
In [ ]:
y[:5]  # peek at the first five target labels
In [ ]:
# Hold out 20% of the data as a test set; fixed seed for reproducibility.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=0)
In [ ]:
# One-vs-rest logistic regression with built-in CV over the
# regularization strength, refit on the full training set at the best C.
linear_model = LogisticRegressionCV(n_jobs=-1, refit=True,
                                    multi_class='ovr', random_state=0)
linear_model.fit(Xtr, ytr)
ovr_test_preds = linear_model.predict(Xte)
print(accuracy_score(yte, ovr_test_preds))
This is very high accuracy for a linear model on a highly nonlinear problem — the digit classes are evidently close to linearly separable in the raw 64-dimensional pixel space.
In [ ]:
# Same cross-validated logistic regression, but with a true multinomial
# (softmax) loss instead of one-vs-rest.
multinom = LogisticRegressionCV(n_jobs=-1, refit=True,
                                multi_class='multinomial', random_state=0)
multinom.fit(Xtr, ytr)
print(accuracy_score(yte, multinom.predict(Xte)))
In [ ]:
# Baseline random forest with every hyperparameter at its default.
rf_model_1 = RF(n_jobs=-1, random_state=0)
rf_model_1.fit(Xtr, ytr)
baseline_preds = rf_model_1.predict(Xte)
print(accuracy_score(yte, baseline_preds))
In [ ]:
# adjusting the number of trees
scores = []
for i in np.arange(10, 310, 10):
rf_model = RF(n_estimators = i, max_features = 8, n_jobs = -1, random_state = 0)
rf_model.fit(Xtr, ytr)
rf_preds = rf_model.predict(Xte)
scores.append(accuracy_score(yte, rf_preds))
In [ ]:
# Plot accuracy against forest size; tick labels show the actual
# tree counts (10, 20, ..., 300).
figsize(12, 6)
positions = range(len(scores))
plt.plot(positions, scores, marker='o')
plt.xticks(positions, np.arange(10, 310, 10), rotation=90)
plt.xlabel('number of trees')
plt.ylabel('accuracy');
In [ ]:
# Report the best forest size in the sweep and its accuracy.
best_idx = np.argmax(scores)
print(best_idx * 10 + 10)  # map sweep index back to a tree count
print(np.max(scores))
In [ ]:
# Refit at the best tree count found above, with max_features back at
# its sqrt(n_features) default.
rf_model = RF(n_estimators=110, n_jobs=-1, random_state=0).fit(Xtr, ytr)
rf_preds = rf_model.predict(Xte)
print(accuracy_score(yte, rf_preds))
Another parameter that is almost always worth tuning is max_features, which controls how many predictors are considered (chosen at random) at each node. The default, which is usually a decent choice, is
$$\text{max_features} = \sqrt{\text{number of predictors}}.$$
In [ ]:
# Sweep max_features using a large forest (4000 trees) so run-to-run
# variance doesn't swamp the comparison.
for mf in [3, 5, 8, 9, 10, 12, 15]:
    forest = RF(n_estimators=4000, max_features=mf,
                n_jobs=-1, random_state=0)
    forest.fit(Xtr, ytr)
    acc = accuracy_score(yte, forest.predict(Xte))
    print('max_features = %s; accuracy score: %s' % (mf, acc))
In [ ]:
# Final forest with the tuned settings, trained on every labeled example.
final = RF(n_estimators=120, max_features=12,
           n_jobs=-1, random_state=0).fit(X, y)
In [ ]:
# Compare the three models with cross-validation on the full dataset.
# FIX: cross_val_score clones the estimator and refits it on each fold,
# so the original pre-fit calls (.fit(X, y) inside the argument list)
# were wasted computation with no effect on the scores — pass the
# estimators unfitted.
lm_cv_scores = cross_val_score(linear_model, X=X, y=y, scoring='accuracy')
print(lm_cv_scores, np.mean(lm_cv_scores))
mn_cv_scores = cross_val_score(multinom, X=X, y=y, scoring='accuracy')
print(mn_cv_scores, np.mean(mn_cv_scores))
rf_cv_scores = cross_val_score(final, X=X, y=y, scoring='accuracy')
print(rf_cv_scores, np.mean(rf_cv_scores))
Other quick adjustments: for images, it's common to simply mean-center the data. Feature scaling is typically unnecessary because all pixels already share the same intensity scale:
In [ ]:
np.mean(X)  # grand mean over all pixels and all samples
In [ ]:
# Center by subtracting the single grand mean (a scalar).
# NOTE(review): per-feature centering (X - X.mean(axis=0)) is the more
# common convention — confirm the scalar version is intended here.
Xsc = X - np.mean(X)
In [ ]:
# Re-score the two logistic models on the mean-centered data.
# FIX: cross_val_score clones and refits the estimator per fold, so the
# original pre-fit calls (.fit(Xsc, y)) were redundant — pass the
# estimators as-is.
lm_cv_scaled_scores = cross_val_score(linear_model, X=Xsc, y=y, scoring='accuracy')
print(lm_cv_scaled_scores, np.mean(lm_cv_scaled_scores))
mn_cv_scaled_scores = cross_val_score(multinom, X=Xsc, y=y, scoring='accuracy')
print(mn_cv_scaled_scores, np.mean(mn_cv_scaled_scores))
Can PCA help us understand the validation error?
In [ ]:
# Fit PCA with all components so we can inspect the variance spectrum.
pca = PCA().fit(X)
In [ ]:
pca.explained_variance_ratio_  # fraction of variance captured by each component
In [ ]:
# Project onto the first two (whitened) principal components, then split
# with the same seed as before so rows land in the same train/test sets.
# NOTE: this rebinds ytr/yte — harmless, since the matching random_state
# reproduces the earlier split.
Xre = PCA(n_components=2, whiten=True).fit_transform(X)
Xretr, Xrete, ytr, yte = train_test_split(Xre, y, test_size=0.2, random_state=0)
In [ ]:
# Scatter the 2-D PCA projection, colored by train/test membership.
figsize(10, 10)
for points, color, label in ((Xretr, 'blue', 'train'),
                             (Xrete, 'red', 'test')):
    plt.scatter(points[:, 0], points[:, 1], color=color, label=label)
plt.legend(loc='best');
In [ ]:
# Stack train rows on top of test rows: the first len(Xtr) rows of Xval
# are training data, the remainder are test data.
Xval = np.vstack([Xtr, Xte])
In [ ]:
print(Xtr.shape)   # training portion only
print(Xval.shape)  # train + test stacked
In [ ]:
# Label each row of Xval by which split it came from: 0 = train, 1 = test.
# BUG FIX: the train block occupies rows 0..Xtr.shape[0]-1, so test rows
# start at index Xtr.shape[0] (1437 here) — the hard-coded 1438 mislabeled
# the first test row. Derive both sizes from the arrays instead of
# hard-coding 1797/1438.
sets = np.zeros(Xval.shape[0])
sets[Xtr.shape[0]:] = 1  # tag the test data
In [ ]:
# Adversarial-validation probe: train a classifier to predict whether a
# row came from the train or the test split.
val_mod = LogisticRegressionCV(scoring='accuracy', random_state=0, n_jobs=-1).fit(Xval, sets)
In [ ]:
# BUG FIX: `sets` is aligned with the row order of Xval (train rows first,
# then test rows), but the original scored predictions on X, whose rows
# are in the pre-split order — labels and predictions were misaligned.
# Score on Xval. (For a proper ranking-based AUC, predict_proba(Xval)[:, 1]
# would be preferable to hard 0/1 predictions.)
roc_auc_score(sets, val_mod.predict(Xval))  # ~0.5 => model can't distinguish train and test sets
So the train and test sets are not distinguishable at a level detectable by logistic regression — a reassuring sign that the split is unbiased. Detecting any subtler distribution shift would require a more powerful model for data this complex!
In [ ]: