In [ ]:
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA

plt.style.use('bmh')

%matplotlib inline

In [ ]:
digits = load_digits()  # subset of MNIST
X = digits.data
y = digits.target
print(digits.DESCR)

In [ ]:
fig, ax = plt.subplots(1,5, figsize=(16, 16))
for i in range(5):
    ax[i].imshow(digits.images[i], cmap='gray');

In [ ]:
y[:5]  # peek at the first five target labels (integer digits)

In [ ]:
# 80/20 train/test split; fixed random_state for reproducibility
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=0)

In [ ]:
linear_model = LogisticRegressionCV(n_jobs=-1, refit=True, multi_class='ovr', random_state=0)
linear_model.fit(Xtr, ytr)
linear_preds = linear_model.predict(Xte)
print(accuracy_score(yte, linear_preds))

Very high accuracy for a linear model on a highly nonlinear problem.


In [ ]:
multinom = LogisticRegressionCV(n_jobs=-1, refit=True, multi_class='multinomial', random_state=0)
multinom.fit(Xtr, ytr)
multinom_preds = multinom.predict(Xte)
print(accuracy_score(yte, multinom_preds))

In [ ]:
# untuned (default) random forest model

rf_model_1 = RF(n_jobs = -1, random_state = 0)
rf_model_1.fit(Xtr, ytr)
rf_preds_1 = rf_model_1.predict(Xte)
print(accuracy_score(yte, rf_preds_1))

In [ ]:
# adjusting the number of trees

scores = []

for i in np.arange(10, 310, 10):
    rf_model = RF(n_estimators = i, max_features = 8, n_jobs = -1, random_state = 0)
    rf_model.fit(Xtr, ytr)
    rf_preds = rf_model.predict(Xte)
    scores.append(accuracy_score(yte, rf_preds))

In [ ]:
figsize(12, 6)
plt.plot(range(len(scores)), scores, marker='o')
plt.xlabel('number of trees')
labs = np.arange(10, 310, 10)
plt.xticks(range(len(scores)), labs, rotation=90)
plt.ylabel('accuracy');

In [ ]:
print(np.argmax(scores)*10+10)  # number of trees at argmax
print(np.max(scores))

In [ ]:
rf_model = RF(n_estimators = 110, n_jobs = -1, random_state = 0)
rf_model.fit(Xtr, ytr)
rf_preds = rf_model.predict(Xte)
print(accuracy_score(yte, rf_preds))

Another hyperparameter that is always worth tuning is 'max_features'. This controls the number of predictors chosen (at random) at each node. The default, which is usually decent, is

$$\text{max_features} = \sqrt{\text{number of predictors}}.$$

In [ ]:
# tuning max_features

for i in [3, 5, 8, 9, 10, 12, 15]:
    rf_model = RF(n_estimators = 4000, max_features = i, n_jobs = -1, random_state = 0)  # 4000 for stability 
    rf_model.fit(Xtr, ytr)
    rf_preds = rf_model.predict(Xte)
    print('max_features = %s; accuracy score: %s' % (i, accuracy_score(yte, rf_preds)))

In [ ]:
# Final model, refit on the full dataset for the cross-validation comparison below.
# NOTE(review): n_estimators=120 differs from the 110 found in the tree-count
# sweep, and max_features=12 presumably comes from the sweep above — confirm intended.
final = RF(n_estimators=120, max_features=12, n_jobs=-1, random_state=0)
final.fit(X, y)

In [ ]:
lm_cv_scores = cross_val_score(linear_model.fit(X, y), X=X, y=y, scoring='accuracy')
print(lm_cv_scores, np.mean(lm_cv_scores))
mn_cv_scores = cross_val_score(multinom.fit(X, y), X=X, y=y, scoring='accuracy')
print(mn_cv_scores, np.mean(mn_cv_scores))
rf_cv_scores = cross_val_score(final, X=X, y=y, scoring='accuracy')
print(rf_cv_scores, np.mean(rf_cv_scores))

Other quick adjustments: for images, it's common to simply mean-center the data. Scaling to unit variance is not typically needed, since all pixels already share a common intensity scale:


In [ ]:
np.mean(X)  # grand mean over all samples and all 64 pixel positions (a scalar)

In [ ]:
# Center by the scalar grand mean — every entry is shifted by the same amount.
# (Per-pixel centering would be X - np.mean(X, axis=0); the prose above asks
# for simple mean-centering, which this does.)
Xsc = X - np.mean(X)

In [ ]:
lm_cv_scaled_scores = cross_val_score(linear_model.fit(Xsc, y), X=Xsc, y=y, scoring='accuracy')
print(lm_cv_scaled_scores, np.mean(lm_cv_scaled_scores))
mn_cv_scaled_scores = cross_val_score(multinom.fit(Xsc, y), X=Xsc, y=y, scoring='accuracy')
print(mn_cv_scaled_scores, np.mean(mn_cv_scaled_scores))

Maybe we can understand the validation error with PCA?


In [ ]:
# Fit PCA with all components retained, to inspect the variance spectrum.
pca = PCA().fit(X)

In [ ]:
pca.explained_variance_ratio_  # fraction of total variance per principal component

In [ ]:
# Project to the first two (whitened) principal components for a 2-D view.
# NOTE(review): this rebinds ytr/yte from the earlier 64-dim split; with the
# same random_state=0 the row split should match, but the shadowing is easy
# to trip over on re-runs — consider distinct names.
Xre = PCA(n_components=2, whiten=True).fit_transform(X)
Xretr, Xrete, ytr, yte = train_test_split(Xre, y, test_size=0.2, random_state=0)

In [ ]:
figsize(10, 10)
plt.scatter(Xretr[:,0], Xretr[:, 1], color='blue', label='train')
plt.scatter(Xrete[:,0], Xrete[:,1], color='red', label='test')
plt.legend(loc='best');

In [ ]:
# Stack the original 64-dim train rows on top of the test rows: the first
# len(Xtr) rows of Xval are training samples, the remainder are test samples.
Xval = np.vstack([Xtr, Xte])

In [ ]:
print(Xtr.shape)
print(Xval.shape)  # all 1797 samples, restacked train-then-test

In [ ]:
sets = np.zeros(1797)
sets[1438:] = 1  # tag the test data

In [ ]:
# Adversarial-validation probe: train a classifier to tell train rows from
# test rows; if it succeeds, the two splits differ in distribution.
val_mod = LogisticRegressionCV(scoring='accuracy', random_state=0, n_jobs=-1).fit(Xval, sets)

In [ ]:
roc_auc_score(sets, val_mod.predict(X))  # model can't distinguish train and test sets

So our train and validation sets are not distinguishable — at least not at a level detectable by a logistic regression model. That is reassuring: the validation error is not explained by a train/test distribution mismatch.


In [ ]: