In [ ]:
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA

plt.style.use('bmh')

%matplotlib inline

In [ ]:
digits = load_digits()  # subset of MNIST
X = digits.data
y = digits.target
print(digits.DESCR)

In [ ]:
fig, ax = plt.subplots(1,5, figsize=(16, 16))
for i in range(5):
    ax[i].imshow(digits.images[i], cmap='gray');

In [ ]:
y[:5]  # peek at the first five target labels (integer digits)

In [ ]:
# 80/20 train/test split; fixed random_state for reproducibility
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=0)

In [ ]:
linear_model = LogisticRegressionCV(n_jobs=-1, refit=True, multi_class='ovr', random_state=0)
linear_model.fit(Xtr, ytr)
linear_preds = linear_model.predict(Xte)
print(accuracy_score(yte, linear_preds))

Very high accuracy for a linear model on a highly nonlinear problem.


In [ ]:
multinom = LogisticRegressionCV(n_jobs=-1, refit=True, multi_class='multinomial', random_state=0)
multinom.fit(Xtr, ytr)
multinom_preds = multinom.predict(Xte)
print(accuracy_score(yte, multinom_preds))

In [ ]:
# untuned (default) random forest model

rf_model_1 = RF(n_jobs = -1, random_state = 0)
rf_model_1.fit(Xtr, ytr)
rf_preds_1 = rf_model_1.predict(Xte)
print(accuracy_score(yte, rf_preds_1))

In [ ]:
# adjusting the number of trees

scores = []

for i in np.arange(10, 310, 10):
    rf_model = RF(n_estimators = i, max_features = 8, n_jobs = -1, random_state = 0)
    rf_model.fit(Xtr, ytr)
    rf_preds = rf_model.predict(Xte)
    scores.append(accuracy_score(yte, rf_preds))

In [ ]:
figsize(12, 6)
plt.plot(range(len(scores)), scores, marker='o')
plt.xlabel('number of trees')
labs = np.arange(10, 310, 10)
plt.xticks(range(len(scores)), labs, rotation=90)
plt.ylabel('accuracy');

In [ ]:
print(np.argmax(scores)*10+10)  # number of trees at argmax
print(np.max(scores))

In [ ]:
rf_model = RF(n_estimators = 110, n_jobs = -1, random_state = 0)
rf_model.fit(Xtr, ytr)
rf_preds = rf_model.predict(Xte)
print(accuracy_score(yte, rf_preds))

Another hyperparameter that is always worth tuning is 'max_features'. This controls the number of predictors chosen (at random) at each node. The default, which is usually decent, is

$$\text{max_features} = \sqrt{\text{number of predictors}}.$$

In [ ]:
# tuning max_features

for i in [3, 5, 8, 9, 10, 12, 15]:
    rf_model = RF(n_estimators = 4000, max_features = i, n_jobs = -1, random_state = 0)  # 4000 for stability 
    rf_model.fit(Xtr, ytr)
    rf_preds = rf_model.predict(Xte)
    print('max_features = %s; accuracy score: %s' % (i, accuracy_score(yte, rf_preds)))

In [ ]:
# Final model, refit on the full dataset for the cross-validation comparison below.
# NOTE(review): n_estimators=120 differs from the 110 found in the tree-count
# sweep, and max_features=12 presumably comes from the sweep above — confirm intended.
final = RF(n_estimators=120, max_features=12, n_jobs=-1, random_state=0)
final.fit(X, y)

In [ ]:
lm_cv_scores = cross_val_score(linear_model.fit(X, y), X=X, y=y, scoring='accuracy')
print(lm_cv_scores, np.mean(lm_cv_scores))
mn_cv_scores = cross_val_score(multinom.fit(X, y), X=X, y=y, scoring='accuracy')
print(mn_cv_scores, np.mean(mn_cv_scores))
rf_cv_scores = cross_val_score(final, X=X, y=y, scoring='accuracy')
print(rf_cv_scores, np.mean(rf_cv_scores))

Other quick adjustments: for images, it's common to simply mean-center the data. Scaling to unit variance is not typically needed, since all pixels already share a common intensity scale:


In [ ]:
np.mean(X)  # grand mean over all samples and all 64 pixel positions (a scalar)

In [ ]:
# Center by the scalar grand mean — every entry is shifted by the same amount.
# (Per-pixel centering would be X - np.mean(X, axis=0); the prose above asks
# for simple mean-centering, which this does.)
Xsc = X - np.mean(X)

In [ ]:
lm_cv_scaled_scores = cross_val_score(linear_model.fit(Xsc, y), X=Xsc, y=y, scoring='accuracy')
print(lm_cv_scaled_scores, np.mean(lm_cv_scaled_scores))
mn_cv_scaled_scores = cross_val_score(multinom.fit(Xsc, y), X=Xsc, y=y, scoring='accuracy')
print(mn_cv_scaled_scores, np.mean(mn_cv_scaled_scores))

Maybe we can understand the validation error with PCA?


In [ ]:
# Fit PCA with all components retained, to inspect the variance spectrum.
pca = PCA().fit(X)

In [ ]:
pca.explained_variance_ratio_  # fraction of total variance per principal component

In [ ]:
# Project to the first two (whitened) principal components for a 2-D view.
# NOTE(review): this rebinds ytr/yte from the earlier 64-dim split; with the
# same random_state=0 the row split should match, but the shadowing is easy
# to trip over on re-runs — consider distinct names.
Xre = PCA(n_components=2, whiten=True).fit_transform(X)
Xretr, Xrete, ytr, yte = train_test_split(Xre, y, test_size=0.2, random_state=0)

In [ ]:
figsize(10, 10)
plt.scatter(Xretr[:,0], Xretr[:, 1], color='blue', label='train')
plt.scatter(Xrete[:,0], Xrete[:,1], color='red', label='test')
plt.legend(loc='best');

In [ ]:
# Stack the original 64-dim train rows on top of the test rows: the first
# len(Xtr) rows of Xval are training samples, the remainder are test samples.
Xval = np.vstack([Xtr, Xte])

In [ ]:
print(Xtr.shape)
print(Xval.shape)  # all 1797 samples, restacked train-then-test

In [ ]:
sets = np.zeros(1797)
sets[1438:] = 1  # tag the test data

In [ ]:
# Adversarial-validation probe: train a classifier to tell train rows from
# test rows; if it succeeds, the two splits differ in distribution.
val_mod = LogisticRegressionCV(scoring='accuracy', random_state=0, n_jobs=-1).fit(Xval, sets)

In [ ]:
roc_auc_score(sets, val_mod.predict(X))  # model can't distinguish train and test sets

So our train and validation sets are not distinguishable — at least not at a level detectable by a logistic regression model. That is reassuring: the validation error is not explained by a train/test distribution mismatch.


In [ ]: