In [1]:
import numpy as np
from skimage.feature import hog
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn import datasets
from time import time

In [2]:
dataset = datasets.fetch_mldata("MNIST Original", data_home=".")

In [3]:
features = np.array(dataset.data, 'int16')
labels = np.array(dataset.target, 'int')
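
A quick sanity check on what fetch_mldata returned (a minimal sketch; the expected counts assume the standard 70,000-sample mldata.org dump):

print(features.shape)     # expected: (70000, 784) - flattened 28x28 images
print(labels.shape)       # expected: (70000,)
print(np.unique(labels))  # expected: [0 1 2 3 4 5 6 7 8 9]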

In [4]:
# drop every sample labeled 9
mask = labels != 9
features = features[mask]
labels = labels[mask]
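
To confirm the filter worked, a small check (a sketch; the exact row count depends on how many nines the dump contains, roughly 7,000):

assert (labels != 9).all()   # no nines survived
print(np.bincount(labels))   # per-class counts for digits 0-8
print(features.shape)        # roughly 63,000 rows remain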

In [5]:
t0 = time()
def scale(X, eps = 0.001):
    # scale each column of the feature space (i.e. each pixel) into [0, 1];
    # eps guards against division by zero on the constant all-zero border pixels
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)

features = features.astype("float32")
features = scale(features)

print "escape time : ", round(time()-t0, 3), "s"


escape time :  0.457 s
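
scikit-learn's MinMaxScaler does nearly the same job; the difference is that it divides each column by (max - min) rather than max + eps, and recent releases pass constant columns through instead of dividing by zero. A sketch (features_raw is a hypothetical stand-in for the unscaled array, since the notebook overwrites features in place):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
# features_raw: stand-in for the unscaled int16 pixel matrix loaded earlier
features_alt = scaler.fit_transform(features_raw.astype("float32"))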

In [15]:
parameters = {'alpha' : [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)

In [16]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    3.7s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.9s finished
elapsed time :  5.734 s
best score is 0.857111132261
best parameter is {'alpha': 1}
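
To see how sensitive the score is to alpha, the per-candidate cross-validation results live on the fitted search object (a sketch against the old sklearn.grid_search API this notebook imports; newer releases expose cv_results_ instead):

for params, mean_score, cv_scores in clf.grid_scores_:
    print("alpha=%-6s mean CV accuracy: %.4f (+/- %.4f)"
          % (params['alpha'], mean_score, cv_scores.std()))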

In [17]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_BernoulliNB.pkl", compress=3)


Out[17]:
['./pkl/scale/mnist/digits_BernoulliNB.pkl']
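
The compressed pickle can be restored later without re-running the search; joblib.load mirrors joblib.dump (a sketch):

nb = joblib.load("./pkl/scale/mnist/digits_BernoulliNB.pkl")
print(nb.predict(features[:10]))  # predictions for the first ten samples
print(labels[:10])                # ground truth for comparison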

In [18]:
parameters = {'min_samples_split' : [2, 10, 15, 20], 'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)

In [19]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 114 out of 120 | elapsed:  8.4min remaining:   26.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  9.2min finished
elapsed time :  587.678 s
best score is 0.972589702103
best parameter is {'min_samples_split': 2, 'n_estimators': 80}
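
Random forests also expose per-pixel importances, which for MNIST can be read back as a 28x28 saliency map (a sketch; nothing below is persisted):

importances = clf.best_estimator_.feature_importances_  # shape (784,), sums to 1
saliency = importances.reshape(28, 28)
print(saliency.sum(axis=1).round(3))  # per-row mass: near zero at the borders, largest mid-image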

In [20]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_RandomForestClassifier.pkl", compress=3)


Out[20]:
['./pkl/scale/mnist/digits_RandomForestClassifier.pkl']

In [21]:
parameters = {'n_neighbors' : [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)

In [22]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 40.6min remaining: 40.6min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 61.0min finished
elapsed time :  3691.731 s
best score is 0.973002125567
best parameter is {'n_neighbors': 5}

In [23]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_KNeighborsClassifier.pkl", compress=3)


Out[23]:
['./pkl/scale/mnist/digits_KNeighborsClassifier.pkl',
 './pkl/scale/mnist/digits_KNeighborsClassifier.pkl_01.npy.z',
 './pkl/scale/mnist/digits_KNeighborsClassifier.pkl_02.npy.z']
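
Most of the 61 minutes above is prediction, not fitting: k-NN merely stores the training set, and each cross-validation fold must compute distances from every validation point to every stored point. A rough sketch of that asymmetry (timings are machine-dependent):

t0 = time()
knn = KNeighborsClassifier(n_neighbors=5).fit(features, labels)
print("fit time    : %.3f s" % (time() - t0))  # near-instant: just stores the data

t0 = time()
knn.predict(features[:100])                    # distances to all ~63,000 stored points
print("predict time: %.3f s" % (time() - t0))  # dominated by the distance search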

In [24]:
parameters = {'min_samples_split' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)

In [25]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  24 out of  30 | elapsed:  1.5min remaining:   21.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min finished
elapsed time :  134.192 s
best score is 0.88022270867
best parameter is {'min_samples_split': 5}

In [26]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_DecisionTreeClassifier.pkl", compress=3)


Out[26]:
['./pkl/scale/mnist/digits_DecisionTreeClassifier.pkl']
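
The fitted tree itself can be inspected; its depth and node count hint at why a single tree (0.880) trails the 80-tree forest above (0.973). A sketch using the public tree_ attribute:

tree = clf.best_estimator_.tree_
print("depth      : %d" % tree.max_depth)
print("node count : %d" % tree.node_count)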

In [27]:
parameters = {'C':10. ** np.arange(1,5), 'gamma':2. ** np.arange(-5,-1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)

In [29]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed: 446.3min remaining: 63.8min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 553.4min finished
elapsed time :  33658.405 s
best score is 0.986310713493
best parameter is {'C': 10.0, 'gamma': 0.03125}

In [30]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_SVC.pkl", compress=3)


Out[30]:
['./pkl/scale/mnist/digits_SVC.pkl']
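
Note that both winning values (C = 10.0 and gamma = 0.03125 = 2**-5) sit at the lower edge of their grids, which usually means the search range should be extended. A sketch of a follow-up grid (not run here):

parameters = {'C': 10. ** np.arange(-1, 3),      # 0.1, 1, 10, 100
              'gamma': 2. ** np.arange(-9, -4)}  # 2^-9 ... 2^-5, past the old edge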

In [6]:
parameters = {'C':10. ** np.arange(1,5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)

In [7]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  4.0min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  6.2min finished
elapsed time :  507.943 s
best score is 0.913613146791
best parameter is {'C': 10.0}

In [8]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_LinearSVC.pkl", compress=3)


Out[8]:
['./pkl/scale/mnist/digits_LinearSVC.pkl']

In [9]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(features, labels)

# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING LOGISTIC REGRESSION
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.0min finished
done in 396.757s
best score: 0.928
LOGISTIC REGRESSION PARAMETERS
	 C: 1.000000

In [11]:
clf = LogisticRegression(C = 1.0)
clf.fit(features, labels)
joblib.dump(clf, "./pkl/scale/mnist/digits_LogisticRegression.pkl", compress=3)


Out[11]:
['./pkl/scale/mnist/digits_LogisticRegression.pkl']
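
Unlike LinearSVC, LogisticRegression exposes per-class probabilities via predict_proba, which is one reason to keep it around despite the lower accuracy (a sketch using the refit model above):

proba = clf.predict_proba(features[:1])  # shape (1, 9): one column per remaining digit
print(proba.round(3))
print("predicted digit: %d" % clf.classes_[proba.argmax()])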

In [12]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

# perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
# C for Logistic Regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}

# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(features, labels)

# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING RBM + LOGISTIC REGRESSION
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed: 109.5min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 481.2min
[Parallel(n_jobs=-1)]: Done 237 out of 243 | elapsed: 577.6min remaining: 14.6min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 610.0min finished
done in 37656.420s
best score: 0.965
RBM + LOGISTIC REGRESSION PARAMETERS
	 logistic__C: 10.000000
	 rbm__learning_rate: 0.010000
	 rbm__n_components: 200.000000
	 rbm__n_iter: 80.000000
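
Inside the pipeline, fit() chains the two stages: the RBM is fit unsupervised on the pixels, and its hidden-unit activations become the features the logistic regression trains on. The equivalent two-step version (a sketch reusing the rbm and logistic objects defined above):

hidden = rbm.fit_transform(features)  # hidden-unit activations, shape (n_samples, n_components)
logistic.fit(hidden, labels)          # classifier trained on the RBM features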

In [14]:
# note: the search above favored n_iter=80; n_iter=20 is used here, presumably to cut training time
rbm = BernoulliRBM(n_components = 200, n_iter = 20, learning_rate = 0.01, verbose = True)
logistic = LogisticRegression(C = 10.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(features, labels)
joblib.dump(classifier, "./pkl/scale/mnist/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)


[BernoulliRBM] Iteration 1, pseudo-likelihood = -115.42, time = 15.11s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -99.54, time = 15.35s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -91.00, time = 15.21s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -86.31, time = 15.18s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -85.49, time = 15.25s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -83.11, time = 15.22s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -80.92, time = 15.20s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -77.90, time = 15.21s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -79.70, time = 15.22s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -75.76, time = 15.19s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -76.44, time = 15.22s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -77.50, time = 15.21s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -76.86, time = 15.21s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -76.16, time = 15.20s
[BernoulliRBM] Iteration 15, pseudo-likelihood = -75.68, time = 15.22s
[BernoulliRBM] Iteration 16, pseudo-likelihood = -74.08, time = 15.21s
[BernoulliRBM] Iteration 17, pseudo-likelihood = -73.65, time = 15.23s
[BernoulliRBM] Iteration 18, pseudo-likelihood = -72.68, time = 15.19s
[BernoulliRBM] Iteration 19, pseudo-likelihood = -73.84, time = 15.23s
[BernoulliRBM] Iteration 20, pseudo-likelihood = -73.24, time = 15.23s
Out[14]:
['./pkl/scale/mnist/digits_LogisticRegression_BernoulliRBM.pkl']
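
One closing caveat: every model above was fit on the full dataset, and the only quality numbers reported are the 3-fold CV scores from the searches themselves. A minimal sketch of a proper held-out evaluation (train_test_split comes from the old sklearn.cross_validation API matching this notebook's imports; nothing below was run here):

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

model = BernoulliNB(alpha=1).fit(X_train, y_train)  # best params from the search above
print("held-out accuracy: %.4f" % model.score(X_test, y_test))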