In [1]:
import numpy as np
from skimage.feature import hog
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn import datasets
from time import time

In [2]:
dataset = datasets.fetch_mldata("MNIST Original", data_home=".")

In [3]:
features = np.array(dataset.data, 'int16')
labels = np.array(dataset.target, 'int')

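As a quick sanity check (an aside, not part of the original run), the raw arrays can be inspected before filtering: fetch_mldata returns the full 70,000-sample MNIST set as flat 784-pixel rows.

print features.shape      # expected: (70000, 784)
print labels.shape        # expected: (70000,)
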
In [4]:
# find every sample labeled 9 and drop it
f = []
l = []

for i, label in enumerate(labels):
    if label != 9:
        f.append(features[i])
        l.append(labels[i])
        
features = np.array(f, 'int16')
labels = np.array(l, 'int')

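The loop above can also be written as a vectorized NumPy filter; a minimal equivalent sketch (same result, not the code that was actually run):

mask = labels != 9           # True for every sample that is not a 9
features = features[mask]
labels = labels[mask]
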
In [5]:
t0 = time()
list_hog_fd = []
for feature in features:
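    # compute one HOG descriptor per 28x28 digit image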
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"


elapsed time :  53.523 s

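With 14x14-pixel cells on a 28x28 image there are 2x2 = 4 cells, and 1x1-cell blocks with 9 orientation bins give 4 * 9 = 36 values per digit. A quick check of the resulting matrix (an aside, not in the original run):

print hog_features.shape     # expected: (n_samples, 36)
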
In [10]:
parameters = {'alpha' : [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)

In [11]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.8s finished
elapsed time :  0.947 s
best score is 0.631626534691
best parameter is {'alpha': 1}

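The weak score is not surprising: BernoulliNB binarizes its input at the binarize threshold (0.0 by default), so for the non-negative HOG values only the presence or absence of each orientation bin survives and all magnitude information is discarded. A hedged extension that also searches the threshold (an assumption, not part of the original run):

parameters = {'alpha' : [1, 10, 100, 1000], 'binarize' : [0.0, 0.05, 0.1, 0.2]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)
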
In [12]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_BernoulliNB.pkl", compress=3)


Out[12]:
['./pkl/hog/mnist/digits_BernoulliNB.pkl']

In [13]:
parameters = {'min_samples_split' : [2, 10, 15, 20], 'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)

In [14]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 114 out of 120 | elapsed:  6.6min remaining:   20.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  7.3min finished
elapsed time :  477.129 s
best score is 0.935630214777
best parameter is {'min_samples_split': 2, 'n_estimators': 100}

In [15]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_RandomForestClassifier.pkl", compress=3)


Out[15]:
['./pkl/hog/mnist/digits_RandomForestClassifier.pkl']

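Note that n_estimators = 100 won at the very edge of the searched grid, so more trees might still help. The per-candidate CV scores can be listed from grid_scores_ (an attribute of the old grid_search.GridSearchCV used here) to see whether accuracy is still climbing; a short sketch:

for params, mean_score, scores in clf.grid_scores_:
    print "%0.4f (+/-%0.4f) for %r" % (mean_score, scores.std(), params)
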
In [16]:
parameters = {'n_neighbors' : [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)

In [17]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.0min finished
elapsed time :  179.296 s
best score is 0.931617017227
best parameter is {'n_neighbors': 10}

In [18]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_KNeighborsClassifier.pkl", compress=3)


Out[18]:
['./pkl/hog/mnist/digits_KNeighborsClassifier.pkl']

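Distance-weighted voting is often worth trying alongside the neighbor count; a hedged extension of the k-NN grid above (an assumption, not part of the original run):

parameters = {'n_neighbors' : [5, 10, 15, 20], 'weights' : ['uniform', 'distance']}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)
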
In [19]:
parameters = {'min_samples_split' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)

In [20]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  24 out of  30 | elapsed:   18.6s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   24.3s finished
elapsed time :  29.05 s
best score is 0.820088195171
best parameter is {'min_samples_split': 10}

In [21]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_DecisionTreeClassifier.pkl", compress=3)


Out[21]:
['./pkl/hog/mnist/digits_DecisionTreeClassifier.pkl']

In [22]:
parameters = {'C':10. ** np.arange(1,5), 'gamma':2. ** np.arange(-5,-1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)

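The two expressions expand to log-spaced grids, the usual way to search C and gamma for an RBF kernel:

print 10. ** np.arange(1, 5)     # 10, 100, 1000, 10000
print 2. ** np.arange(-5, -1)    # 0.03125, 0.0625, 0.125, 0.25
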
In [23]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed:  5.1min remaining:   43.5s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.8min finished
elapsed time :  368.873 s
best score is 0.952571301672
best parameter is {'C': 100.0, 'gamma': 0.25}

In [24]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_SVC.pkl", compress=3)


Out[24]:
['./pkl/hog/mnist/digits_SVC.pkl']

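The winning gamma = 0.25 sits at the top of the searched range, so the true optimum may lie beyond the grid. A hedged follow-up search around the current best (an assumption, not part of the original run):

parameters = {'C' : 10. ** np.arange(1, 4), 'gamma' : 2. ** np.arange(-3, 2)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)
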
In [6]:
parameters = {'C':10. ** np.arange(1,5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)

In [7]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:  1.4min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.2min finished
elapsed time :  218.246 s
best score is 0.891675391009
best parameter is {'C': 10.0}

In [8]:
joblib.dump(clf.best_estimator_, "./pkl/hog/mnist/digits_LinearSVC.pkl", compress=3)


Out[8]:
['./pkl/hog/mnist/digits_LinearSVC.pkl']

In [9]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)

# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING LOGISTIC REGRESSION
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    4.6s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   14.5s finished
done in 19.910s
best score: 0.893
LOGISTIC REGRESSION PARAMETERS
	 C: 100.000000

In [11]:
clf = LogisticRegression(C=100.0)
clf.fit(hog_features, labels)
joblib.dump(clf, "./pkl/hog/mnist/digits_LogisticRegression.pkl", compress=3)


Out[11]:
['./pkl/hog/mnist/digits_LogisticRegression.pkl']

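Any of the pickles written above can be restored with joblib.load and used directly; a minimal sketch:

clf = joblib.load("./pkl/hog/mnist/digits_LogisticRegression.pkl")
print clf.predict(hog_features[:10])
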
In [12]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

# perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
# C for Logistic Regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}

# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)

# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING RBM + LOGISTIC REGRESSION
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 35.5min
[Parallel(n_jobs=-1)]: Done 237 out of 243 | elapsed: 42.8min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 45.7min finished
done in 2796.275s
best score: 0.793
RBM + LOGISTIC REGRESSION PARAMETERS
	 logistic__C: 100.000000
	 rbm__learning_rate: 0.001000
	 rbm__n_components: 200.000000
	 rbm__n_iter: 20.000000

In [ ]:
rbm = BernoulliRBM(n_components=200, n_iter=20, learning_rate=0.001, verbose=True)
logistic = LogisticRegression(C=100.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(hog_features, labels)
joblib.dump(classifier, "./pkl/hog/mnist/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)


[BernoulliRBM] Iteration 1, pseudo-likelihood = -7.17, time = 1.87s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -7.11, time = 2.93s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -7.13, time = 2.93s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -7.15, time = 2.92s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -7.08, time = 2.92s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -7.06, time = 2.91s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -7.07, time = 2.92s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -7.14, time = 2.92s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -7.12, time = 2.91s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -7.12, time = 2.91s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -7.14, time = 2.91s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -7.12, time = 2.91s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -7.08, time = 2.91s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -7.18, time = 2.90s
[BernoulliRBM] Iteration 15, pseudo-likelihood = -7.13, time = 2.91s
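
All of the scores above are 3-fold cross-validation scores on the training data; a held-out split would give a cleaner final number. A minimal sketch, assuming the old sklearn.cross_validation API that matches the grid_search module used throughout:

from sklearn.cross_validation import train_test_split

# hold out 20% of the HOG features and score a saved model on them
X_train, X_test, y_train, y_test = train_test_split(
    hog_features, labels, test_size=0.2, random_state=42)
clf = joblib.load("./pkl/hog/mnist/digits_SVC.pkl")
clf.fit(X_train, y_train)
print "held-out accuracy : %0.4f" % clf.score(X_test, y_test)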