In [23]:
import numpy as np
from skimage.feature import hog
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from time import time

In [2]:
features = joblib.load("./mldata/features_1000.mat")
labels = joblib.load("./mldata/lables_1000.mat")

In [3]:
features = np.array(features, 'int16')
labels = np.array(labels, 'int')

In [4]:
t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"


elapsed time :  4.292 s
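
With these settings, HOG divides each 28x28 digit into a 2x2 grid of 14x14-pixel cells, and cells_per_block=(1, 1) keeps every cell as its own block, so each descriptor has 2 * 2 * 9 = 36 values. A quick sanity check (a sketch, not run in this session):

# each descriptor should be 36-dimensional: 2x2 cells x 9 orientations
assert hog_features.shape == (len(features), 36)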

In [5]:
parameters = {'alpha': [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)

In [6]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
elapsed time :  0.145 s
best score is 0.713
best parameter is {'alpha': 1}

In [7]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_BernoulliNB.pkl", compress=3)


Out[7]:
['./pkl/hog/skt/digits_BernoulliNB.pkl']
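
Note that clf.best_score_ is a 3-fold cross-validation score computed on the full dataset; no separate test set is held out. If a held-out estimate is wanted, train_test_split (already imported above) can be used first. A minimal sketch, with test_size and random_state chosen arbitrarily:

# run the grid search on a training split and score on data it never sees
X_train, X_test, y_train, y_test = train_test_split(hog_features, labels, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
print "held-out accuracy : ", clf.score(X_test, y_test)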

In [8]:
parameters = {'min_samples_split': [2, 10, 15, 20], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)

In [9]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 114 out of 120 | elapsed:   21.1s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   23.0s finished
elapsed time :  24.042 s
best score is 0.9322
best parameter is {'min_samples_split': 10, 'n_estimators': 70}

In [10]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_RandomForestClassifier.pkl", compress=3)


Out[10]:
['./pkl/hog/skt/digits_RandomForestClassifier.pkl']
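
To reuse a persisted model later, reload the pickle and push new images through the same HOG pipeline. A hedged sketch, where digit_img stands for a hypothetical 28x28 grayscale array (not a variable from this notebook):

model = joblib.load("./pkl/hog/skt/digits_RandomForestClassifier.pkl")
fd = hog(digit_img.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
print model.predict(fd.reshape(1, -1))  # predicted digit label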

In [11]:
parameters = {'n_neighbors': [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)

In [12]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.7s finished
elapsed time :  0.859 s
best score is 0.9304
best parameter is {'n_neighbors': 5}

In [13]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_KNeighborsClassifier.pkl", compress=3)


Out[13]:
['./pkl/hog/skt/digits_KNeighborsClassifier.pkl']
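
The winning n_neighbors (5) is the smallest value in the grid, so the true optimum may lie below it. A follow-up search probing past that boundary (a sketch, not run here):

parameters = {'n_neighbors': [1, 3, 5, 7, 9]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)
clf.fit(hog_features, labels)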

In [14]:
parameters = {'min_samples_split': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)

In [15]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  24 out of  30 | elapsed:    0.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.0s finished
elapsed time :  1.233 s
best score is 0.8904
best parameter is {'min_samples_split': 5}

In [16]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_DecisionTreeClassifier.pkl", compress=3)


Out[16]:
['./pkl/hog/skt/digits_DecisionTreeClassifier.pkl']
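
Here too the best min_samples_split (5) sits at the lower edge of the searched range. Widening the grid downward, and optionally searching max_depth alongside it, could be worthwhile (a sketch; the max_depth values are assumptions):

parameters = {'min_samples_split': [2, 3, 5, 8], 'max_depth': [10, 20, None]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)
clf.fit(hog_features, labels)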

In [17]:
parameters = {'C': 10. ** np.arange(1, 5), 'gamma': 2. ** np.arange(-5, -1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)

In [18]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed:    7.8s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    8.6s finished
elapsed time :  9.293 s
best score is 0.9294
best parameter is {'C': 100.0, 'gamma': 0.125}

In [19]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_SVC.pkl", compress=3)


Out[19]:
['./pkl/hog/skt/digits_SVC.pkl']
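
Both winners (C=100, gamma=0.125) are interior points of the grid, so the ranges need no widening; a finer pass around the optimum might still help (a sketch; the refined values are assumptions):

parameters = {'C': [30., 100., 300.], 'gamma': [0.06, 0.125, 0.25]}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)
clf.fit(hog_features, labels)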

In [26]:
parameters = {'C': 10. ** np.arange(1, 5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)

In [27]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    4.5s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    9.1s finished
elapsed time :  11.406 s
best score is 0.9174
best parameter is {'C': 100.0}

In [28]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_LinearSVC.pkl", compress=3)


Out[28]:
['./pkl/hog/skt/digits_LinearSVC.pkl']
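
LinearSVC defaults to solving the dual problem; scikit-learn's own guidance is to prefer dual=False when there are more samples than features, which is the case with these 36-dimensional HOG descriptors. A variant worth timing (a sketch, not a confirmed speedup):

parameters = {'C': 10. ** np.arange(1, 5)}
clf = grid_search.GridSearchCV(LinearSVC(dual=False), parameters, n_jobs=-1, verbose=1)
clf.fit(hog_features, labels)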

In [20]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)

# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING LOGISTIC REGRESSION
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    1.5s finished
done in 2.104s
best score: 0.920
LOGISTIC REGRESSION PARAMETERS
	 C: 100.000000
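
The chosen C (100) is the largest value in the grid, so the optimum may lie beyond it; extending the search upward would settle that (a sketch, not run here):

params = {"C": [100.0, 1000.0, 10000.0]}
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)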

In [29]:
clf = LogisticRegression(C=100.0)
clf.fit(hog_features, labels)
joblib.dump(clf, "./pkl/hog/skt/digits_LogisticRegression.pkl", compress=3)


Out[29]:
['./pkl/hog/skt/digits_LogisticRegression.pkl']

In [21]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

# grid-search the RBM's learning rate, number of iterations, and number of
# components, together with C for the logistic regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}

# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)

# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING RBM + LOGISTIC REGRESSION
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 237 out of 243 | elapsed: 11.6min remaining:   17.6s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 12.4min finished
done in 749.621s
best score: 0.840
RBM + LOGISTIC REGRESSION PARAMETERS
	 logistic__C: 100.000000
	 rbm__learning_rate: 0.001000
	 rbm__n_components: 200.000000
	 rbm__n_iter: 20.000000
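
The pipeline's best score (0.840) is well below plain logistic regression on the same features (0.920), so the RBM is not earning its keep here. One suspect: BernoulliRBM models binary or [0, 1] data, and HOG values, while nonnegative, are typically small. Rescaling the features before the RBM is a cheap experiment (a sketch; MinMaxScaler is an assumption, not part of the original pipeline):

from sklearn.preprocessing import MinMaxScaler
classifier = Pipeline([("scale", MinMaxScaler()), ("rbm", rbm), ("logistic", logistic)])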

In [30]:
rbm = BernoulliRBM(n_components=200, n_iter=20, learning_rate=0.001, verbose=True)
logistic = LogisticRegression(C=100.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(hog_features, labels)
joblib.dump(classifier, "./pkl/hog/skt/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)


[BernoulliRBM] Iteration 1, pseudo-likelihood = -7.81, time = 0.32s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -7.36, time = 0.34s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -6.94, time = 0.34s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -7.01, time = 0.34s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -6.89, time = 0.34s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -6.86, time = 0.34s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -6.95, time = 0.35s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -6.64, time = 0.35s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -6.96, time = 0.34s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -6.94, time = 0.35s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -6.85, time = 0.34s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -6.62, time = 0.34s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -6.83, time = 0.34s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -6.97, time = 0.34s
[BernoulliRBM] Iteration 15, pseudo-likelihood = -6.95, time = 0.34s
[BernoulliRBM] Iteration 16, pseudo-likelihood = -6.87, time = 0.34s
[BernoulliRBM] Iteration 17, pseudo-likelihood = -6.83, time = 0.34s
[BernoulliRBM] Iteration 18, pseudo-likelihood = -6.95, time = 0.34s
[BernoulliRBM] Iteration 19, pseudo-likelihood = -6.96, time = 0.35s
[BernoulliRBM] Iteration 20, pseudo-likelihood = -6.86, time = 0.34s
Out[30]:
['./pkl/hog/skt/digits_LogisticRegression_BernoulliRBM.pkl']
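
Summary of the best 3-fold CV scores found above:

    BernoulliNB                          0.713
    RandomForestClassifier               0.9322
    KNeighborsClassifier                 0.9304
    DecisionTreeClassifier               0.8904
    SVC (RBF kernel)                     0.9294
    LinearSVC                            0.9174
    LogisticRegression                   0.920
    BernoulliRBM + LogisticRegression    0.840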