In [1]:
import numpy as np
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from time import time

In [8]:
features = joblib.load("./mldata/features_1000.mat")
labels = joblib.load("./mldata/lables_1000.mat")

In [13]:
features = np.array(features, 'int16')
labels = np.array(labels, 'int')
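A quick sanity check on the loaded arrays catches shape or dtype surprises before any training starts; a minimal sketch, assuming one row per sample:

# sanity-check the loaded data (sketch: assumes one row per sample)
print "features:", features.shape, features.dtype
print "labels:  ", labels.shape, labels.dtype
assert features.shape[0] == labels.shape[0], "feature/label count mismatch"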

In [10]:
t0 = time()
def scale(X, eps=0.001):
    # min-max scale so that each column of the feature space
    # (i.e. each predictor) falls within the range [0, 1];
    # eps guards against division by zero for constant columns
    return (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0) + eps)

features = features.astype("float32")
features = scale(features)

print "escape time : ", round(time()-t0, 3), "s"


elapsed time :  0.048 s
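Since everything downstream assumes the predictors sit in [0, 1], it is cheap to verify the scaling before fitting anything; a minimal sketch:

# verify the scaled feature matrix lies (approximately) in [0, 1]
print "value range: [%.4f, %.4f]" % (features.min(), features.max())
assert features.min() >= 0.0 and features.max() <= 1.0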

In [11]:
parameters = {'alpha': [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)

In [12]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    1.1s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.6s finished
elapsed time :  1.89 s
best score is 0.9064
best parameter is {'alpha': 1}
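`best_score_` only reports the winner; the `grid_scores_` attribute of the (since-deprecated) `sklearn.grid_search` API used throughout this notebook exposes every candidate, which helps judge how sensitive the model is to `alpha`. A minimal sketch:

# inspect the full grid, not just the winning candidate
for params, mean_score, cv_scores in clf.grid_scores_:
    print "%0.4f (+/- %0.4f) for %s" % (mean_score, cv_scores.std(), params)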

In [8]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_BernoulliNB.pkl", compress=3)


Out[8]:
['./pkl/scale/skt/digits_BernoulliNB.pkl']
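The pickles written here are only useful if they round-trip, so it is worth reloading one and spot-checking a prediction; a minimal sketch:

# reload the pickled model and spot-check it against known labels
nb = joblib.load("./pkl/scale/skt/digits_BernoulliNB.pkl")
print "predicted:", nb.predict(features[:5])
print "actual:   ", labels[:5]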

In [9]:
parameters = {'min_samples_split': [2, 10, 15, 20], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)

In [10]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 114 out of 120 | elapsed:   19.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   20.6s finished
elapsed time :  21.818 s
best score is 0.936
best parameters are {'min_samples_split': 20, 'n_estimators': 80}

In [11]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_RandomForestClassifier.pkl", compress=3)


Out[11]:
['./pkl/scale/skt/digits_RandomForestClassifier.pkl']

In [12]:
parameters = {'n_neighbors': [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)

In [13]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   17.0s finished
elapsed time :  17.32 s
best score is 0.9322
best parameter is {'n_neighbors': 5}

In [14]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_KNeighborsClassifier.pkl", compress=3)


Out[14]:
['./pkl/scale/skt/digits_KNeighborsClassifier.pkl']

In [15]:
parameters = {'min_samples_split': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)

In [16]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  24 out of  30 | elapsed:    3.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.9s finished
elapsed time :  4.575 s
best score is 0.9156
best parameter is {'min_samples_split': 50}
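Note that the winning `min_samples_split` sits on the upper edge of the searched grid (the same is true of the random-forest grid above, where 20 was the largest value tried), which suggests the optimum may lie beyond it; a sketch of a widened follow-up search:

# the optimum landed on the grid boundary, so widen the range and re-search
parameters = {'min_samples_split': [50, 75, 100, 150, 200]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)
clf.fit(features, labels)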

In [17]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_DecisionTreeClassifier.pkl", compress=3)


Out[17]:
['./pkl/scale/skt/digits_DecisionTreeClassifier.pkl']

In [22]:
parameters = {'C': 10. ** np.arange(1, 5), 'gamma': 2. ** np.arange(-5, -1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)

In [23]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed:  2.5min remaining:   21.4s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.0min finished
elapsed time :  184.668 s
best score is 0.9294
best parameters are {'C': 10.0, 'gamma': 0.03125}
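The grid-search score is a cross-validation estimate computed on the full dataset; an independent held-out split gives a less optimistic check of the chosen C and gamma. A minimal sketch using the `train_test_split` already imported above (split size and seed are illustrative):

# evaluate the winning SVC parameters on a held-out 25% split
trainX, testX, trainY, testY = train_test_split(features, labels,
                                                test_size=0.25, random_state=42)
best = SVC(cache_size=1000, kernel="rbf", C=10.0, gamma=0.03125)
best.fit(trainX, trainY)
print "held-out accuracy: %0.4f" % best.score(testX, testY)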

In [24]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_SVC.pkl", compress=3)


Out[24]:
['./pkl/scale/skt/digits_SVC.pkl']

In [5]:
parameters = {'C': 10. ** np.arange(1, 5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)

In [6]:
t0 = time()
clf.fit(features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:   16.7s remaining:   16.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   28.2s finished
elapsed time :  36.171 s
best score is 0.9286
best parameter is {'C': 10.0}

In [7]:
joblib.dump(clf.best_estimator_, "./pkl/scale/skt/digits_LinearSVC.pkl", compress=3)


Out[7]:
['./pkl/scale/skt/digits_LinearSVC.pkl']

In [8]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(features, labels)

# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING LOGISTIC REGRESSION
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Done   1 out of   9 | elapsed:   11.3s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   38.1s finished
done in 45.699s
best score: 0.934
LOGISTIC REGRESSION PARAMETERS
	 C: 1.000000

In [9]:
clf = LogisticRegression(C=1.0)
clf.fit(features, labels)
joblib.dump(clf, "./pkl/scale/skt/digits_LogisticRegression.pkl", compress=3)


Out[9]:
['./pkl/scale/skt/digits_LogisticRegression.pkl']

In [10]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

# perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
# C for Logistic Regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}

# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(features, labels)

# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])


SEARCHING RBM + LOGISTIC REGRESSION
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 237 out of 243 | elapsed: 45.4min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 48.1min finished
done in 2972.549s
best score: 0.931
RBM + LOGISTIC REGRESSION PARAMETERS
	 logistic__C: 10.000000
	 rbm__learning_rate: 0.001000
	 rbm__n_components: 200.000000
	 rbm__n_iter: 80.000000
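The `step__param` names in the grid follow scikit-learn's pipeline convention: the text before the double underscore selects the pipeline step, the text after it names a parameter on that step's estimator. The same convention works with `set_params`, so the winners can be applied to the existing pipeline directly; a minimal sketch:

# apply the winning parameters to the pipeline via the step__param convention
classifier.set_params(rbm__learning_rate=0.001, rbm__n_iter=80,
                      rbm__n_components=200, logistic__C=10.0)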

In [11]:
rbm = BernoulliRBM(n_components=200, n_iter=80, learning_rate=0.001, verbose=True)
logistic = LogisticRegression(C=10.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(features, labels)
joblib.dump(classifier, "./pkl/scale/skt/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)


[BernoulliRBM] Iteration 1, pseudo-likelihood = -417.94, time = 1.05s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -377.82, time = 1.36s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -338.93, time = 1.37s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -310.25, time = 1.38s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -276.89, time = 1.35s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -264.55, time = 1.41s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -247.88, time = 1.38s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -228.54, time = 1.43s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -227.45, time = 1.39s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -218.69, time = 1.43s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -211.19, time = 1.46s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -200.61, time = 1.46s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -200.05, time = 1.38s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -193.19, time = 1.38s
[BernoulliRBM] Iteration 15, pseudo-likelihood = -190.02, time = 1.37s
[BernoulliRBM] Iteration 16, pseudo-likelihood = -193.10, time = 1.44s
[BernoulliRBM] Iteration 17, pseudo-likelihood = -186.28, time = 1.37s
[BernoulliRBM] Iteration 18, pseudo-likelihood = -184.43, time = 1.37s
[BernoulliRBM] Iteration 19, pseudo-likelihood = -183.86, time = 1.37s
[BernoulliRBM] Iteration 20, pseudo-likelihood = -188.20, time = 1.44s
[BernoulliRBM] Iteration 21, pseudo-likelihood = -182.01, time = 1.43s
[BernoulliRBM] Iteration 22, pseudo-likelihood = -177.20, time = 1.44s
[BernoulliRBM] Iteration 23, pseudo-likelihood = -176.59, time = 1.44s
[BernoulliRBM] Iteration 24, pseudo-likelihood = -166.86, time = 1.43s
[BernoulliRBM] Iteration 25, pseudo-likelihood = -174.05, time = 1.44s
[BernoulliRBM] Iteration 26, pseudo-likelihood = -171.33, time = 1.37s
[BernoulliRBM] Iteration 27, pseudo-likelihood = -173.45, time = 1.37s
[BernoulliRBM] Iteration 28, pseudo-likelihood = -168.41, time = 1.44s
[BernoulliRBM] Iteration 29, pseudo-likelihood = -174.86, time = 1.37s
[BernoulliRBM] Iteration 30, pseudo-likelihood = -167.01, time = 1.37s
[BernoulliRBM] Iteration 31, pseudo-likelihood = -159.57, time = 1.44s
[BernoulliRBM] Iteration 32, pseudo-likelihood = -165.59, time = 1.43s
[BernoulliRBM] Iteration 33, pseudo-likelihood = -162.88, time = 1.42s
[BernoulliRBM] Iteration 34, pseudo-likelihood = -158.61, time = 1.37s
[BernoulliRBM] Iteration 35, pseudo-likelihood = -158.25, time = 1.42s
[BernoulliRBM] Iteration 36, pseudo-likelihood = -162.04, time = 1.37s
[BernoulliRBM] Iteration 37, pseudo-likelihood = -159.94, time = 1.37s
[BernoulliRBM] Iteration 38, pseudo-likelihood = -159.02, time = 1.37s
[BernoulliRBM] Iteration 39, pseudo-likelihood = -162.79, time = 1.37s
[BernoulliRBM] Iteration 40, pseudo-likelihood = -150.59, time = 1.41s
[BernoulliRBM] Iteration 41, pseudo-likelihood = -150.58, time = 1.38s
[BernoulliRBM] Iteration 42, pseudo-likelihood = -156.68, time = 1.36s
[BernoulliRBM] Iteration 43, pseudo-likelihood = -156.14, time = 1.42s
[BernoulliRBM] Iteration 44, pseudo-likelihood = -152.29, time = 1.37s
[BernoulliRBM] Iteration 45, pseudo-likelihood = -153.23, time = 1.37s
[BernoulliRBM] Iteration 46, pseudo-likelihood = -146.05, time = 1.43s
[BernoulliRBM] Iteration 47, pseudo-likelihood = -151.96, time = 1.42s
[BernoulliRBM] Iteration 48, pseudo-likelihood = -147.78, time = 1.38s
[BernoulliRBM] Iteration 49, pseudo-likelihood = -155.29, time = 1.36s
[BernoulliRBM] Iteration 50, pseudo-likelihood = -151.02, time = 1.37s
[BernoulliRBM] Iteration 51, pseudo-likelihood = -151.62, time = 1.41s
[BernoulliRBM] Iteration 52, pseudo-likelihood = -152.07, time = 1.38s
[BernoulliRBM] Iteration 53, pseudo-likelihood = -144.10, time = 1.43s
[BernoulliRBM] Iteration 54, pseudo-likelihood = -150.93, time = 1.38s
[BernoulliRBM] Iteration 55, pseudo-likelihood = -151.12, time = 1.43s
[BernoulliRBM] Iteration 56, pseudo-likelihood = -145.38, time = 1.38s
[BernoulliRBM] Iteration 57, pseudo-likelihood = -143.98, time = 1.38s
[BernoulliRBM] Iteration 58, pseudo-likelihood = -146.27, time = 1.36s
[BernoulliRBM] Iteration 59, pseudo-likelihood = -140.59, time = 1.41s
[BernoulliRBM] Iteration 60, pseudo-likelihood = -146.46, time = 1.37s
[BernoulliRBM] Iteration 61, pseudo-likelihood = -143.89, time = 1.37s
[BernoulliRBM] Iteration 62, pseudo-likelihood = -139.94, time = 1.36s
[BernoulliRBM] Iteration 63, pseudo-likelihood = -143.03, time = 1.37s
[BernoulliRBM] Iteration 64, pseudo-likelihood = -146.00, time = 1.36s
[BernoulliRBM] Iteration 65, pseudo-likelihood = -147.95, time = 1.37s
[BernoulliRBM] Iteration 66, pseudo-likelihood = -141.58, time = 1.36s
[BernoulliRBM] Iteration 67, pseudo-likelihood = -146.40, time = 1.36s
[BernoulliRBM] Iteration 68, pseudo-likelihood = -137.26, time = 1.37s
[BernoulliRBM] Iteration 69, pseudo-likelihood = -136.86, time = 1.36s
[BernoulliRBM] Iteration 70, pseudo-likelihood = -138.60, time = 1.37s
[BernoulliRBM] Iteration 71, pseudo-likelihood = -133.55, time = 1.37s
[BernoulliRBM] Iteration 72, pseudo-likelihood = -136.69, time = 1.38s
[BernoulliRBM] Iteration 73, pseudo-likelihood = -139.28, time = 1.37s
[BernoulliRBM] Iteration 74, pseudo-likelihood = -140.79, time = 1.38s
[BernoulliRBM] Iteration 75, pseudo-likelihood = -139.05, time = 1.37s
[BernoulliRBM] Iteration 76, pseudo-likelihood = -140.91, time = 1.36s
[BernoulliRBM] Iteration 77, pseudo-likelihood = -136.29, time = 1.37s
[BernoulliRBM] Iteration 78, pseudo-likelihood = -135.48, time = 1.36s
[BernoulliRBM] Iteration 79, pseudo-likelihood = -139.56, time = 1.37s
[BernoulliRBM] Iteration 80, pseudo-likelihood = -128.30, time = 1.36s
Out[11]:
['./pkl/scale/skt/digits_LogisticRegression_BernoulliRBM.pkl']
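With every model pickled under the same naming scheme, a final pass can reload and compare them side by side; a minimal sketch (it scores on the training data, so treat the numbers as optimistic):

# reload each saved model and score it on the same data for comparison
names = ["BernoulliNB", "RandomForestClassifier", "KNeighborsClassifier",
         "DecisionTreeClassifier", "SVC", "LinearSVC",
         "LogisticRegression", "LogisticRegression_BernoulliRBM"]
for name in names:
    model = joblib.load("./pkl/scale/skt/digits_%s.pkl" % name)
    print "%s: %0.4f" % (name, model.score(features, labels))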