In [1]:
import numpy as np
from skimage.feature import hog
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn import datasets
from time import time
In [2]:
dataset = datasets.fetch_mldata("MNIST Original", data_home=".")
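Note: mldata.org, the repository behind fetch_mldata, has since been shut down, so this call fails on newer scikit-learn versions. A rough equivalent on scikit-learn 0.20+ uses fetch_openml (the return types differ slightly, so treat this as a sketch):

from sklearn.datasets import fetch_openml
dataset = fetch_openml("mnist_784", version=1)  # .data is 70000x784, .target holds the digit labels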
In [3]:
features = np.array(dataset.data, 'int16')
labels = np.array(dataset.target, 'int')
In [4]:
# Find the samples labeled 9 and drop them
f = []
l = []
for i, label in enumerate(labels):
    if label != 9:
        f.append(features[i])
        l.append(labels[i])
features = np.array(f, 'int16')
labels = np.array(l, 'int')
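As an aside, the same filtering can be written as a NumPy boolean mask, which avoids the Python-level loop entirely (an equivalent sketch):

mask = labels != 9          # True for every sample to keep
features = features[mask]
labels = labels[mask]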
In [5]:
t0 = time()
def scale(X, eps=0.001):
    # scale the data points so that the columns of the feature space
    # (i.e. the predictors) fall within the range [0, 1]
    return (X - np.min(X, axis=0)) / (np.max(X, axis=0) + eps)
features = features.astype("float32")
features = scale(features)
print "elapsed time : ", round(time()-t0, 3), "s"
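Note that the denominator is max + eps rather than (max - min) + eps; for non-negative pixel data this still maps every column into [0, 1), and eps guards against division by zero on all-zero columns. A quick sanity check (a sketch):

X = np.array([[0., 10.], [5., 20.]], dtype="float32")
print scale(X)                        # each column mapped into [0, 1)
print scale(X).min(), scale(X).max()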
In [15]:
parameters = {'alpha' : [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)
In [16]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [17]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_BernoulliNB.pkl", compress=3)
Out[17]:
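A saved model can be reloaded later with joblib.load and scored. A minimal sketch, assuming a held-out split (note that this notebook fits on the full dataset, so a true hold-out evaluation would require splitting before fitting):

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in 0.18+
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
loaded = joblib.load("./pkl/scale/mnist/digits_BernoulliNB.pkl")
print "held-out accuracy: %0.3f" % loaded.score(X_test, y_test)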
In [18]:
parameters = {'min_samples_split' : [2, 10, 15, 20], 'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)
In [19]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [20]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_RandomForestClassifier.pkl", compress=3)
Out[20]:
In [21]:
parameters = {'n_neighbors' : [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)
In [22]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [23]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_KNeighborsClassifier.pkl", compress=3)
Out[23]:
In [24]:
parameters = {'min_samples_split' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)
In [25]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [26]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_DecisionTreeClassifier.pkl", compress=3)
Out[26]:
In [27]:
parameters = {'C':10. ** np.arange(1,5), 'gamma':2. ** np.arange(-5,-1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)
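This grid expands to C in {10, 100, 1000, 10000} and gamma in {2**-5, ..., 2**-2} = {0.03125, 0.0625, 0.125, 0.25}: 16 candidates, each fitted once per fold (3-fold by default in this scikit-learn version, so 48 fits in total).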
In [29]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [30]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_SVC.pkl", compress=3)
Out[30]:
In [6]:
parameters = {'C':10. ** np.arange(1,5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)
In [7]:
t0 = time()
clf.fit(features, labels)
print "elapsed time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [8]:
joblib.dump(clf.best_estimator_, "./pkl/scale/mnist/digits_LinearSVC.pkl", compress=3)
Out[8]:
In [9]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(features, labels)
# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])
In [11]:
clf = LogisticRegression(C = 1.0)
clf.fit(features, labels)
joblib.dump(clf, "./pkl/scale/mnist/digits_LogisticRegression.pkl", compress=3)
Out[11]:
In [12]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
# perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
# C for Logistic Regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}
# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(features, labels)
# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])
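The rbm__* and logistic__* keys follow scikit-learn's step__parameter convention, which lets GridSearchCV address parameters of individual Pipeline steps; for example, classifier.set_params(rbm__n_components=100) targets the "rbm" step directly.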
In [14]:
rbm = BernoulliRBM(n_components = 200, n_iter = 20, learning_rate = 0.01, verbose = True)
logistic = LogisticRegression(C = 10.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(features, labels)
joblib.dump(classifier, "./pkl/scale/mnist/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)
Out[14]:
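As a quick usage check, the fitted pipeline runs the full chain (RBM feature extraction, then logistic regression) on scaled digit vectors. A minimal sketch on the first few training samples:

print "predicted:", classifier.predict(features[:10])
print "actual:   ", labels[:10]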