In [23]:
import numpy as np
from skimage.feature import hog
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from time import time
In [2]:
features = joblib.load("./mldata/features_1000.mat")
labels = joblib.load("./mldata/lables_1000.mat")
In [3]:
features = np.array(features, 'int16')
labels = np.array(labels, 'int')
In [4]:
t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"
In [5]:
parameters = {'alpha' : [1, 10, 100, 1000]}
clf = grid_search.GridSearchCV(BernoulliNB(), parameters, n_jobs=-1, verbose=1)
In [6]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [7]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_BernoulliNB.pkl", compress=3)
Out[7]:
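As a quick sanity check, the pickled estimator can be loaded back and asked for a prediction (a minimal sketch; the first training sample is reused only to confirm the round-trip works, not to measure accuracy):

nb = joblib.load("./pkl/hog/skt/digits_BernoulliNB.pkl")
print nb.predict(hog_features[:1])[0], labels[0]  # predicted vs. true label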
In [8]:
parameters = {'min_samples_split' : [2, 10, 15, 20], 'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=1)
In [9]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [10]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_RandomForestClassifier.pkl", compress=3)
Out[10]:
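The fitted forest also exposes feature_importances_, which can hint at which of the 36 HOG bins carry the most signal (a sketch over the best estimator found above):

importances = clf.best_estimator_.feature_importances_
print np.argsort(importances)[::-1][:5]  # indices of the five most informative bins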
In [11]:
parameters = {'n_neighbors' : [5, 10, 15, 20]}
clf = grid_search.GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, verbose=1)
In [12]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [13]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_KNeighborsClassifier.pkl", compress=3)
Out[13]:
In [14]:
parameters = {'min_samples_split' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
clf = grid_search.GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, verbose=1)
In [15]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [16]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_DecisionTreeClassifier.pkl", compress=3)
Out[16]:
In [17]:
parameters = {'C':10. ** np.arange(1,5), 'gamma':2. ** np.arange(-5,-1)}
clf = grid_search.GridSearchCV(SVC(cache_size=1000, kernel="rbf"), parameters, n_jobs=-1, verbose=1)
In [18]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [19]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_SVC.pkl", compress=3)
Out[19]:
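Beyond the single best combination, this (old-style) GridSearchCV keeps a per-candidate summary in grid_scores_, handy for seeing how sensitive the RBF SVM is to C and gamma (a sketch using the attribute as provided by sklearn.grid_search):

for params, mean_score, cv_scores in clf.grid_scores_:
    print "%0.3f (+/-%0.3f) for %s" % (mean_score, cv_scores.std(), params)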
In [26]:
parameters = {'C':10. ** np.arange(1,5)}
clf = grid_search.GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=1)
In [27]:
t0 = time()
clf.fit(hog_features, labels)
print "escape time : ", round(time()-t0, 3), "s"
print "best score is %s" % clf.best_score_
print "best parameter is %s" % clf.best_params_
In [28]:
joblib.dump(clf.best_estimator_, "./pkl/hog/skt/digits_LinearSVC.pkl", compress=3)
Out[28]:
In [20]:
print "SEARCHING LOGISTIC REGRESSION"
params = {"C": [1.0, 10.0, 100.0]}
start = time()
gs = GridSearchCV(LogisticRegression(), params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)
# print diagnostic information to the user and grab the best model
print "done in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])
In [29]:
clf = LogisticRegression(C=100.0)
clf.fit(hog_features, labels)
joblib.dump(clf, "./pkl/hog/skt/digits_LogisticRegression.pkl", compress=3)
Out[29]:
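The scores above are cross-validated on the full set; train_test_split (imported at the top but unused so far) gives an independent held-out estimate (a sketch with an assumed 75/25 split and a fixed random_state for reproducibility):

X_train, X_test, y_train, y_test = train_test_split(hog_features, labels, test_size=0.25, random_state=42)
held_out = LogisticRegression(C=100.0).fit(X_train, y_train)
print "held-out accuracy: %0.3f" % held_out.score(X_test, y_test)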
In [21]:
# initialize the RBM + Logistic Regression pipeline
rbm = BernoulliRBM()
logistic = LogisticRegression()
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
# perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
# C for Logistic Regression
print "SEARCHING RBM + LOGISTIC REGRESSION"
params = {
    "rbm__learning_rate": [0.1, 0.01, 0.001],
    "rbm__n_iter": [20, 40, 80],
    "rbm__n_components": [50, 100, 200],
    "logistic__C": [1.0, 10.0, 100.0]}
# perform a grid search over the parameter
start = time()
gs = GridSearchCV(classifier, params, n_jobs=-1, verbose=1)
gs.fit(hog_features, labels)
# print diagnostic information to the user and grab the best model
print "\ndone in %0.3fs" % (time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "RBM + LOGISTIC REGRESSION PARAMETERS"
bestParams = gs.best_estimator_.get_params()
# loop over the parameters and print each of them out so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])
In [30]:
rbm = BernoulliRBM(n_components=200, n_iter=20, learning_rate=0.001, verbose=True)
logistic = LogisticRegression(C=100.0)
classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])
classifier.fit(hog_features, labels)
joblib.dump(classifier, "./pkl/hog/skt/digits_LogisticRegression_BernoulliRBM.pkl", compress=3)
Out[30]:
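To classify a new image with the saved pipeline, the image must pass through the same HOG step used during training (a sketch; the first training image stands in for a fresh 28x28 digit):

pipe = joblib.load("./pkl/hog/skt/digits_LogisticRegression_BernoulliRBM.pkl")
fd = hog(features[0].reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
print pipe.predict(fd.reshape(1, -1))  # predicted digit label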