In [1]:
import numpy as np
import pickle
from time import time
from sklearn import grid_search
from sklearn.ensemble import RandomForestClassifier

# load data
im_data = pickle.load(open("extracted_features.p", "rb"))
X = im_data[0]                   # extracted features
Y = im_data[1]                   # target labels
num_classes = len(np.unique(Y))  # number of classes
nc = X.shape[1]                  # number of features
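# Assumed layout of "extracted_features.p" (inferred from the indexing above):
# a two-element sequence where im_data[0] is an (n_samples, 23) feature matrix
# and im_data[1] is the matching (n_samples,) vector of class labels.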

# optimize max_features (features considered per split) via 10-fold cross-validation;
# sqrt(nc), nc/3, and nc/1.5 are common heuristics for max_features, while
# n_estimators, compute_importances, and n_jobs are held fixed
parameters = {'max_features': map(int, [np.ceil(np.sqrt(nc)), np.ceil(nc/3.0), np.ceil(nc/1.5)]),
              'n_estimators': [50], 'compute_importances': [True], 'n_jobs': [1]}
rf_opt = grid_search.GridSearchCV(RandomForestClassifier(), parameters,
                                  scoring='accuracy', n_jobs=1, cv=10)
before_GS = time()
rf_opt.fit(X, Y)
after_GS = time()
best_rf = rf_opt.best_estimator_
pickle.dump(best_rf, open("trained_classifier.p", "wb"))  # save the trained classifier
print("Optimized, trained classifier has been pickled under the name 'trained_classifier.p'\n")

# ------------------- Report results -------------------
class FeaturePriority(object):
    """Pairs each feature name with its rounded importance, sorted descending."""
    def __init__(self, priorities, dec=4):
        self.feature_names = ['pixel count','avg red','avg green','avg blue','avg lum',
                              'median lum','std lum','median red','median green','median blue',
                              'std red','std green','std blue','avg lum v-edges','median lum v-edges',
                              'std lum v-edges','avg lum h-edges','median lum h-edges','std h-edges',
                              '>thresh h-edges','>thresh v-edges','aspect ratio','image peaks']
        self.feature_priorities = [np.round(p, decimals=dec) for p in priorities]
        self.sorted_feats = sorted(zip(self.feature_names, self.feature_priorities),
                                   key=lambda x: x[1], reverse=True)

    def __str__(self):
        outstr = "Features (most to least important): \n"
        for (name, importance) in self.sorted_feats:
            outstr += "\t" + name.ljust(25) + "relative importance: " + str(importance) + "\n"
        return outstr
        
print("Time to run grid search: {0:.3f} sec.  Average of {1:.4f} sec per classifier fit/predict cycle (per parameter combo per CV-fold)\n"\
      .format(after_GS-before_GS, (after_GS-before_GS)/(len(rf_opt.grid_scores_)*rf_opt.cv)))
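# Random guessing over num_classes (assumed roughly balanced) classes yields
# 1/num_classes accuracy, so best_score_*num_classes is the improvement factor
# over chance reported below.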
print("Best score: {0:.2f}% accuracy, vs. random guessing at: {1:.2f}%, for a factor of {2:.1f}X improvement." \
      .format(100*rf_opt.best_score_, 100.0/num_classes, rf_opt.best_score_*num_classes))
print("Best Parameters:" + str(rf_opt.best_params_) + "\n")

feats = FeaturePriority(best_rf.feature_importances_)
print(feats)
print("Grid Search Scores:")
for score in rf_opt.grid_scores_:
    print(score)

# ------------------- Output -------------------

Optimized, trained classifier has been pickled under the name 'trained_classifier.p'

Time to run grid search: 149.923 sec.  Average of 4.9974 sec per classifier fit/predict cycle (per parameter combo per CV-fold)

Best score: 31.08% accuracy, vs. random guessing at: 2.00%, for a factor of 15.5X improvement.
Best Parameters:{'compute_importances': True, 'max_features': 16, 'n_jobs': 1, 'n_estimators': 50}

Features (most to least important): 
	aspect ratio             relative importance: 0.154
	pixel count              relative importance: 0.0852
	image peaks              relative importance: 0.044
	std red                  relative importance: 0.0432
	std lum v-edges          relative importance: 0.0414
	>thresh v-edges          relative importance: 0.0407
	std blue                 relative importance: 0.0397
	std h-edges              relative importance: 0.0386
	>thresh h-edges          relative importance: 0.0386
	avg red                  relative importance: 0.0384
	median lum v-edges       relative importance: 0.0384
	std green                relative importance: 0.0371
	avg lum v-edges          relative importance: 0.0371
	avg blue                 relative importance: 0.0357
	avg green                relative importance: 0.0353
	avg lum h-edges          relative importance: 0.0345
	median red               relative importance: 0.0336
	median lum h-edges       relative importance: 0.0336
	median blue              relative importance: 0.032
	median green             relative importance: 0.0317
	std lum                  relative importance: 0.031
	median lum               relative importance: 0.0287
	avg lum                  relative importance: 0.0274

Grid Search Scores:
mean: 0.29760, std: 0.01536, params: {'compute_importances': True, 'max_features': 5, 'n_jobs': 1, 'n_estimators': 50}
mean: 0.30561, std: 0.01219, params: {'compute_importances': True, 'max_features': 8, 'n_jobs': 1, 'n_estimators': 50}
mean: 0.31079, std: 0.01305, params: {'compute_importances': True, 'max_features': 16, 'n_jobs': 1, 'n_estimators': 50}

In [ ]: