In [12]:
from sklearn.model_selection import train_test_split
from MLPipeline.IO.config_parser import parse_configfile
from MLPipeline.IO.collect_classes import get_two_classes
from MLPipeline.gridsearch._tools import build_parameter_grid, grid_search
from sklearn.pipeline import Pipeline
import time
In [13]:
# Load the data and the labels
X, y = get_two_classes('../data/LSST/nonlens/*', '../data/LSST/lens/*')
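In [ ]:
# A quick sanity check, as a sketch: this assumes get_two_classes returns parallel
# sequences of images and 0/1 labels (its exact return types live in
# MLPipeline.IO.collect_classes)
from collections import Counter
print(len(X), len(y))
print(Counter(y))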
In [27]:
# Split the data and labels into training and testing subsets, taking the first 100
# examples (non-lens) and the last 100 (lens); + here concatenates the two slices
X_train, X_test, y_train, y_test = train_test_split(X[:100] + X[-100:], y[:100] + y[-100:], test_size=0.2)
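In [ ]:
# Verify the 80/20 split and the class balance in each subset (a sketch; with
# test_size=0.2 on 200 examples this should give 160 train / 40 test)
from collections import Counter
print(len(X_train), len(X_test))
print('train labels:', Counter(y_train))
print('test labels:', Counter(y_test))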
In [16]:
# Build the estimators and feed into the pipeline
from sklearn.linear_model import LogisticRegression
from MLPipeline.features.feature_extraction import HOG
feature_selection = [('hog', HOG())]
classifier = [('logistic_regression', LogisticRegression())]
estimators = feature_selection + classifier
pipeline = Pipeline(estimators)
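In [ ]:
# For reference, a minimal sketch of what a pipeline-compatible HOG transformer
# can look like. The real class is MLPipeline.features.feature_extraction.HOG;
# this hypothetical HOGSketch wraps skimage.feature.hog and exposes the same
# parameter names used in the grid below.
import numpy as np
from skimage.feature import hog
from sklearn.base import BaseEstimator, TransformerMixin

class HOGSketch(BaseEstimator, TransformerMixin):
    def __init__(self, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(3, 3)):
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block

    def fit(self, X, y=None):
        # HOG is stateless: nothing is learned from the training data
        return self

    def transform(self, X):
        # One HOG feature vector per 2-D image
        return np.array([hog(img,
                             orientations=self.orientations,
                             pixels_per_cell=self.pixels_per_cell,
                             cells_per_block=self.cells_per_block)
                         for img in X])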
In [20]:
# Create the grid search with list of parameters
# to search
param_grid = [{'hog__orientations': (4, 5),
               'hog__pixels_per_cell': ((4, 4),),
               'hog__cells_per_block': ((2, 2),),
               'logistic_regression__C': (1., 10.),
               }]
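In [ ]:
# ParameterGrid enumerates the candidates GridSearchCV will try; this grid
# expands to 2 orientations x 1 x 1 x 2 C values = 4 combinations
from sklearn.model_selection import ParameterGrid
print(len(list(ParameterGrid(param_grid))))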
In [23]:
# Compose a grid search from the estimators in the pipeline and the corresponding grid parameters
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, n_jobs=-1)
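In [ ]:
# Each grid key must match one of the pipeline's '<step>__<parameter>' names
# exactly; listing them is a quick way to catch a misspelled key
print(sorted(pipeline.get_params().keys()))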
In [29]:
# Run the grid search on the training set
start_time = time.time()
grid_search.fit(X_train,y_train)
print "Time Taken: ", time.time() - start_time
In [30]:
# Take a look at the best score
grid_search.best_score_
Out[30]:
In [33]:
# Take a look at the parameters that output the best score
grid_search.best_params_
Out[33]:
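In [ ]:
# Per-candidate scores from the search (cv_results_ is the model_selection API;
# the older sklearn.grid_search version exposed grid_scores_ instead)
import pandas as pd
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(pd.DataFrame(grid_search.cv_results_)[cols])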
In [44]:
# Short helper to build a confusion matrix for predicted scores
def confusion_matrix(predicted, actual):
    '''
    Returns counts of (what the model thinks it is, what it actually is) pairs.
    '''
    import numpy as np
    from collections import Counter
    predicted, actual = map(np.array, [predicted, actual])
    return Counter(zip(predicted, actual))
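In [ ]:
# The same information via sklearn's built-in, which returns a 2-D array with
# rows = actual class and columns = predicted class (aliased here to avoid
# shadowing the helper above)
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
print(sk_confusion_matrix(y_test, grid_search.predict(X_test)))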
In [47]:
# Show score for training set and test set using best parameters
print('Train score:', grid_search.score(X_train, y_train))
print('Test score:', grid_search.score(X_test, y_test))
In [48]:
# Show confusion matrices for the training and test sets using the best parameters
print('train matrix', confusion_matrix(grid_search.predict(X_train), y_train))
print('test matrix', confusion_matrix(grid_search.predict(X_test), y_test))
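In [ ]:
# Per-class precision and recall on the held-out set, via sklearn's built-in report
from sklearn.metrics import classification_report
print(classification_report(y_test, grid_search.predict(X_test)))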