In [12]:
import sys
from sklearn.cross_validation import train_test_split
from MLPipeline.IO.config_parser import parse_configfile
from MLPipeline.IO.collect_classes import get_two_classes
from MLPipeline.gridsearch._tools import build_parameter_grid, grid_search
from sklearn.pipeline import Pipeline
import time

In [13]:
# Load the data and the labels
# get_two_classes globs the two directories; presumably the first pattern maps to
# class 0 (non-lens) and the second to class 1 (lens) -- TODO confirm against
# MLPipeline.IO.collect_classes (the L17 slicing comment relies on this ordering).
X, y = get_two_classes('../data/LSST/nonlens/*', '../data/LSST/lens/*')

In [27]:
# Split the dataset and labels into training and testing subsets (collect first 100,
# which are non-lens, and last hundred, which are lens, for a balanced 200-sample subset).
# random_state pins the shuffle so the split -- and every score computed below -- is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X[:100] + X[-100:], y[:100] + y[-100:], test_size=0.2, random_state=42)

In [16]:
# Build the estimators and feed into the pipeline
from sklearn.linear_model import LogisticRegression
from MLPipeline.features.feature_extraction import HOG

# Two pipeline stages: HOG feature extraction, then a logistic-regression classifier.
feature_selection = [('hog', HOG())]
classifier = [('logistic_regression', LogisticRegression())]

# Steps execute in list order: features first, classifier last.
estimators = list(feature_selection)
estimators.extend(classifier)

pipeline = Pipeline(estimators)

In [20]:
# Parameter grid for the search: every combination of these values is tried.
# Keys follow the Pipeline '<step_name>__<param_name>' naming convention.
param_grid = [
    {
        'hog__orientations': (4, 5),
        'hog__pixels_per_cell': ((4, 4),),
        'hog__cells_per_block': ((2, 2),),
        'logistic_regression__C': (1.0, 10.0),
    },
]

In [23]:
# Compose a grid search from the estimators in the pipeline and the corresponding grid parameters
# NOTE(review): sklearn.grid_search is the legacy module path; modern scikit-learn
# provides GridSearchCV from sklearn.model_selection -- confirm the pinned sklearn version.
from sklearn.grid_search import GridSearchCV
# NOTE(review): this assignment shadows the `grid_search` function imported from
# MLPipeline.gridsearch._tools in the first cell, making that helper unusable afterwards.
# n_jobs=-1 parallelizes the candidate fits across all available cores.
grid_search = GridSearchCV(pipeline, param_grid,
                               n_jobs = -1)

In [29]:
# Run the grid search on the training set
# (fits every parameter combination in param_grid with cross-validation;
# presumably the sklearn default fold count -- confirm for this version)
start_time = time.time()
grid_search.fit(X_train,y_train)
# Python 2 print statement; reports wall-clock fit time in seconds
print "Time Taken: ", time.time() - start_time


Time Taken:  19.7791969776

In [30]:
# Take a look at the best score: the mean cross-validated score of the
# best parameter combination found by the search (bare expression so the
# notebook displays it as the cell output)
grid_search.best_score_


Out[30]:
0.60624999999999996

In [33]:
# Take a look at the parameters that output the best score
# (dict keyed by the '<step>__<param>' pipeline parameter names used in param_grid)
grid_search.best_params_


Out[33]:
{'hog__cells_per_block': (2, 2),
 'hog__orientations': 4,
 'hog__pixels_per_cell': (4, 4),
 'logistic_regression__C': 1.0}

In [44]:
# Short function to tally the confusion matrix for predicted scores
def confusion_matrix(predicted, actual):
    '''Tally (predicted, actual) label pairs.

    Parameters
    ----------
    predicted : iterable of labels produced by the model
    actual : iterable of ground-truth labels, same length as `predicted`

    Returns
    -------
    collections.Counter mapping (predicted_label, actual_label) -> count,
    i.e. keys read as (what the model thinks it is, what it actually is).

    Raises
    ------
    ValueError if the two inputs differ in length -- zip would otherwise
    silently truncate to the shorter input and miscount.
    '''
    from collections import Counter  # local import keeps the notebook cell self-contained
    predicted, actual = list(predicted), list(actual)
    if len(predicted) != len(actual):
        raise ValueError('predicted and actual must have the same length')
    # The numpy round-trip in the original added nothing: plain label tuples
    # hash and count identically under Counter.
    return Counter(zip(predicted, actual))

In [47]:
# Show score for training set and test set using best parameters
# (the gap between the two -- 0.92 train vs 0.60 test in the run below --
# indicates the model is overfitting the 160-sample training subset)
print 'Train score:', grid_search.score(X_train, y_train)
print 'Test score:', grid_search.score(X_test, y_test)


Train score: 0.91875
Test score: 0.6

In [48]:
# Show confusion matrices for the train and test sets using best parameters;
# Counter keys read as (predicted label, actual label) -> count
print 'train matrix', confusion_matrix(grid_search.predict(X_train), y_train)
print 'test matrix', confusion_matrix(grid_search.predict(X_test), y_test)


train matrix Counter({(0, 0): 77, (1, 1): 70, (0, 1): 10, (1, 0): 3})
test matrix Counter({(1, 1): 13, (0, 0): 11, (1, 0): 9, (0, 1): 7})

In [ ]: