In [1]:
%matplotlib inline

In [2]:
# Performs an exhaustive search through a parameter grid using
# scikit-learn's built-in methods. Also walks through using randomized
# optimization as an alternative to brute-force search.

In [3]:
from sklearn.datasets import make_classification

In [4]:
X, y = make_classification(1000, n_features=5)
from sklearn.linear_model import LogisticRegression

In [5]:
lr = LogisticRegression(class_weight='auto')

In [6]:
# Specify the parameters that we want to search. For GridSearchCV,
# we can simply list the values we care about, but for
# RandomizedSearchCV, we need to specify a distribution over the
# same space from which to sample.

In [7]:
lr.fit(X, y)


Out[7]:
LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [8]:
grid_search_params = {'penalty': ['l1', 'l2'],
                      'C': [1,2,3,4]}

In [9]:
import scipy.stats as st
import numpy as np

In [10]:
random_search_params = {'penalty': ['l1', 'l2'],
                        'C': st.randint(1,4)}
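
In [ ]:
# A quick sanity check (not part of the original run): scipy's randint
# excludes the upper bound, so randint(1, 4) only draws C values from
# {1, 2, 3} -- use randint(1, 5) if C=4 should be reachable, as it is
# in the grid above.
random_search_params['C'].rvs(10)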

In [11]:
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

In [12]:
gs = GridSearchCV(lr, grid_search_params)

In [13]:
gs.fit(X, y)


Out[13]:
GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [14]:
# We want to find the optimal set of parameters. We can also look
# at the marginal performance of the grid search (sketched after
# the scores below).

In [15]:
gs.grid_scores_


Out[15]:
[mean: 0.95600, std: 0.00148, params: {'penalty': 'l1', 'C': 1},
 mean: 0.95600, std: 0.00148, params: {'penalty': 'l2', 'C': 1},
 mean: 0.95700, std: 0.00136, params: {'penalty': 'l1', 'C': 2},
 mean: 0.95600, std: 0.00148, params: {'penalty': 'l2', 'C': 2},
 mean: 0.95700, std: 0.00136, params: {'penalty': 'l1', 'C': 3},
 mean: 0.95600, std: 0.00148, params: {'penalty': 'l2', 'C': 3},
 mean: 0.95700, std: 0.00136, params: {'penalty': 'l1', 'C': 4},
 mean: 0.95600, std: 0.00148, params: {'penalty': 'l2', 'C': 4}]
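
In [ ]:
# A rough sketch of the marginal performance mentioned above (an
# assumed aggregation, not part of the original run): average the
# mean validation scores for each penalty across all values of C.
# Each grid_scores_ entry is (parameters, mean score, fold scores).
for penalty in ['l1', 'l2']:
    scores = [s[1] for s in gs.grid_scores_ if s[0]['penalty'] == penalty]
    print('%s: %.5f' % (penalty, np.mean(scores)))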

In [16]:
# inspect the mean validation score of a single grid_scores_ entry:
gs.grid_scores_[1][1]


Out[16]:
0.95599999999999996

In [17]:
max(gs.grid_scores_, key=lambda x: x[1])


Out[17]:
mean: 0.95700, std: 0.00136, params: {'penalty': 'l1', 'C': 2}
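
In [ ]:
# Equivalently, the fitted GridSearchCV object exposes the winning
# configuration directly; this carries the same information as
# taking the max above.
gs.best_score_, gs.best_params_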

In [ ]:
# The parameters found above are the best choices for our
# logistic regression classifier.
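
In [ ]:
# The randomized search itself was never run above; this is a minimal
# sketch of how it could look, reusing the distribution defined
# earlier (n_iter=5 is an arbitrary choice, not from the original).
rs = RandomizedSearchCV(lr, random_search_params, n_iter=5)
rs.fit(X, y)
rs.best_score_, rs.best_params_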