SVM Focus on Best Parameters with GridSearchCV

Importing Modules


In [2]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# from sklearn.grid_search import GridSearchCV

Run Variables Setup If Necessary


In [4]:
if 'features_train' or 'features_train_small' not in locals() or globals():
    %run ../dev/environment_setup2.ipynb


The nltk version is 3.2.4.
The scikit-learn version is 0.18.1.
The matplotlib version is 1.5.0.
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-4-960dcd2505b6> in <module>()
----> 1 features_train, features_test, labels_train, labels_test = preprocess()

<ipython-input-4-b9e2aa63b0f4> in preprocess(words_file, authors_file)
    171     selector = SelectPercentile(f_classif, percentile=10)
    172     selector.fit(features_train_transformed, labels_train)
--> 173     features_train_transformed = selector.transform(features_train_transformed).toarray()
    174     features_test_transformed  = selector.transform(features_test_transformed).toarray()
    175 

/opt/ds/local/lib/python2.7/site-packages/scipy/sparse/compressed.pyc in toarray(self, order, out)
    947     def toarray(self, order=None, out=None):
    948         """See the docstring for `spmatrix.toarray`."""
--> 949         return self.tocoo(copy=False).toarray(order=order, out=out)
    950 
    951     ##############################################################

/opt/ds/local/lib/python2.7/site-packages/scipy/sparse/coo.pyc in toarray(self, order, out)
    272     def toarray(self, order=None, out=None):
    273         """See the docstring for `spmatrix.toarray`."""
--> 274         B = self._process_toarray_args(order, out)
    275         fortran = int(B.flags.f_contiguous)
    276         if not fortran and not B.flags.c_contiguous:

/opt/ds/local/lib/python2.7/site-packages/scipy/sparse/base.pyc in _process_toarray_args(self, order, out)
    798             return out
    799         else:
--> 800             return np.zeros(self.shape, dtype=self.dtype, order=order)
    801 
    802     def __numpy_ufunc__(self, func, method, pos, inputs, **kwargs):

MemoryError: 

Reduced Training Dataset (First 1% of Samples)


In [ ]:
# Keep only the first 1% of the training features and labels.
# Floor division (`//`) keeps the slice bound an integer on both Python 2 and
# Python 3; plain `/` produces a float on Python 3, which is not a valid
# slice index.
features_train_small = features_train[:len(features_train) // 100]
labels_train_small = labels_train[:len(labels_train) // 100]

Configure SVM Classifier with GridSearchCV over Kernel, C and Gamma


In [ ]:
# Two sub-grids are searched: an RBF kernel over (gamma, C) and a linear
# kernel over C alone.
rbf_grid = {
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000],
}
linear_grid = {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000],
}
parameters = [rbf_grid, linear_grid]

# Base SVC built with default arguments; the grid search supplies the
# kernel / C / gamma combinations listed above.
svr = svm.SVC()
clf = GridSearchCV(estimator=svr, param_grid=parameters)

Train and Predict


In [ ]:
# Fit/evaluate via the helper defined outside this cell, then report the
# winning parameter combination and its score from the grid search object.
# NOTE(review): presumably grid_train_predict fits the global `clf` on the
# reduced dataset — confirm against its definition in the setup notebook.
grid_train_predict("SVM with GridSearchCV and Reduced Dataset...")

param = "Best Param: " + str(clf.best_params_)
print(param)
score = "Best Average Score: " + str(clf.best_score_)
print(score)