In [2]:
# Homework 2

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC as svc
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV as gs
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier

%matplotlib inline

In [6]:
column_names = ['A' + str(i) for i in range(1, 17)]
data = pd.read_csv('./crx.data', names=column_names)

In [7]:
data.head()


Out[7]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [8]:
data.describe()


Out[8]:
A3 A8 A11 A15
count 690.000000 690.000000 690.00000 690.000000
mean 4.758725 2.223406 2.40000 1017.385507
std 4.978163 3.346513 4.86294 5210.102598
min 0.000000 0.000000 0.00000 0.000000
25% 1.000000 0.165000 0.00000 0.000000
50% 2.750000 1.000000 0.00000 5.000000
75% 7.207500 2.625000 3.00000 395.500000
max 28.000000 28.500000 67.00000 100000.000000

In [9]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [10]:
# Ok. So now we've got our data, and have taken a look at it. 
# I really wish this stuff wasn't so masked out, it feels like gibberish.

with open('./crx.names') as f:
    print f.read()



1. Title: Credit Approval

2. Sources: 
    (confidential)
    Submitted by quinlan@cs.su.oz.au

3.  Past Usage:

    See Quinlan,
    * "Simplifying decision trees", Int J Man-Machine Studies 27,
      Dec 1987, pp. 221-234.
    * "C4.5: Programs for Machine Learning", Morgan Kaufmann, Oct 1992

4.  Relevant Information:

    This file concerns credit card applications.  All attribute names
    and values have been changed to meaningless symbols to protect
    confidentiality of the data.

    This dataset is interesting because there is a good mix of
    attributes -- continuous, nominal with small numbers of
    values, and nominal with larger numbers of values.  There
    are also a few missing values.

5.  Number of Instances: 690

6.  Number of Attributes: 15 + class attribute

7.  Attribute Information:

    A1:	b, a.
    A2:	continuous.
    A3:	continuous.
    A4:	u, y, l, t.
    A5:	g, p, gg.
    A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
    A7:	v, h, bb, j, n, z, dd, ff, o.
    A8:	continuous.
    A9:	t, f.
    A10:	t, f.
    A11:	continuous.
    A12:	t, f.
    A13:	g, p, s.
    A14:	continuous.
    A15:	continuous.
    A16: +,-         (class attribute)

8.  Missing Attribute Values:
    37 cases (5%) have one or more missing values.  The missing
    values from particular attributes are:

    A1:  12
    A2:  12
    A4:   6
    A5:   6
    A6:   9
    A7:   9
    A14: 13

9.  Class Distribution

    +: 307 (44.5%)
    -: 383 (55.5%)

In [11]:
# Does data.info() show the right columns as continuous?
# We need A2, A3, A8, A11, A14 and A15 to be numeric.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB
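
A2 and A14 still show up as object because the missing values in this file are coded as '?'. An alternative, sketched below, would be to declare that placeholder at read time via read_csv's na_values argument, so the numeric columns are typed correctly from the start (data_alt is a hypothetical name; the replace-then-cast route used further down is what the rest of the homework is based on).

# Sketch only: treat '?' as NaN while reading, so A2 and A14 parse as floats.
data_alt = pd.read_csv('./crx.data', names=column_names, na_values='?')
data_alt.info()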

Mean substitution

An old procedure that should certainly be relegated to the past was the idea of substituting a mean for the missing data. For example, if you don't know my systolic blood pressure, just substitute the mean systolic blood pressure for mine and continue. There are a couple of problems with this approach. In the first place it adds no new information. The overall mean, with or without replacing my missing data, will be the same. In addition, such a process leads to an underestimate of error. Cohen et al. (2003) gave an interesting example of a data set on university faculty. The data consisted of data on salary and citation level of publications. There were 62 cases with complete data and 7 cases for which the citation index was missing. Cohen gives the following table.

     N     r        b   St. Err. b   Analysis
    62   .55  310.747        60.95   Complete cases
    69   .54  310.747        59.13   Mean substitution

Notice that using mean substitution makes only a trivial change in the correlation coefficient and no change in the regression coefficient. But the st. err (b) is noticeably smaller using mean substitution. That should not be surprising. We have really added no new information to the data but we have increased the sample size. The effect of increasing the sample size is to increase the denominator for computing the standard error, thus reducing the standard error. Adding no new information certainly should not make you more comfortable with the result, but this would seem to. The reduction is spurious and should be avoided--as we'll see below.

So I was reading up on the best methods for imputation and came to an impasse: I couldn't find a really good source on best practices. I started wondering how well I could do if I just got rid of the rows with missing values. Since we are only missing about 5% of the cases, which seems like very little, I've done the rest of the homework after dropping the rows with missing data.


In [12]:
data.replace('?', np.NaN, inplace=True)
data[['A2', 'A14']] = data[['A2', 'A14']].astype(float)
data.info()

what_if_i_want_to_impute_later = data.copy()

# We're assuming the data is missing at random
clean = data.dropna(axis=0)
clean.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     678 non-null object
A2     678 non-null float64
A3     690 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     681 non-null object
A7     681 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    677 non-null float64
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 91.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 653 entries, 0 to 689
Data columns (total 16 columns):
A1     653 non-null object
A2     653 non-null float64
A3     653 non-null float64
A4     653 non-null object
A5     653 non-null object
A6     653 non-null object
A7     653 non-null object
A8     653 non-null float64
A9     653 non-null object
A10    653 non-null object
A11    653 non-null int64
A12    653 non-null object
A13    653 non-null object
A14    653 non-null float64
A15    653 non-null int64
A16    653 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 86.7+ KB
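
For completeness, here is roughly what a simple imputation pass on the saved copy could look like: mean substitution for the continuous columns and the most frequent value for the categoricals. This is only a sketch (I haven't carried it through the rest of the homework), and as the Cohen example above warns, it adds no information while shrinking the standard errors.

# Sketch only: fill continuous columns with the column mean and
# categorical columns with the most common value.
imputed = what_if_i_want_to_impute_later.copy()
for col in ['A2', 'A14']:                   # continuous columns with NaNs
    imputed[col] = imputed[col].fillna(imputed[col].mean())
for col in ['A1', 'A4', 'A5', 'A6', 'A7']:  # categorical columns with NaNs
    imputed[col] = imputed[col].fillna(imputed[col].mode()[0])
imputed.info()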

In [13]:
data.describe()


Out[13]:
A2 A3 A8 A11 A14 A15
count 678.000000 690.000000 690.000000 690.00000 677.000000 690.000000
mean 31.568171 4.758725 2.223406 2.40000 184.014771 1017.385507
std 11.957862 4.978163 3.346513 4.86294 173.806768 5210.102598
min 13.750000 0.000000 0.000000 0.00000 0.000000 0.000000
25% 22.602500 1.000000 0.165000 0.00000 75.000000 0.000000
50% 28.460000 2.750000 1.000000 0.00000 160.000000 5.000000
75% 38.230000 7.207500 2.625000 3.00000 276.000000 395.500000
max 80.250000 28.000000 28.500000 67.00000 2000.000000 100000.000000

In [14]:
clean.describe()


Out[14]:
A2 A3 A8 A11 A14 A15
count 653.000000 653.000000 653.000000 653.000000 653.000000 653.000000
mean 31.503813 4.829533 2.244296 2.502297 180.359877 1013.761103
std 11.838267 5.027077 3.371120 4.968497 168.296811 5253.278504
min 13.750000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 22.580000 1.040000 0.165000 0.000000 73.000000 0.000000
50% 28.420000 2.835000 1.000000 0.000000 160.000000 5.000000
75% 38.250000 7.500000 2.625000 3.000000 272.000000 400.000000
max 76.750000 28.000000 28.500000 67.000000 2000.000000 100000.000000

In [15]:
# What difference does cleaning make?
clean.describe() - data.describe()

# not that much


Out[15]:
A2 A3 A8 A11 A14 A15
count -25.000000 -37.000000 -37.000000 -37.000000 -24.000000 -37.000000
mean -0.064358 0.070808 0.020890 0.102297 -3.654894 -3.624405
std -0.119595 0.048914 0.024607 0.105557 -5.509957 43.175905
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% -0.022500 0.040000 0.000000 0.000000 -2.000000 0.000000
50% -0.040000 0.085000 0.000000 0.000000 0.000000 0.000000
75% 0.020000 0.292500 0.000000 0.000000 -4.000000 4.500000
max -3.500000 0.000000 0.000000 0.000000 0.000000 0.000000

In [16]:
# But what if it isn't random? Are the missing values correlated with something?
dirty = data[pd.isnull(data).any(axis=1)]

In [17]:
dirty.corr()


Out[17]:
A2 A3 A8 A11 A14 A15
A2 1.000000 -0.141654 -0.096642 -0.269982 -0.044375 -0.208973
A3 -0.141654 1.000000 0.227406 0.211036 -0.366915 0.229068
A8 -0.096642 0.227406 1.000000 0.092876 -0.375809 0.033493
A11 -0.269982 0.211036 0.092876 1.000000 -0.142303 0.545660
A14 -0.044375 -0.366915 -0.375809 -0.142303 1.000000 -0.019799
A15 -0.208973 0.229068 0.033493 0.545660 -0.019799 1.000000
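
Another quick check on the missing-at-random assumption, sketched below: compare the class balance among the rows with missing values to the class balance in the full dataset.

# Sketch: class balance in the rows with missing values vs. the whole dataset.
print dirty['A16'].value_counts() / float(len(dirty))
print data['A16'].value_counts() / float(len(data))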

In [18]:
# A11 and A15 look like the only pair with a possibly meaningful correlation.

In [19]:
print stats.pearsonr(data['A11'], data['A15'])
print stats.pearsonr(clean['A11'], clean['A15'])


(0.063692439330212028, 0.094578025231347326)
(0.058407562463740265, 0.13597400501151491)

In [20]:
# Removing the rows with missing values barely changes the A11-A15 correlation, and it isn't significant either way.
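
To put one number on that across all the continuous columns, the largest absolute change in any pairwise correlation can be checked directly (a quick sketch):

# Sketch: biggest change in any pairwise correlation after dropping the rows.
print (clean.corr() - data.corr()).abs().max().max()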

In [21]:
from pandas import scatter_matrix
scatter_matrix(clean)


Out[21]:
[6x6 grid of matplotlib AxesSubplot objects -- the scatter matrix of the six continuous columns]
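
Since seaborn is already imported, a class-colored version of this plot might be more informative. A sketch, assuming a seaborn release new enough to have pairplot:

# Sketch: scatter matrix of the continuous columns, colored by the class label.
sns.pairplot(clean, hue='A16', vars=['A2', 'A3', 'A8', 'A11', 'A14', 'A15'])
plt.show()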

In [22]:
# change those plus and minus signs in the class column
clean = clean.copy()  # work on a real copy to avoid the SettingWithCopyWarning
clean['A16'] = clean['A16'].replace({'+': '1', '-': '0'})

clean['A16'].describe()


Out[22]:
count     653
unique      2
top         0
freq      357
Name: A16, dtype: object

In [23]:
approval = clean['A16']
data = clean.drop('A16', axis=1)
data = pd.get_dummies(data)
data.head()


Out[23]:
A2 A3 A8 A11 A14 A15 A1_a A1_b A4_l A4_u ... A7_z A9_f A9_t A10_f A10_t A12_f A12_t A13_g A13_p A13_s
0 30.83 0.000 1.25 1 202 0 0 1 0 1 ... 0 0 1 0 1 1 0 1 0 0
1 58.67 4.460 3.04 6 43 560 1 0 0 1 ... 0 0 1 0 1 1 0 1 0 0
2 24.50 0.500 1.50 0 280 824 1 0 0 1 ... 0 0 1 1 0 1 0 1 0 0
3 27.83 1.540 3.75 5 100 3 0 1 0 1 ... 0 0 1 0 1 0 1 1 0 0
4 20.17 5.625 1.71 0 120 0 0 1 0 1 ... 0 0 1 1 0 1 0 0 0 1

5 rows × 46 columns


In [24]:
x_train, x_test, y_train, y_test = train_test_split(data, approval, test_size=0.30)
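
One caveat, sketched below: without a fixed random_state the split (and therefore every score that follows) changes from run to run, which is probably why rerunning the same logistic regression cell later gives slightly different numbers. The variables here are hypothetical alternatives, not what produced the outputs shown.

# Sketch: pin the split so scores are reproducible across reruns.
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(
    data, approval, test_size=0.30, random_state=0)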

In [25]:
# Logistic regression, default parameters
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm


0.852040816327
[[88 20]
 [ 9 79]]

In [26]:
# Linear SVM
est = svc()
est.fit(x_train, y_train)
d = {'C': np.logspace(-3., 3., 10)}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results


Out[26]:
(LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
      random_state=None, tol=0.0001, verbose=0),
 {'C': 0.001},
 0.80962800875273522)
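
The best C landed on the smallest value in the grid, which usually means the grid should extend further down. A sketch of a wider search (not rerun here):

# Sketch: extend the C grid downward since the best value sat on the lower edge.
d = {'C': np.logspace(-6., 3., 19)}
g = gs(est, d)
g.fit(x_train, y_train)
print g.best_params_, g.best_score_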

In [27]:
# Non linear SVM
est = SVC()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'gamma': np.logspace(-3., 3., 10),
'kernel' : ['sigmoid', 'rbf'],
    } # I actually added a few more here but some took too long so I left them out.
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
nonlinear_results = g.best_estimator_, g.best_params_, g.best_score_

In [28]:
nonlinear_results


Out[28]:
(SVC(C=46.415888336127729, cache_size=200, class_weight=None, coef0=0.0,
   degree=3, gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
   random_state=None, shrinking=True, tol=0.001, verbose=False),
 {'C': 46.415888336127729, 'gamma': 0.001, 'kernel': 'rbf'},
 0.67177242888402622)

In [29]:
# Hmm. Let's try some more
est = SGD()
est.fit(x_train, y_train)
d = {
'loss': ['log', 'perceptron', 'huber', 'epsilon_insensitive'],
'penalty': ['l1', 'elasticnet', 'l2'],
'alpha' : np.logspace(-3., 3., 10),
'epsilon' : np.logspace(-3., 3., 10)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
sgd_results = g.best_estimator_, g.best_params_, g.best_score_

In [30]:
sgd_results


Out[30]:
(SGDClassifier(alpha=1000.0, class_weight=None, epsilon=0.0046415888336127772,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='huber', n_iter=5, n_jobs=1,
        penalty='l2', power_t=0.5, random_state=None, shuffle=False,
        verbose=0, warm_start=False),
 {'alpha': 1000.0,
  'epsilon': 0.0046415888336127772,
  'loss': 'huber',
  'penalty': 'l2'},
 0.6827133479212254)

In [31]:
print linear_results[2]
print nonlinear_results[2]
print sgd_results[2]

# It looks like the logistic regression did best actually...


0.809628008753
0.671772428884
0.682713347921
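
One likely reason the SVM variants trail logistic regression is that the continuous columns sit on wildly different scales (A15 runs up to 100,000) and nothing is standardized. Below is a sketch (not run here) of folding scaling into the search with a Pipeline; grid parameters for a pipeline step are addressed as step__param.

# Sketch: standardize features inside cross-validation, then fit a linear SVM.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('svm', svc())])
d = {'svm__C': np.logspace(-3., 3., 10)}
g = gs(pipe, d)
g.fit(x_train, y_train)
print g.best_params_, g.best_score_
print g.score(x_test, y_test)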

In [32]:
# Linear
est = svc()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'tol': np.logspace(-3., 3., 10),
'dual': [True, False]
    }
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results


Out[32]:
(LinearSVC(C=46.415888336127729, class_weight=None, dual=False,
      fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
      penalty='l2', random_state=None, tol=0.001, verbose=0),
 {'C': 46.415888336127729, 'dual': False, 'tol': 0.001},
 0.87089715536105028)

In [33]:
# Adding tol / dual made a notable difference.

# Linear
est = svc()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'tol': np.logspace(-3., 3., 10),
'intercept_scaling': np.logspace(-3., 3., 10)
    }
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results


Out[33]:
(LinearSVC(C=0.46415888336127775, class_weight=None, dual=True,
      fit_intercept=True, intercept_scaling=0.021544346900318832, loss='l2',
      multi_class='ovr', penalty='l2', random_state=None,
      tol=0.10000000000000001, verbose=0),
 {'C': 0.46415888336127775,
  'intercept_scaling': 0.021544346900318832,
  'tol': 0.10000000000000001},
 0.86870897155361049)

In [34]:
# Ok, intercept_scaling doesn't really matter.
# Refit a linear SVM with C=0.021544346900318832, dual=False, tol=0.001
# (note: the grid search in Out[32] actually picked a larger C).
est = svc(C=0.021544346900318832, dual=False, tol=0.001)
est.fit(x_train, y_train)
print est.score(x_test, y_test)
y_pred = est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print cm
print classification_report(y_test, y_pred)


0.852040816327
[[86 22]
 [ 7 81]]
             precision    recall  f1-score   support

          0       0.92      0.80      0.86       108
          1       0.79      0.92      0.85        88

avg / total       0.86      0.85      0.85       196


In [35]:
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()



In [36]:
current_palette = sns.color_palette()
sns.palplot(current_palette)
sns.palplot(sns.color_palette("GnBu"))



In [37]:
# Pretty plot time!

In [38]:
plt.imshow(cm, cmap='GnBu', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm


[[86 22]
 [ 7 81]]

In [39]:
# Let's use a dummy classifier just to check and see what's up.
dummy_test1 = DummyClassifier(strategy="stratified").fit(x_train, y_train).score(x_test, y_test)
dummy_test2 = DummyClassifier(strategy="most_frequent").fit(x_train, y_train).score(x_test, y_test)
dummy_test3 = DummyClassifier(strategy="uniform").fit(x_train, y_train).score(x_test, y_test)
print dummy_test1, dummy_test2, dummy_test3, est.score(x_test, y_test)


0.505102040816 0.551020408163 0.489795918367 0.852040816327

In [40]:
# Looks like we are doing alright.

In [41]:
# So overall it looks like logistic regression did the best, even without adjusting parameters.

In [42]:
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10)
    }
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_

In [43]:
log_results


Out[43]:
(LogisticRegression(C=2.154434690031882, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.0001),
 {'C': 2.154434690031882},
 0.86870897155361049)

In [44]:
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm


[[86 22]
 [ 7 81]]

In [46]:
# Logistic # NO PARAMETERS ADJUSTED
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm


0.852040816327
[[88 20]
 [ 9 79]]

In [74]:
# Increase the resolution of the C grid by a factor of 10.
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 100)
    }
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_; log_results


Out[74]:
(LogisticRegression(C=0.065793322465756823, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.0001),
 {'C': 0.065793322465756823},
 0.87964989059080967)

In [75]:
# Now widen the range to 10^-4 .. 10^4 and grid over both C and tol.
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-4., 4., 20),
'tol': np.logspace(-4., 4., 20)
    }
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_; log_results


Out[75]:
(LogisticRegression(C=3792.6901907322458, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.00026366508987303583),
 {'C': 3792.6901907322458, 'tol': 0.00026366508987303583},
 0.87746170678336977)

In [51]:
# So basically tuning C buys very little; evidently you could just leave it at 1.0, haha.

Conclusion

Logistic regression did the best. These were the best parameters and cross-validation score from the grid search:

(LogisticRegression(C=0.065793322465756823, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, penalty='l2',
           random_state=None, tol=0.0001),
 {'C': 0.065793322465756823},
 0.87964989059080967)
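
As a final sanity check (a sketch I haven't run), the tuned model could be cross-validated on all 653 cleaned rows instead of being scored on a single 70/30 split:

# Sketch: 10-fold cross-validation of the tuned logistic regression on all rows.
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(LR(C=0.065793322465756823), data, approval, cv=10)
print scores.mean(), scores.std()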

In [73]:
# Logistic # NO PARAMETERS ADJUSTED
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm


0.862244897959
[[89 19]
 [ 8 80]]

In [ ]: