In [1]:
# Using a linear model for classification (logistic regression):
# fit a linear model to the probability of a certain class, then
# pass that score through a function and threshold it into a class label

In [2]:
# we're going to use the logistic function: 1 / (1 + e^(-t))
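
In [ ]:
# Illustrative sketch (not part of the original recipe): the logistic
# function squashes any real-valued score t into (0, 1), and thresholding
# the result at 0.5 gives a class label.
import numpy as np

def logistic(t):
    return 1. / (1. + np.exp(-t))

logistic(np.array([-4., 0., 4.]))  # roughly [0.018, 0.5, 0.982]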

In [33]:
from sklearn.datasets import make_classification

In [34]:
X, y = make_classification(n_samples=1000, n_features=4)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
lr = LogisticRegression()

In [37]:
# Since make_classification returns rows in random order, we can simply
# hold out the last 200 records as a test set. If the data had some
# inherent ordering or structure, we wouldn't want to split it this way
# (see the train_test_split sketch after this cell).

X_train = X[:-200]
X_test = X[-200:]
y_train = y[:-200]
y_test = y[-200:]
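
In [ ]:
# A hedged alternative (not in the original recipe): scikit-learn can do a
# shuffled split for us. train_test_split lives in sklearn.model_selection
# in recent versions (sklearn.cross_validation in older ones). Separate
# names are used here so the manual split above is left untouched.
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=200, random_state=0)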

In [38]:
lr.fit(X_train, y_train)
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [39]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]


Out[39]:
0.95374999999999999

In [40]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[40]:
0.96499999999999997
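
In [ ]:
# Equivalent check using scikit-learn's metrics module (illustrative, not in
# the original recipe): accuracy_score computes the same fraction of correct
# predictions as the manual comparison above.
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_test_predictions)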

In [41]:
# plain accuracy works well here because the two classes are roughly balanced;
# the next example uses a dataset where ~95% of the samples belong to one class

In [42]:
X, y = make_classification(n_samples=1000, n_features=4,
                          weights=[.95])

In [43]:
sum(y) / (len(y)*1.) # make sure we have ~5% positive classes


Out[43]:
0.057000000000000002
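
In [ ]:
# Another way to inspect the class balance (illustrative, not in the
# original recipe): np.bincount gives the raw count of each class label.
import numpy as np

np.bincount(y)  # array of counts for class 0 and class 1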

In [44]:
X_train = X[:-500]
X_test = X[-500:]
y_train = y[:-500]
y_test = y[-500:]

In [45]:
lr.fit(X_train, y_train)


Out[45]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [46]:
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [47]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]


Out[47]:
0.95799999999999996

In [48]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[48]:
0.95999999999999996

In [49]:
# accuracy on just the positive class (i.e., recall for class 1)
(y_test_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]


Out[49]:
0.48275862068965519
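
In [ ]:
# The quantity above is the recall for the positive class; sklearn.metrics
# can compute it directly (illustrative, not in the original recipe).
from sklearn.metrics import recall_score

recall_score(y_test, y_test_predictions)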

In [50]:
# we need to penalize mistakes on the rare positive class more heavily,
# i.e., adjust the class weights so that false negatives cost more

In [51]:
# since we know our dataset is ~95% negative, we can assign weights accordingly
lr = LogisticRegression(class_weight={0: .15, 1: .85})
lr.fit(X_train, y_train)


Out[51]:
LogisticRegression(C=1.0, class_weight={0: 0.15, 1: 0.85}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [52]:
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [53]:
(y_train_predictions == y_train).sum().astype(float) / y_train_predictions.shape[0]


Out[53]:
0.94399999999999995

In [54]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[54]:
0.94599999999999995

In [55]:
# recall on the positive class after re-weighting
(y_test_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]


Out[55]:
0.75862068965517238
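
In [ ]:
# A possible alternative to hand-picked weights (not part of the original
# recipe): let scikit-learn derive the weights from the class frequencies.
# class_weight='balanced' requires scikit-learn >= 0.17 ('auto' before that).
from sklearn.linear_model import LogisticRegression

lr_balanced = LogisticRegression(class_weight='balanced')
lr_balanced.fit(X_train, y_train)
balanced_predictions = lr_balanced.predict(X_test)

# recall on the positive class with the automatically derived weights
(balanced_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]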

In [ ]: