In [1]:
# Using a linear model for classification (logistic regression).
# This means fitting a linear model to the probability of
# a certain class, then using a function to create a threshold.
In [2]:
# We are going to use the logistic function: f(t) = 1 / (1 + e^(-t))
In [33]:
from sklearn.datasets import make_classification
In [34]:
# Generate a synthetic classification dataset (1000 samples, 4 features).
# random_state is fixed so that Restart & Run All reproduces the same data
# and therefore the same accuracy figures in the cells below.
X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
In [35]:
from sklearn.linear_model import LogisticRegression
In [36]:
# Logistic regression classifier created with the library's default settings;
# it is fitted on the training split in a later cell.
lr = LogisticRegression()
In [37]:
# The samples are randomly generated, so simply holding out the final 200
# rows as the test set is fine here. With structured data this kind of
# positional split would not be appropriate.
X_train, X_test = X[:-200], X[-200:]
y_train, y_test = y[:-200], y[-200:]
In [38]:
# Fit on the training split, then predict labels for both splits
# (training predictions first, test predictions second).
lr.fit(X_train, y_train)
y_train_predictions, y_test_predictions = lr.predict(X_train), lr.predict(X_test)
In [39]:
# Training accuracy: fraction of training labels predicted correctly.
(y_train_predictions == y_train).mean()
Out[39]:
In [40]:
# Test accuracy: fraction of held-out labels predicted correctly.
(y_test_predictions == y_test).mean()
Out[40]:
In [41]:
# This works when the classes are roughly balanced.
# The next example uses a 95%/5% class imbalance.
In [42]:
# Regenerate the data with a heavy class imbalance: weights=[.95] requests
# roughly 95% of the samples in class 0 (the next cell checks that about 5%
# are positive). random_state is fixed so the notebook reproduces the same
# data and scores on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    weights=[.95],
    random_state=0,
)
In [43]:
# Sanity check: the share of positive (1) labels should be roughly 5%.
y.mean()
Out[43]:
In [44]:
# Hold out the final 500 rows as the test set; everything before them is
# used for training.
X_train, X_test = X[:-500], X[-500:]
y_train, y_test = y[:-500], y[-500:]
In [45]:
# Refit the same `lr` estimator on the imbalanced training split. The call is
# the cell's last expression, so the notebook echoes its return value.
lr.fit(X_train, y_train)
Out[45]:
In [46]:
# Predict labels for both splits with the refitted model
# (training predictions first, test predictions second).
y_train_predictions, y_test_predictions = lr.predict(X_train), lr.predict(X_test)
In [47]:
# Training accuracy on the imbalanced data.
(y_train_predictions == y_train).mean()
Out[47]:
In [48]:
# Test accuracy on the imbalanced data.
(y_test_predictions == y_test).mean()
Out[48]:
In [49]:
# Accuracy restricted to the truly positive test samples: of the rows where
# y_test == 1, the fraction that the model also predicted as 1.
(y_test_predictions[y_test == 1] == 1).mean()
Out[49]:
In [50]:
# we need to adjust the weight we use to penalize false negatives
In [51]:
# Weight the two classes unevenly so that errors on the rare positive class (1)
# are penalized more heavily than errors on the majority negative class (0).
# NOTE(review): the data was generated as ~95% negative, yet the weights are
# 0.15 / 0.85 rather than a mirror of that ratio — confirm this is intended.
lr = LogisticRegression(class_weight={0: .15, 1: .85})
lr.fit(X_train, y_train)
Out[51]:
In [52]:
# Predict labels for both splits with the class-weighted model
# (training predictions first, test predictions second).
y_train_predictions, y_test_predictions = lr.predict(X_train), lr.predict(X_test)
In [53]:
# Training accuracy of the class-weighted model.
(y_train_predictions == y_train).mean()
Out[53]:
In [54]:
# Test accuracy of the class-weighted model.
(y_test_predictions == y_test).mean()
Out[54]:
In [55]:
# Accuracy of the class-weighted model on the truly positive test samples:
# of the rows where y_test == 1, the fraction predicted as 1.
(y_test_predictions[y_test == 1] == 1).mean()
Out[55]:
In [ ]: