In [1]:
# Using a linear model for classification (logistic regression):
# fit a linear model to the probability of a certain class, then
# pass that score through a function and threshold it into a class label

In [2]:
# we're going to use the logistic function: 1 / (1 + e^(-t))
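
In [ ]:
# Illustrative sketch (not part of the original recipe): the logistic
# function squashes any real-valued score t into (0, 1), and thresholding
# the result at 0.5 gives a class label.
import numpy as np

def logistic(t):
    return 1. / (1. + np.exp(-t))

logistic(np.array([-4., 0., 4.]))  # roughly [0.018, 0.5, 0.982]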

In [33]:
from sklearn.datasets import make_classification

In [34]:
X, y = make_classification(n_samples=1000, n_features=4)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
lr = LogisticRegression()

In [37]:
# Since make_classification returns rows in random order, we can simply
# hold out the last 200 records as a test set. If the data had some
# inherent ordering or structure, we wouldn't want to split it this way
# (see the train_test_split sketch after this cell).

X_train = X[:-200]
X_test = X[-200:]
y_train = y[:-200]
y_test = y[-200:]
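
In [ ]:
# A hedged alternative (not in the original recipe): scikit-learn can do a
# shuffled split for us. train_test_split lives in sklearn.model_selection
# in recent versions (sklearn.cross_validation in older ones). Separate
# names are used here so the manual split above is left untouched.
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=200, random_state=0)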

In [38]:
lr.fit(X_train, y_train)
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [39]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]


Out[39]:
0.95374999999999999

In [40]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[40]:
0.96499999999999997
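
In [ ]:
# Equivalent check using scikit-learn's metrics module (illustrative, not in
# the original recipe): accuracy_score computes the same fraction of correct
# predictions as the manual comparison above.
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_test_predictions)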

In [41]:
# plain accuracy works well here because the two classes are roughly balanced;
# the next example uses a dataset where ~95% of the samples belong to one class

In [42]:
X, y = make_classification(n_samples=1000, n_features=4,
                          weights=[.95])

In [43]:
sum(y) / (len(y)*1.) # make sure we have ~5% positive classes


Out[43]:
0.057000000000000002
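
In [ ]:
# Another way to inspect the class balance (illustrative, not in the
# original recipe): np.bincount gives the raw count of each class label.
import numpy as np

np.bincount(y)  # array of counts for class 0 and class 1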

In [44]:
X_train = X[:-500]
X_test = X[-500:]
y_train = y[:-500]
y_test = y[-500:]

In [45]:
lr.fit(X_train, y_train)


Out[45]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [46]:
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [47]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]


Out[47]:
0.95799999999999996

In [48]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[48]:
0.95999999999999996

In [49]:
# accuracy on just the positive class (i.e., recall for class 1)
(y_test_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]


Out[49]:
0.48275862068965519
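
In [ ]:
# The quantity above is the recall for the positive class; sklearn.metrics
# can compute it directly (illustrative, not in the original recipe).
from sklearn.metrics import recall_score

recall_score(y_test, y_test_predictions)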

In [50]:
# we need to penalize mistakes on the rare positive class more heavily,
# i.e., adjust the class weights so that false negatives cost more

In [51]:
# since we know our dataset is ~95% negative, we can assign weights accordingly
lr = LogisticRegression(class_weight={0: .15, 1: .85})
lr.fit(X_train, y_train)


Out[51]:
LogisticRegression(C=1.0, class_weight={0: 0.15, 1: 0.85}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [52]:
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [53]:
(y_train_predictions == y_train).sum().astype(float) / y_train_predictions.shape[0]


Out[53]:
0.94399999999999995

In [54]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[54]:
0.94599999999999995

In [55]:
# recall on the positive class after re-weighting
(y_test_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]


Out[55]:
0.75862068965517238
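
In [ ]:
# A possible alternative to hand-picked weights (not part of the original
# recipe): let scikit-learn derive the weights from the class frequencies.
# class_weight='balanced' requires scikit-learn >= 0.17 ('auto' before that).
from sklearn.linear_model import LogisticRegression

lr_balanced = LogisticRegression(class_weight='balanced')
lr_balanced.fit(X_train, y_train)
balanced_predictions = lr_balanced.predict(X_test)

# recall on the positive class with the automatically derived weights
(balanced_predictions[y_test==1] == y_test[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]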

In [ ]: