In [1]:
%matplotlib inline
import pickle
# helper_loans.py is assumed to define unpickle_object (and to import pd and plt used below)
%run helper_loans.py
pd.options.display.max_columns = 1000
plt.rcParams["figure.figsize"] = (15,10)
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier

In [2]:
df = unpickle_object("clean dataframe.pkl")

In [3]:
df.shape


Out[3]:
(538008, 1926)

In [4]:
y = df['loan_status_Late'].values
df.drop('loan_status_Late', inplace=True, axis=1)
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
# params = {'strategy': ["stratified", "most_frequent", "prior", "uniform", "constant"]}
model = DummyClassifier(strategy="stratified", random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)


Out[4]:
0.61074184770486684

In [5]:
model2 = DummyClassifier(strategy="most_frequent", random_state=0)
model2.fit(X_train, y_train)
model2.score(X_test, y_test)


Out[5]:
0.7374611529940075
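
A quick sanity check (a sketch, assuming loan_status_Late is a 0/1 indicator, so y_test.mean() is the fraction of late loans): the most_frequent score should equal the majority-class share of the test set, and the stratified score should match p**2 + (1 - p)**2 for that share p.

p = 1 - y_test.mean()     # share of "not late" loans in the test set
print(p)                  # ~0.7375, the most_frequent score above
print(p**2 + (1 - p)**2)  # ~0.613, close to the stratified score of ~0.611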

In [6]:
model3 = DummyClassifier(strategy="prior", random_state=0)
model3.fit(X_train, y_train)
model3.score(X_test, y_test)


Out[6]:
0.7374611529940075

In [7]:
model4 = DummyClassifier(strategy="uniform", random_state=0)
model4.fit(X_train, y_train)
model4.score(X_test, y_test)


Out[7]:
0.49864686026973576

In [8]:
model5 = DummyClassifier(strategy="constant", random_state=0, constant=0)
model5.fit(X_train, y_train)
model5.score(X_test, y_test)


Out[8]:
0.7374611529940075
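
The five cells above can also be collapsed into a single loop; a minimal sketch:

for strategy in ["stratified", "most_frequent", "prior", "uniform", "constant"]:
    # only the "constant" strategy requires the constant keyword
    kwargs = {"constant": 0} if strategy == "constant" else {}
    dummy = DummyClassifier(strategy=strategy, random_state=0, **kwargs)
    dummy.fit(X_train, y_train)
    print(strategy, dummy.score(X_test, y_test))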

From the above, we can see that constantly guessing "not late" gives us 73.7% accuracy.

This is identical to the scores for the most_frequent and prior strategies of our dummy classifier, which makes sense: both simply predict the majority class, which here is "not late". The uniform strategy flips a fair coin for every loan, so it lands near 50% regardless of the class balance.

73.7% is the number to beat!
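
RandomForestClassifier is already imported above; as a sketch of the next step (n_estimators and n_jobs here are illustrative choices, not values from the original analysis):

# Try to beat the 73.7% baseline with an actual model.
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)  # accuracy to compare against 0.7375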