In [8]:
%matplotlib inline
import pickle
%run helper_functions.py
%run s3.py
pd.options.display.max_columns = 1000
plt.rcParams["figure.figsize"] = (15,10)
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
import psycopg2
import os

This notebook will select the top 35 features from out dataset.

I will rescale the resulting columns - while I am keenly aware this makes no difference to the Random Forest Model, I am just doing it for consistency.

I also pickle the scaler as we will be using this in our flask web app to transform the input data.


In [ ]:
df = unpickle_object("dummied_dataset.pkl")

In [ ]:
df.shape

In [ ]:
#this logic will be important for flask data entry.

float_columns = df.select_dtypes(include=['float64']).columns

for col in float_columns:
    if "mths" not in col:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        if col == "inq_last_6mths":
            df[col].fillna(0, inplace=True)
        elif col == "mths_since_last_delinq":
            df[col].fillna(999, inplace=True)
        elif col == "mths_since_last_record":
            df[col].fillna(999, inplace=True)
        elif col == "collections_12_mths_ex_med":
            df[col].fillna(0, inplace=True)
        elif col == "mths_since_last_major_derog":
            df[col].fillna(999, inplace=True)
        elif col == "mths_since_rcnt_il":
            df[col].fillna(999, inplace=True)
        elif col == "acc_open_past_24mths":
            df[col].fillna(0, inplace=True)
        elif col == "chargeoff_within_12_mths":
            df[col].fillna(0, inplace=True)
        elif col == "mths_since_recent_bc":
            df[col].fillna(999, inplace=True)
        elif col == "mths_since_recent_bc_dlq":
            df[col].fillna(999, inplace=True)
        elif col == "mths_since_recent_inq":
            df[col].fillna(999, inplace=True)
        elif col == "mths_since_recent_revol_delinq":
            df[col].fillna(999, inplace=True)

In [ ]:
top_35 = ["int_rate", 
          "dti", 
          "term_ 60 months",
          "bc_open_to_buy",
          "revol_util",
          "installment",
          "avg_cur_bal",
          "tot_hi_cred_lim",
          "revol_bal",
          "funded_amnt_inv",
          "bc_util",
          "tot_cur_bal",
          "total_bc_limit",
          "total_rev_hi_lim",
          "funded_amnt",
          "loan_amnt",
          "mo_sin_old_rev_tl_op",
          "total_bal_ex_mort",
          "issue_d_Dec-2016",
          "total_acc",
          "mo_sin_old_il_acct",
          "mths_since_recent_bc",
          "total_il_high_credit_limit",
          "inq_last_6mths",
          "acc_open_past_24mths",
          "mo_sin_rcnt_tl",
          "mo_sin_rcnt_rev_tl_op",
          "percent_bc_gt_75",
          "num_rev_accts",
          "mths_since_last_delinq",
          "open_acc",
          "mths_since_recent_inq",
          "grade_B",
          "num_bc_tl",
          "loan_status_Late"]

In [ ]:
df_reduced_features = df.loc[:, top_35]

In [ ]:
df_reduced_features.shape

In [ ]:
scaler = StandardScaler()
matrix_df = df_reduced_features.as_matrix()
matrix = scaler.fit_transform(matrix_df)
scaled_df = pd.DataFrame(matrix, columns=df_reduced_features.columns)

In [ ]:
scaler = StandardScaler()
matrix_df = df_reduced_features.as_matrix()
scalar_object_35 = scaler.fit(matrix_df)
matrix = scalar_object_35.transform(matrix_df)
scaled_df_35 = pd.DataFrame(matrix, columns=df_reduced_features.columns)

In [ ]:
check = scaled_df_35 == scaled_df # lets pickle the scaler

In [ ]:
check.head()

In [ ]:
pickle_object(scalar_object_35, "scaler_35_features")

In [ ]:
pickle_object(scaled_df, "rf_df_35")

In [ ]:
upload_to_bucket('rf_df_35.pkl', "rf_df_35.pkl","gabr-project-3")

In [ ]:
upload_to_bucket("scaler_35_features.pkl", "scaler_35_features.pkl", "gabr-project-3")

In [2]:
df = unpickle_object("rf_df_35.pkl")

In [5]:
engine = create_engine(os.environ["PSQL_CONN"])

In [7]:
df.to_sql("dummied_dataset", con=engine)

BELOW WE DIRECTLY QUERY THE DATABASE BELOW: Nothing has to be held in memory again!


In [10]:
pd.read_sql_query('''SELECT * FROM dummied_dataset LIMIT 5''', engine)


Out[10]:
index int_rate dti term_ 60 months bc_open_to_buy revol_util installment avg_cur_bal tot_hi_cred_lim revol_bal funded_amnt_inv bc_util tot_cur_bal total_bc_limit total_rev_hi_lim funded_amnt loan_amnt mo_sin_old_rev_tl_op total_bal_ex_mort issue_d_Dec-2016 total_acc mo_sin_old_il_acct mths_since_recent_bc total_il_high_credit_limit inq_last_6mths acc_open_past_24mths mo_sin_rcnt_tl mo_sin_rcnt_rev_tl_op percent_bc_gt_75 num_rev_accts mths_since_last_delinq open_acc mths_since_recent_inq grade_B num_bc_tl loan_status_Late
0 0 -0.691239 0.316533 -0.572832 -0.336618 1.217037 -1.076647 -0.344728 -0.323194 -0.107344 -1.088801 0.149284 -0.354923 -0.290113 -0.215951 -1.095558 -1.096804 -0.173567 -0.242634 -0.113402 -1.373327 0.046177 2.9681 -0.221848 0.195522 -1.305392 -0.292118 -0.277218 0.028544 -0.135032 0.951804 -1.611450 2.180953 1.584982 -0.114309 -0.598253
1 1 0.332064 -0.526679 1.745714 -0.336618 -1.824751 -1.484993 -0.344728 -0.323194 -0.669430 -1.382859 0.149284 -0.354923 -0.290113 -0.215951 -1.393251 -1.394124 -0.173567 -0.242634 -0.113402 -1.792454 0.046177 2.9681 -0.221848 3.933241 -1.305392 -0.292118 -0.277218 0.028544 -0.135032 0.951804 -1.611450 2.180953 -0.630922 -0.114309 1.671534
2 2 0.484895 -0.282416 -0.572832 -0.336618 1.822939 -1.387900 -0.344728 -0.323194 -0.609796 -1.394741 0.149284 -0.354923 -0.290113 -0.215951 -1.405159 -1.406017 -0.173567 -0.242634 -0.113402 -1.289501 0.046177 2.9681 -0.221848 1.129952 -1.305392 -0.292118 -0.277218 0.028544 -0.135032 0.951804 -1.804259 2.180953 -0.630922 -0.114309 -0.598253
3 3 -0.062196 0.074485 -0.572832 -0.336618 -1.349855 -0.377418 -0.344728 -0.323194 -0.485640 -0.491774 0.149284 -0.354923 -0.290113 -0.215951 -0.500173 -0.502163 -0.173567 -0.242634 -0.113402 0.973787 0.046177 2.9681 -0.221848 0.195522 -1.305392 -0.292118 -0.277218 0.028544 -0.135032 -1.048563 -0.261787 2.180953 -0.630922 -0.114309 -0.598253
4 4 -0.239391 0.009306 1.745714 -0.336618 -0.002953 -1.453448 -0.344728 -0.323194 0.556906 -1.323454 0.149284 -0.354923 -0.290113 -0.215951 -1.333712 -1.334660 -0.173567 -0.242634 -0.113402 1.057612 0.046177 2.9681 -0.221848 -0.738908 -1.305392 -0.292118 -0.277218 0.028544 -0.135032 -1.042337 0.702258 2.180953 1.584982 -0.114309 -0.598253

In [ ]: