In [8]:
%matplotlib inline
import pickle
%run helper_functions.py
%run s3.py
pd.options.display.max_columns = 1000
plt.rcParams["figure.figsize"] = (15,10)
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
import psycopg2
import os
This notebook selects the top 35 features from our dataset.
I will rescale the resulting columns; while I am keenly aware this makes no difference to a Random Forest model, I am doing it for consistency.
I also pickle the scaler, as we will use it in our Flask web app to transform incoming data.
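For context, here is a minimal sketch of how the pickled scaler will be consumed on the Flask side (the input handling is a hypothetical placeholder, not code from this project; unpickle_object comes from helper_functions.py):
In [ ]:
import numpy as np

# Hypothetical Flask-side helper: load the scaler pickled later in this
# notebook and standardize one user-submitted row before prediction.
scaler = unpickle_object("scaler_35_features.pkl")

def scale_input(raw_features):
    """raw_features: 35 floats in the same column order as top_35."""
    row = np.array(raw_features, dtype=float).reshape(1, -1)
    return scaler.transform(row)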
In [ ]:
df = unpickle_object("dummied_dataset.pkl")
In [ ]:
df.shape
In [ ]:
# This logic will be important for Flask data entry.
# Non-"mths" float columns get their median. Among the "mths" columns,
# missing "months since X" values get a 999 sentinel (no such event on
# record) and missing counts get 0.
mths_fill_values = {
    "inq_last_6mths": 0,
    "mths_since_last_delinq": 999,
    "mths_since_last_record": 999,
    "collections_12_mths_ex_med": 0,
    "mths_since_last_major_derog": 999,
    "mths_since_rcnt_il": 999,
    "acc_open_past_24mths": 0,
    "chargeoff_within_12_mths": 0,
    "mths_since_recent_bc": 999,
    "mths_since_recent_bc_dlq": 999,
    "mths_since_recent_inq": 999,
    "mths_since_recent_revol_delinq": 999,
}

float_columns = df.select_dtypes(include=['float64']).columns
for col in float_columns:
    if "mths" not in col:
        df[col].fillna(df[col].median(), inplace=True)
    elif col in mths_fill_values:
        df[col].fillna(mths_fill_values[col], inplace=True)
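The same defaults carry over to single records entered through the Flask form; a minimal sketch (mths_fill_values is the dict defined above; train_medians is a hypothetical dict of per-column training medians that would need to be saved separately):
In [ ]:
# Hypothetical: fill one user-submitted record with the training defaults.
def fill_record(record, train_medians):
    filled = dict(record)
    for col, value in filled.items():
        if value is not None:
            continue
        if "mths" not in col:
            filled[col] = train_medians[col]
        elif col in mths_fill_values:
            filled[col] = mths_fill_values[col]
    return filled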
In [ ]:
top_35 = ["int_rate",
"dti",
"term_ 60 months",
"bc_open_to_buy",
"revol_util",
"installment",
"avg_cur_bal",
"tot_hi_cred_lim",
"revol_bal",
"funded_amnt_inv",
"bc_util",
"tot_cur_bal",
"total_bc_limit",
"total_rev_hi_lim",
"funded_amnt",
"loan_amnt",
"mo_sin_old_rev_tl_op",
"total_bal_ex_mort",
"issue_d_Dec-2016",
"total_acc",
"mo_sin_old_il_acct",
"mths_since_recent_bc",
"total_il_high_credit_limit",
"inq_last_6mths",
"acc_open_past_24mths",
"mo_sin_rcnt_tl",
"mo_sin_rcnt_rev_tl_op",
"percent_bc_gt_75",
"num_rev_accts",
"mths_since_last_delinq",
"open_acc",
"mths_since_recent_inq",
"grade_B",
"num_bc_tl",
"loan_status_Late"]
In [ ]:
df_reduced_features = df.loc[:, top_35]
In [ ]:
df_reduced_features.shape
In [ ]:
scaler = StandardScaler()
matrix_df = df_reduced_features.values  # as_matrix() was removed from pandas; .values is equivalent
matrix = scaler.fit_transform(matrix_df)
scaled_df = pd.DataFrame(matrix, columns=df_reduced_features.columns)
In [ ]:
scaler = StandardScaler()
matrix_df = df_reduced_features.values
scaler_object_35 = scaler.fit(matrix_df)
matrix = scaler_object_35.transform(matrix_df)
scaled_df_35 = pd.DataFrame(matrix, columns=df_reduced_features.columns)
In [ ]:
check = scaled_df_35 == scaled_df  # verify both paths agree before pickling the scaler
In [ ]:
check.head()
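check.head() only eyeballs the first few rows; a stricter whole-frame comparison would be (a sketch; numpy is imported locally in case helper_functions.py does not expose it):
In [ ]:
import numpy as np

# Every element must match; allclose tolerates float round-off between
# fit_transform and the separate fit + transform path.
np.allclose(scaled_df_35.values, scaled_df.values)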
In [ ]:
pickle_object(scaler_object_35, "scaler_35_features")
In [ ]:
pickle_object(scaled_df, "rf_df_35")
In [ ]:
upload_to_bucket("rf_df_35.pkl", "rf_df_35.pkl", "gabr-project-3")
In [ ]:
upload_to_bucket("scaler_35_features.pkl", "scaler_35_features.pkl", "gabr-project-3")
In [2]:
df = unpickle_object("rf_df_35.pkl")
In [5]:
engine = create_engine(os.environ["PSQL_CONN"])
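Note that PSQL_CONN is expected to hold a standard SQLAlchemy connection URL of the form postgresql://user:password@host:5432/dbname; the exact credentials are environment-specific.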
In [7]:
df.to_sql("dummied_dataset", con=engine)
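Note that to_sql raises a ValueError if the table already exists, so re-running this cell fails; a re-runnable variant using the standard pandas options would be:
In [ ]:
# Replace the table on re-runs and skip writing the DataFrame index column.
df.to_sql("dummied_dataset", con=engine, if_exists="replace", index=False)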
Below, we query the database directly: nothing has to be held in memory again!
In [10]:
pd.read_sql_query('''SELECT * FROM dummied_dataset LIMIT 5''', engine)
Out[10]:
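Since the table now lives in Postgres, filtering can also be pushed to the database instead of loading everything into pandas; a quick sketch (column names come from the top_35 list above, and values are on the standardized scale):
In [ ]:
# Example of pushing a filter down to Postgres:
pd.read_sql_query('''SELECT int_rate, dti, loan_amnt
                     FROM dummied_dataset
                     WHERE int_rate > 0
                     LIMIT 5''', engine)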
In [ ]: