In [1]:
%matplotlib inline
import pickle
%run helper_loans.py
pd.options.display.max_columns = 1000
plt.rcParams["figure.figsize"] = (15,10)
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
In [2]:
df = unpickle_object("dummied_dataset.pkl")
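unpickle_object and pickle_object come from helper_loans.py, which isn't shown in this notebook. A minimal sketch of what they likely look like, assuming they are thin wrappers around the standard pickle module (the names match the calls in this notebook, but the bodies are inferred, not confirmed):
In [ ]:
# Hypothetical reconstruction of the helpers loaded via %run helper_loans.py.
# Assumed to be plain pickle round-trips; not the actual implementation.
def pickle_object(obj, name):
    """Serialize obj to disk, appending .pkl if the name lacks it."""
    path = name if name.endswith(".pkl") else name + ".pkl"
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def unpickle_object(path):
    """Load and return the object stored at path."""
    with open(path, "rb") as f:
        return pickle.load(f)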
In [3]:
df.shape
Out[3]:
I only have 23 completely clean rows in my dataframe. As such, fancier imputation methods, like using a Random Forest, are out of the question.
I will have to use human logic to figure out how best to impute all of my features of type float64.
Since I am dealing with financial data, I will impute with the median.
For the columns that track the months since something occurred, I will impute based on what the category means.
Example:
Months since last delinquency: if NaN, I will assume the individual has never been delinquent, and thus assign a value of 999.
If number of charge-offs within 12 months is NaN, I will assume this individual has not had a loan charged off in the last year, and thus impute with 0.
In [4]:
# This logic will be important for Flask data entry.
# NaN in a "months since X" column means X never happened: impute with a 999 sentinel.
sentinel_999 = {
    "mths_since_last_delinq", "mths_since_last_record",
    "mths_since_last_major_derog", "mths_since_rcnt_il",
    "mths_since_recent_bc", "mths_since_recent_bc_dlq",
    "mths_since_recent_inq", "mths_since_recent_revol_delinq",
}
# NaN in a count-within-a-window column means zero occurrences: impute with 0.
fill_zero = {
    "inq_last_6mths", "collections_12_mths_ex_med",
    "acc_open_past_24mths", "chargeoff_within_12_mths",
}

float_columns = df.select_dtypes(include=['float64']).columns
for col in float_columns:
    if "mths" not in col:
        # Generic financial features: the median is robust to skew and outliers.
        df[col] = df[col].fillna(df[col].median())
    elif col in sentinel_999:
        df[col] = df[col].fillna(999)
    elif col in fill_zero:
        df[col] = df[col].fillna(0)
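A quick sanity check, not in the original notebook, to confirm the loop left nothing behind: any "mths" column that wasn't explicitly handled above would still show up here with a nonzero NaN count.
In [ ]:
# Sanity check: list any float64 columns that still contain NaNs after imputation.
remaining = df[float_columns].isnull().sum()
remaining[remaining > 0]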
In [ ]:
scaler = StandardScaler()
# DataFrame.as_matrix() has been removed from pandas; .values is the modern equivalent.
matrix = scaler.fit_transform(df.values)
scaled_df = pd.DataFrame(matrix, columns=df.columns)
In [ ]:
scaled_df.shape
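One caveat worth flagging: fitting StandardScaler on the full dataframe before a train/test split leaks test-set statistics into the scaler. Since train_test_split is already imported, a leakage-free sketch would fit the scaler on the training portion only. The target name below is taken from the legacy cell further down, and test_size/random_state are illustrative choices:
In [ ]:
# Leakage-free alternative: learn scaling statistics from the training split only.
X = df.drop('loan_status_Late', axis=1)
y = df['loan_status_Late']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)   # mean/std computed on training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)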
In [5]:
pickle_object(df, "CLASSIFICATION DF")
In [ ]:
pickle_object(scaled_df, "GLM DATAFRAME")
In [ ]:
# legacy code - how I would implement a random forest imputation
# good_features = df[df['mths_since_last_record'].notnull()]
# good_values = good_features.drop(['mths_since_last_record', 'loan_status_Late'], axis=1).values
# good_indices = good_features.index
# good_target = df.loc[good_indices, 'mths_since_last_record'].values
# to_predict_array = df[df['mths_since_last_record'].isnull()].drop(['mths_since_last_record', 'loan_status_Late'], axis=1).values
# to_predict_index = df[df['mths_since_last_record'].isnull()].index
# model = RandomForestClassifier(n_estimators=25, criterion='entropy', n_jobs=-1)
# model.fit(good_values, good_target)
# impute_values = model.predict(to_predict_array)
# df.loc[to_predict_index, 'mths_since_last_record'] = impute_values
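For reference, the legacy cell above generalizes into a reusable function. This is a sketch of the same idea, not code that was run in this notebook: train a RandomForestClassifier on the rows where the column is present, then predict it for the rows where it is missing.
In [ ]:
# Sketch: random-forest imputation of a single column (same idea as the legacy cell).
# Assumes the remaining feature columns contain no NaNs themselves.
def rf_impute(frame, target_col, exclude=('loan_status_Late',)):
    """Fill NaNs in target_col by predicting them from the other features."""
    drop_cols = [target_col] + [c for c in exclude if c in frame.columns]
    known = frame[frame[target_col].notnull()]
    unknown = frame[frame[target_col].isnull()]
    if unknown.empty:
        return frame
    model = RandomForestClassifier(n_estimators=25, criterion='entropy', n_jobs=-1)
    model.fit(known.drop(drop_cols, axis=1).values, known[target_col].values)
    frame.loc[unknown.index, target_col] = model.predict(unknown.drop(drop_cols, axis=1).values)
    return frame

# Usage (hypothetical): rf_impute(df, 'mths_since_last_record')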