In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [78]:
from munging import session
from munging import transform
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
In [81]:
## helper function
def test_blender_model(selected_features = None):
    ## blend four tree ensembles with a logistic-regression blender
    models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
    blender = dsession.blend_models(models = models,
                                    blender = LogisticRegression(),
                                    feature_names = selected_features)
    train_matrix, test_matrix = dsession.get_data(selected_features)
    trainyhat = blender.predict(train_matrix)
    testyhat = blender.predict(test_matrix)
    train_y, test_y = train_matrix.loss, test_matrix.loss
    ## report AUC on the blender scores, plus accuracy/F1 at a 0.55 threshold
    print 'train auc:', roc_auc_score(train_y, trainyhat)
    print 'train accuracy:', np.mean(train_y == (trainyhat >= 0.55))
    print 'train f1:', f1_score(train_y, trainyhat >= 0.55)
    print 'test auc:', roc_auc_score(test_y, testyhat)
    print 'test accuracy:', np.mean(test_y == (testyhat >= 0.55))
    print 'test f1:', f1_score(test_y, testyhat >= 0.55)
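For reference, the blending step above can be thought of as a simple form of stacking: fit several base models on training rows, then let a meta-model (here a logistic regression) learn how to combine their predictions. The sketch below is only an illustration of that idea, not the Session's actual implementation; the function name and arguments are hypothetical.

import numpy as np

def blend_sketch(base_models, blender, X_train, y_train, X_blend, y_blend):
    ## fit every base model on the training rows
    for m in base_models:
        m.fit(X_train, y_train)
    ## stack the base models' positive-class probabilities as meta-features
    meta = np.column_stack([m.predict_proba(X_blend)[:, 1] for m in base_models])
    ## the blender (e.g. LogisticRegression()) learns how to weight the base models
    blender.fit(meta, y_blend)
    return blender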
In [67]:
## load train data
data = pd.read_csv("data/loan_default_prediction/train_v2.csv")
data = data.set_index("id", drop=True)
data.loss = np.where(data.loss == 0, 0, 1) ## make it a binary classification
actual_numerics = ['f137','f138','f206','f207','f276','f277','f338','f390','f391','f419',
                   'f420','f466','f469','f472','f534','f537','f626','f627','f695','f698']
for f in actual_numerics:
    data[f] = data[f].astype(np.float)
data.head(3)
Out[67]:
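Aside: the explicit float cast above forces these columns to be treated as numerical features downstream. A rough way to spot columns that pandas did not parse as numeric in the first place (the hand-picked list above was presumably found this way or by inspection):

## object-dtype columns are candidates for an explicit numeric cast
object_cols = [c for c in data.columns if data[c].dtype == object]
print(object_cols[:20])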
In [68]:
## create a session for exploration
dsession = session.Session(data, target_feature = "loss", test_frac = 0.3, random_state=0)
transformers = []
print dsession.get_parameters()
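The Session keeps a train/test split internally and applies every later transform consistently to both halves. A hypothetical sketch of the kind of random row split that test_frac = 0.3 implies (the Session's real mechanics are its own):

import numpy as np

rng = np.random.RandomState(0)
mask = rng.rand(len(data)) < 0.7            ## roughly 70% of rows for training
train_df, test_df = data[mask], data[~mask]
print(train_df.shape)
print(test_df.shape)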
In [69]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)
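Telling numerical from categorical features usually comes down to dtype plus cardinality. A hedged sketch of one such heuristic (not necessarily the rule Session uses; the 20-level cutoff is made up):

def is_numerical_sketch(col, max_levels=20):
    ## numeric dtype with many distinct values -> treat as numerical;
    ## object dtype or low cardinality -> treat as categorical
    return col.dtype != object and col.nunique() > max_levels

numerical_guess = [f for f in data.columns if f != 'loss' and is_numerical_sketch(data[f])]
print(len(numerical_guess))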
In [70]:
## know what you are dealing with: check the target class balance
pd.value_counts(data.loss)
Out[70]:
In [71]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print len(na_feats)
In [72]:
## find features that are heavily missing
heavyna_feats = dsession.get_features_of(dsession.is_na_heavy)
print heavyna_feats
In [73]:
## impute missing values
imputer = dsession.impute_features(auto_remove=True)
transformers.append(imputer)
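The NA handling above boils down to: measure the missing fraction per feature, drop the heavily missing ones, and impute the rest. A minimal pandas sketch of that idea, with a hypothetical 50% "NA heavy" cutoff and median imputation (the library's choices may differ):

na_frac = data.isnull().mean()                     ## missing fraction per column
heavy = na_frac[na_frac > 0.5].index               ## hypothetical "NA heavy" cutoff
filled = data.drop(heavy, axis=1)
filled = filled.fillna(filled.median())            ## median-impute the remaining numeric NAs
print(filled.isnull().sum().sum())                 ## any NAs left sit in non-numeric columns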
In [74]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)
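"Non-informative" typically means constant or near-constant columns. A quick hedged check along those lines, with a made-up 99% dominance cutoff:

## a feature where one value covers nearly every row carries little signal
dominant_frac = data.apply(lambda col: col.value_counts(dropna=False).iloc[0] / float(len(col)))
near_constant = dominant_frac[dominant_frac > 0.99].index.tolist()
print(len(near_constant))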
In [76]:
## find skewed numerical features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))
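Evenizing a skewed feature usually means applying a monotone transform such as log1p so that the distribution is less lopsided before scaling. A sketch using scipy's skewness measure (the evenize implementation itself belongs to the library; the threshold here is an assumption):

from scipy.stats import skew
import numpy as np

def evenize_sketch(col, skew_threshold=1.0):
    ## log1p is a common choice for heavily right-skewed, non-negative features
    if abs(skew(col.dropna())) > skew_threshold and (col.dropna() >= 0).all():
        return np.log1p(col)
    return col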
In [82]:
## test - imputed features + removed noninformative + evenized skew
test_blender_model()
In [83]:
## whiten features
## if the feature ranges are not too different, whitening may not be necessary
## alternatively, you can use the min-max scaler
whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
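Whitening here amounts to standardization (zero mean, unit variance), fit on the training split only and then reused on the test split. A plain scikit-learn sketch of the two scaler options mentioned above, on placeholder matrices:

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X_train = np.random.rand(100, 5) * 10.0          ## placeholder numeric matrices
X_test = np.random.rand(40, 5) * 10.0

scaler = StandardScaler()                        ## or MinMaxScaler() for [0, 1] scaling
X_train_scaled = scaler.fit_transform(X_train)   ## fit the scaler on train only
X_test_scaled = scaler.transform(X_test)         ## reuse the train statistics on test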
In [84]:
## test - imputed features + removed noninformative + evenized skew + whiten
test_blender_model()
In [91]:
## find mutually redundant features
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)
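A common way to find mutually redundant features is pairwise correlation: if two columns are almost perfectly correlated, one of them can be dropped. A hedged sketch of that greedy rule with a hypothetical 0.98 cutoff (find_redundant_features may use a different criterion):

def redundant_sketch(df, threshold=0.98):
    ## flag the second feature of every highly correlated pair for removal
    corr = df.corr().abs()
    cols = corr.columns
    redundant = set()
    for i in range(len(cols)):
        if cols[i] in redundant:
            continue
        for j in range(i + 1, len(cols)):
            if corr.iloc[i, j] > threshold:
                redundant.add(cols[j])
    return sorted(redundant)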
In [92]:
## test - imputed features + removed noninformative + evenized skew + whiten + removed redundant
test_blender_model()
In [93]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)
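How a categorical feature becomes numeric is library-specific; one common option that fits the AUC ranking below is target-rate (response) encoding, where each category maps to the mean target rate seen on the training rows. A hedged sketch, not necessarily what numerize_categorical_features does:

def target_rate_encode(train_col, train_y, test_col):
    ## map each category to the mean target rate observed on the training split
    rates = train_y.groupby(train_col).mean()
    default = train_y.mean()                     ## fallback for unseen categories
    return test_col.map(rates).fillna(default)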
In [96]:
## find mutually redundant features again, now that categorical features have been numerized
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)
In [95]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features,
                                                 by = dsession.numerized_feature_auc_metric,
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break
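Ranking a single feature by AUC asks how well that feature alone orders the two classes; 0.5 means no signal, which is why the loop above stops once the score drops to 0.55. A minimal sketch of such a per-feature metric (the exact metric used by the Session is its own):

from sklearn.metrics import roc_auc_score

def feature_auc(col, y):
    ## AUC is symmetric around 0.5, so report whichever direction is stronger
    auc = roc_auc_score(y, col)
    return max(auc, 1.0 - auc)

print(feature_auc(data['f137'].fillna(0), data.loss))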
In [97]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                            if f not in numerized_features]
print len(original_numerical_feats)
In [98]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:100])
Out[98]:
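The density plots compare each numerical feature's distribution for the two classes; features whose class-conditional densities separate clearly are the interesting ones. A rough matplotlib sketch of one such panel, assuming the notebook's data frame and binary loss target (the helper name is hypothetical):

import matplotlib.pyplot as plt

def plot_density_sketch(df, feature, target='loss'):
    ## overlay the feature's distribution for each class to eyeball separation
    fig, ax = plt.subplots()
    df.loc[df[target] == 0, feature].dropna().plot(kind='density', ax=ax, label='loss = 0')
    df.loc[df[target] == 1, feature].dropna().plot(kind='density', ax=ax, label='loss = 1')
    ax.set_xlabel(feature)
    ax.legend()
    return ax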