Resources

  1. main page
  2. solutions

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [78]:
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression




In [81]:
## helper function: blend several tree ensembles with a LogisticRegression meta-learner,
## then report AUC, accuracy and F1 on the train and test splits (0.55 probability cutoff)
def test_blender_model(selected_features=None):
    models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

    blender = dsession.blend_models(models=models,
                                    blender=LogisticRegression(),
                                    feature_names=selected_features)

    ## blended probability predictions on the train and test splits
    train_matrix, test_matrix = dsession.get_data(selected_features)
    trainyhat = blender.predict(train_matrix)
    testyhat = blender.predict(test_matrix)

    train_y, test_y = train_matrix.loss, test_matrix.loss

    print 'train auc:', roc_auc_score(train_y, trainyhat)
    print 'train accuracy:', np.mean(train_y == (trainyhat >= 0.55))
    print 'train f1:', f1_score(train_y, trainyhat >= 0.55)
    print 'test auc:', roc_auc_score(test_y, testyhat)
    print 'test accuracy:', np.mean(test_y == (testyhat >= 0.55))
    print 'test f1:', f1_score(test_y, testyhat >= 0.55)
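
The blender is a stacked ensemble: the four tree ensembles are base models and the LogisticRegression combines their predictions. blend_models belongs to the munging library, so its internals may differ, but the idea looks roughly like the following sketch (blend_sketch and blend_predict are hypothetical names, using only numpy and the sklearn models already imported):

def blend_sketch(models, blender, X, y, holdout_frac=0.3, seed=0):
    ## split rows into a part for the base models and a part for the meta-learner
    rng = np.random.RandomState(seed)
    mask = rng.rand(len(y)) < holdout_frac
    X_base, y_base = X[~mask], y[~mask]
    X_meta, y_meta = X[mask], y[mask]
    for m in models:
        m.fit(X_base, y_base)
    ## the base models' predicted probabilities become the meta-features
    meta_X = np.column_stack([m.predict_proba(X_meta)[:, 1] for m in models])
    blender.fit(meta_X, y_meta)
    return models, blender

def blend_predict(models, blender, X):
    meta_X = np.column_stack([m.predict_proba(X)[:, 1] for m in models])
    return blender.predict_proba(meta_X)[:, 1]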

In [67]:
## load train data
data = pd.read_csv("data/loan_default_prediction/train_v2.csv")
data = data.set_index("id", drop=True)
data.loss = np.where(data.loss == 0, 0, 1) ## make it a binary classification
## force these columns to a numerical dtype so they are treated as numerical features
actual_numers = ['f137','f138','f206','f207','f276','f277','f338','f390','f391','f419',
                 'f420','f466','f469','f472','f534','f537','f626','f627','f695','f698']
for f in actual_numers:
    data[f] = data[f].astype(np.float)
data.head(3)


Out[67]:
f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 ... f770 f771 f772 f773 f774 f775 f776 f777 f778 loss
id
1 126 10 0.686842 1100 3 13699 7201 4949 126.75 126.03 ... 5 2.14 -1.54 1.18 0.1833 0.7873 1 0 5 0
2 121 10 0.782776 1100 3 84645 240 1625 123.52 121.35 ... 6 0.54 -0.24 0.13 0.1926 -0.6787 1 0 5 0
3 126 10 0.500080 1100 3 83607 1800 1527 127.76 126.49 ... 13 2.89 -1.73 1.04 0.2521 0.7258 1 0 5 0

3 rows × 770 columns


In [68]:
## create a session for exploration
dsession = session.Session(data, target_feature = "loss", test_frac = 0.3, random_state=0)
transformers = []
print dsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [69]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


757
12

In [70]:
## Know what you are dealing with
pd.value_counts(data.loss)


Out[70]:
0    95688
1     9783
dtype: int64

In [71]:
## find features with missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print len(na_feats)


525

In [72]:
## find features that are heavy with missing values (none here, so nothing to remove)
heavyna_feats = dsession.get_features_of(dsession.is_na_heavy)
print heavyna_feats


[]
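
None of the 525 features with missing values crosses the FRAC_OF_NA_TO_IGNORE = 0.95 threshold, so nothing is dropped here. The check presumably amounts to something like this sketch (not the library's actual code):

## fraction of missing values per column; flag columns that are almost entirely NA
na_frac = data.isnull().mean()
heavy_na_guess = na_frac[na_frac >= 0.95].index.tolist()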

In [73]:
## impute missing values
imputer = dsession.impute_features(auto_remove=True)
transformers.append(imputer)
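
impute_features fills in the remaining missing values; the f422_IS_IMPUTED_0_NUMERIZED feature that appears later suggests it also adds indicator columns marking which rows were imputed. A rough pandas sketch of that idea (median for numerical columns, most frequent value otherwise; not the library's actual code):

## sketch only -- in practice the fill values should be computed on the training split
for col in list(data.columns):
    if data[col].isnull().any():
        data[col + "_IS_IMPUTED"] = data[col].isnull().astype(int)
        if np.issubdtype(data[col].dtype, np.number):
            data[col] = data[col].fillna(data[col].median())
        else:
            data[col] = data[col].fillna(data[col].mode()[0])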

In [74]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


495
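
Roughly half of the columns are judged non-informative. Given FRAC_OF_FEAT_TO_BE_NONINFORMATIVE = 0.96, a plausible reading is that a feature is dropped when a single value accounts for at least 96% of its rows; a sketch of such a check:

## illustrative near-constant check, not the library's actual criterion
def is_near_constant(col, thr=0.96):
    top_frac = col.value_counts(normalize=True, dropna=False).iloc[0]
    return top_frac >= thr

noninformative_guess = [f for f in data.columns if is_near_constant(data[f])]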

In [76]:
## find skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


620
231
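
The transform brings the number of skewed numerical features from 620 down to 231. For non-negative, heavily right-skewed columns the usual fix is a log1p transform; a sketch under that assumption (SKEWNESS_THR = 20 from the session parameters):

from scipy.stats import skew

## sketch: log1p-transform non-negative columns whose skewness exceeds the threshold
for col in skewed_feats:
    vals = data[col].dropna()
    if (vals >= 0).all() and skew(vals) > 20:
        data[col + "_LOG1P"] = np.log1p(data[col])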

In [82]:
## test - imputed features + removed noninformative + evenized skew 
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.677289680161
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [83]:
## whiten features
## if the ranges of the features are not too different, this step may not be necessary;
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
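
Whitening here can be read as standardizing each numerical column to zero mean and unit variance, with the commented-out min-max scaler mapping to [0, 1] instead. In plain sklearn the equivalent would be roughly:

from sklearn.preprocessing import StandardScaler

## sketch: standardize the numerical columns (fit on the training split in practice)
num_cols = data.select_dtypes(include=[np.number]).columns.drop("loss")
data[num_cols] = StandardScaler().fit_transform(data[num_cols])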

In [84]:
## test - imputed features + removed noninformative + evenized skew + whiten
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.676509920186
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [91]:
## find mutually redundant features
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)


402
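
find_redundant_features flags 402 columns, presumably those whose absolute correlation with an already-kept column is at least REDUNDANT_FEAT_CORR_THR = 0.95. A simple correlation-based sketch of that idea:

## sketch: keep the upper triangle of the correlation matrix so each pair is seen once,
## then flag the second member of every highly correlated pair
num_cols = data.select_dtypes(include=[np.number]).columns.drop("loss")
corr = data[num_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
redundant_guess = [c for c in upper.columns if (upper[c] >= 0.95).any()]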

In [92]:
## test - imputed features + removed noninformative + evenized skew + whiten + removed redundant
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.679251169965
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [93]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)
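
numerize_categorical_features replaces each categorical feature with a numerical one. One common choice for a binary target, and only a guess at what happens here, is target-rate encoding; the sketch below uses a hypothetical numerize helper:

## sketch: replace each category with the positive-class rate observed on the training split
def numerize(train_col, train_y, col):
    rates = train_y.groupby(train_col).mean()      ## positive-class rate per category
    return col.map(rates).fillna(train_y.mean())   ## unseen categories fall back to the prior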

In [96]:
## find mutually redundant features again
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)


29

In [95]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break


f422_IS_IMPUTED_0_NUMERIZED 0.538561648678
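
Only f422_IS_IMPUTED_0_NUMERIZED clears the 0.55 cutoff, so the categorical-derived features carry little standalone signal. The numerized_feature_auc_metric presumably scores each column on its own against the target, along the lines of:

## sketch: standalone AUC of a single numerized column against the binary target
def single_feature_auc(col, y):
    auc = roc_auc_score(y, col)
    return max(auc, 1.0 - auc)   ## direction-invariant, so 0.5 means no signal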

In [97]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


351

In [98]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:100])


Out[98]:
<munging.session.Session at 0x7f34915648d0>