Resources

  1. competition main page
  2. winning solutions

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [78]:
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression




In [81]:
## helper function: blend tree ensembles with a logistic-regression blender
## and report train/test AUC, accuracy and F1 (0.55 probability threshold)
def test_blender_model(selected_features=None):
    models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
              ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

    blender = dsession.blend_models(models=models,
                                    blender=LogisticRegression(),
                                    feature_names=selected_features)

    train_matrix, test_matrix = dsession.get_data(selected_features)
    trainyhat = blender.predict(train_matrix)
    testyhat = blender.predict(test_matrix)

    train_y, test_y = train_matrix.loss, test_matrix.loss

    print 'train auc:', roc_auc_score(train_y, trainyhat)
    print 'train accuracy:', np.mean(train_y == (trainyhat >= 0.55))
    print 'train f1:', f1_score(train_y, trainyhat >= 0.55)
    print 'test auc:', roc_auc_score(test_y, testyhat)
    print 'test accuracy:', np.mean(test_y == (testyhat >= 0.55))
    print 'test f1:', f1_score(test_y, testyhat >= 0.55)
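
For orientation, here is a minimal sketch of what a blender of this kind typically does, assuming blend_models fits each base model and then trains the logistic-regression blender on their stacked class probabilities (an assumption about the munging internals, not their actual code):

## blending sketch (assumed behaviour of blend_models, for illustration only):
## fit each base model, stack their positive-class probabilities as
## meta-features, then fit the blender on that meta-feature matrix
def blend_sketch(models, blender, X_train, y_train):
    preds = []
    for model in models:
        model.fit(X_train, y_train)
        preds.append(model.predict_proba(X_train)[:, 1])
    blender.fit(np.column_stack(preds), y_train)
    return blender

In a more careful setup the blender would be trained on out-of-fold predictions to avoid leakage, which is also why the train scores below come out as a perfect 1.0.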

In [67]:
## load train data
data = pd.read_csv("data/loan_default_prediction/train_v2.csv")
data = data.set_index("id", drop=True)
data.loss = np.where(data.loss == 0, 0, 1) ## make it a binary classification
actual_numerics = ['f137','f138','f206','f207','f276','f277','f338','f390','f391','f419',
 'f420','f466','f469','f472','f534','f537','f626','f627','f695','f698']
for f in actual_numerics:
    data[f] = data[f].astype(np.float)
data.head(3)


Out[67]:
f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 ... f770 f771 f772 f773 f774 f775 f776 f777 f778 loss
id
1 126 10 0.686842 1100 3 13699 7201 4949 126.75 126.03 ... 5 2.14 -1.54 1.18 0.1833 0.7873 1 0 5 0
2 121 10 0.782776 1100 3 84645 240 1625 123.52 121.35 ... 6 0.54 -0.24 0.13 0.1926 -0.6787 1 0 5 0
3 126 10 0.500080 1100 3 83607 1800 1527 127.76 126.49 ... 13 2.89 -1.73 1.04 0.2521 0.7258 1 0 5 0

3 rows × 770 columns


In [68]:
## create a session for exploration
dsession = session.Session(data, target_feature = "loss", test_frac = 0.3, random_state=0)
transformers = []
print dsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [69]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


757
12

In [70]:
## Know what you are dealing with
pd.value_counts(data.loss)


Out[70]:
0    95688
1     9783
dtype: int64
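
With 95688 negatives against 9783 positives, the classes are heavily imbalanced: a majority-class baseline that predicts 0 for everyone is already right about 90.7% of the time, so raw accuracy is a weak yardstick on this data.

## accuracy of always predicting the majority class
print 95688.0 / (95688 + 9783)   ## ~0.9072, close to the test accuracies below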

In [71]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print len(na_feats)


525

In [72]:
## find NA-heavy features that should be removed
heavyna_feats = dsession.get_features_of(dsession.is_na_heavy)
print heavyna_feats


[]

In [73]:
## impute missing values
imputer = dsession.impute_features(auto_remove=True)
transformers.append(imputer)
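
Judging from the *_IMPUTED / *_IS_IMPUTED columns that appear in the transformed data further down, the imputer fills each NA feature and records an indicator flag alongside it. A rough pandas equivalent for a single feature, shown here with a median fill (a sketch, not the actual munging implementation; the real fill value may differ):

## per-feature imputation with an indicator flag (assumed behaviour)
def impute_with_flag(df, feat):
    df[feat + '_IS_IMPUTED'] = df[feat].isnull()
    df[feat + '_IMPUTED'] = df[feat].fillna(df[feat].median())
    return df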

In [74]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


495
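
Given FRAC_OF_FEAT_TO_BE_NONINFORMATIVE = 0.96 in the session parameters, a feature is presumably flagged as non-informative when a single value accounts for at least 96% of its rows. A sketch of that test:

## a feature is (presumably) non-informative when one value dominates it
def is_noninformative_sketch(series, thr=0.96):
    return series.value_counts(normalize=True, dropna=False).iloc[0] >= thr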

In [76]:
## find skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


620
231
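
The evenizing transform itself is not shown here; a common way to even out heavily right-skewed, non-negative features is a log1p transform, which is one plausible reading of what evenize_skew_features applies (SKEWNESS_THR = 20 above suggests only extreme skews are touched). A sketch under that assumption:

## log1p as one common skew-evenizing transform for non-negative features
## (an assumption about evenize_skew_features, not its actual code)
def evenize_sketch(df, feat):
    df[feat + '_LOG'] = np.log1p(df[feat])
    return df

Note that 231 features still test as skewed afterwards: a single monotone transform cannot fix every distribution shape.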

In [82]:
## test - imputed features + removed noninformative + evenized skew 
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.677289680161
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [83]:
## whiten features
## if the ranges of the features are already similar, whitening may not be necessary;
## alternatively, use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
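
For reference, whitening here presumably standardizes each feature to zero mean and unit variance, while the min-max alternative rescales into [0, 1]; the sklearn equivalents look like this (an assumption about what the session methods wrap):

## sklearn equivalents of the two scalings (for illustration only)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
X = np.random.randn(100, 5) * 10 + 3           ## dummy feature matrix
whitened = StandardScaler().fit_transform(X)   ## zero mean, unit variance
minmaxed = MinMaxScaler().fit_transform(X)     ## rescaled into [0, 1]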

In [84]:
## test - imputed features + removed noninformative + evenized skew + whiten
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.676509920186
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [91]:
## find mutually redundant features
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)


402
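
Given REDUNDANT_FEAT_CORR_THR = 0.95 in the session parameters, redundancy is presumably judged by pairwise correlation. A simple greedy version of that idea (a sketch, not the actual find_redundant_features code):

## greedy pairwise-correlation filter: keep the first feature of each
## highly-correlated group, drop its near-copies
def find_redundant_sketch(df, feats, thr=0.95):
    corr = df[feats].corr().abs()
    redundant = set()
    for i, f1 in enumerate(feats):
        if f1 in redundant:
            continue
        for f2 in feats[i+1:]:
            if corr.loc[f1, f2] >= thr:
                redundant.add(f2)
    return list(redundant)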

In [92]:
## test - imputed features + removed noninformative + evenized skew + whiten + removed redundant
test_blender_model()


train auc: 1.0
train accuracy: 1.0
train f1: 1.0
test auc: 0.679251169965
test accuracy: 0.905821376651
test f1: 0.00134048257373

In [93]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)
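
The _NUMERIZED column names that appear below (e.g. f422_IS_IMPUTED_0_NUMERIZED) suggest that each categorical level becomes its own numerical column; one common choice is the level's target response rate. A sketch under that assumption (not the actual numerize_categorical_features code):

## per-level response-rate encoding (assumed; one numerized column per level)
def numerize_sketch(df, feat, target='loss'):
    for level in df[feat].unique():
        rate = df.loc[df[feat] == level, target].mean()
        df['%s_%s_NUMERIZED' % (feat, level)] = (df[feat] == level) * rate
    return df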

In [96]:
## find mutually redundant features again
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)


29

In [95]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break


f422_IS_IMPUTED_0_NUMERIZED 0.538561648678
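
Ranking a numerized feature by AUC amounts to asking how well that single column alone orders the two classes; 0.5 means uninformative, which is why the loop stops once the score drops to 0.55. Roughly (a sketch of the assumed metric):

## AUC of a single feature against the target, direction-agnostic
def feature_auc_sketch(df, feat, target='loss'):
    auc = roc_auc_score(df[target], df[feat])
    return max(auc, 1.0 - auc)   ## 0.5 = no signal, 1.0 = perfect ordering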

In [97]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


351

In [98]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:100])


Out[98]:
<munging.session.Session at 0x7f34915648d0>

In [100]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[100:200])


Out[100]:
<munging.session.Session at 0x7f34915648d0>

In [101]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[200:])


Out[101]:
<munging.session.Session at 0x7f34915648d0>

Due to the design of this dataset - a lot of artificial noise, and many artificial features built as arithmetic combinations of the original features - the winning solutions are mostly based on ARTIFICIAL feature learning.


In [16]:
## load submission data
submission_data = pd.read_csv("data/loan_default_prediction/test_v2.csv")
submission_data = submission_data.set_index("id", drop = True)
submission_data.head(3)


Out[16]:
f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 ... f769 f770 f771 f772 f773 f774 f775 f776 f777 f778
id
105472 147 6 0.487058 1100 17 75506 964 12686 152.63 115.91 ... -8.71 19 3.30 -9.37 0.50 0.0539 -1.0733 0 1 1079
105473 156 6 0.728518 5400 3 79754 1455 4803 153.95 155.50 ... -13.26 24 9.53 -7.55 6.22 0.3030 0.6087 0 1 36
105474 132 9 0.898133 2200 16 113 5735 2387 130.61 131.96 ... -4.99 9 3.25 -2.33 1.69 0.2317 0.4184 0 0 393

3 rows × 769 columns


In [41]:
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)


(210944, 1282)
         f1  f2        f3    f4  f5     f6      f9     f10  f13       f16  \
id                                                                          
105472  147   6  0.487058  1100  17  75506  152.63  115.91   12  25503199   
105473  156   6  0.728518  5400   3  79754  153.95  155.50   12   1679764   
105474  132   9  0.898133  2200  16    113  130.61  131.96   13  15984334   

          ...     f79_IMPUTED  f79_IS_IMPUTED  f75_IMPUTED  f75_IS_IMPUTED  \
id        ...                                                                
105472    ...          148894           False         8.12           False   
105473    ...           85957           False         7.08           False   
105474    ...          299252           False         9.18           False   

        f76_IMPUTED  f76_IS_IMPUTED  f77_IMPUTED  f77_IS_IMPUTED  f72_IMPUTED  \
id                                                                              
105472         3.64           False         8.22           False     0.913735   
105473         4.79           False         6.12           False    -0.250000   
105474        11.88           False         6.03           False    -0.080000   

        f72_IS_IMPUTED  
id                      
105472            True  
105473           False  
105474           False  

[3 rows x 1282 columns]
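
The combiner presumably just replays the fitted transformers, in order, on the new data; the pattern is simple function composition (a sketch, not the munging implementation):

## replay a list of fitted transformers on fresh data, in order
def combine_transforms_sketch(transformers):
    def apply_all(df):
        for t in transformers:
            df = t.transform(df)
        return df
    return apply_all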
