Resources



In [1]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline



In [78]:

    
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression



In [81]:

    
## helper function
def test_blender_model(selected_features = None):
    models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

    blender = dsession.blend_models(models = models, 
                                blender = LogisticRegression(), 
                                feature_names = selected_features)
    
    train_matrix, test_matrix = dsession.get_data(selected_features)
    trainyhat = blender.predict(train_matrix)
    testyhat = blender.predict(test_matrix)

    train_y, test_y = train_matrix.loss, test_matrix.loss

    print 'train auc:', roc_auc_score(train_y, trainyhat)
    print 'train accuracy:', np.mean(train_y == (trainyhat>=0.55))
    print 'train f1:', f1_score(train_y, trainyhat>=0.55)
    print 'test auc:', roc_auc_score(test_y, testyhat)
    print 'test accuracy:', np.mean(test_y == (testyhat>=0.55))
    print 'test f1:', f1_score(test_y, testyhat>=0.55)



In [67]:

    
## load train data
data = pd.read_csv("data/loan_default_prediction/train_v2.csv")
data = data.set_index("id", drop=True)
data.loss = np.where(data.loss == 0, 0, 1) ## make it a binary classification
actual_numers = ['f137','f138','f206','f207','f276','f277','f338','f390','f391','f419',
 'f420','f466','f469','f472','f534','f537','f626','f627','f695','f698']
for f in actual_numers:
    data[f] = data[f].astype(np.float)
data.head(3)









    Out[67]:






  
    
      
      f1
      f2
      f3
      f4
      f5
      f6
      f7
      f8
      f9
      f10
      ...
      f770
      f771
      f772
      f773
      f774
      f775
      f776
      f777
      f778
      loss
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
       126
       10
       0.686842
       1100
       3
       13699
       7201
       4949
       126.75
       126.03
      ...
        5
       2.14
      -1.54
       1.18
       0.1833
       0.7873
       1
       0
       5
       0
    
    
      2
       121
       10
       0.782776
       1100
       3
       84645
        240
       1625
       123.52
       121.35
      ...
        6
       0.54
      -0.24
       0.13
       0.1926
      -0.6787
       1
       0
       5
       0
    
    
      3
       126
       10
       0.500080
       1100
       3
       83607
       1800
       1527
       127.76
       126.49
      ...
       13
       2.89
      -1.73
       1.04
       0.2521
       0.7258
       1
       0
       5
       0
    
  

3 rows × 770 columns



In [68]:

    
## create a session for exploration
dsession = session.Session(data, target_feature = "loss", test_frac = 0.3, random_state=0)
transformers = []
print dsession.get_parameters()









    



{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}



In [69]:

    
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)



In [70]:

    
## Know what you are dealing with
pd.value_counts(data.loss)









    Out[70]:





0    95688
1     9783
dtype: int64



In [71]:

    
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print len(na_feats)



In [72]:

    
## find and remove heavy missing values
heavyna_feats = dsession.get_features_of(dsession.is_na_heavy)
print heavyna_feats

[]



In [73]:

    
## impute missing values
imputer = dsession.impute_features(auto_remove=True)
transformers.append(imputer)



In [74]:

    
## non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)



In [76]:

    
## skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))



In [82]:

    
## test - imputed features + removed noninformative + evenized skew 
test_blender_model()









    



train auc: 1.0
train accuracy: 1.0
train logloss: 1.0
test auc: 0.677289680161
test accuracy: 0.905821376651
test logloss: 0.00134048257373



In [83]:

    
## whiten features 
## and if the ranges of features are not too different, it may not be necessary
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)



In [84]:

    
## test - imputed features + removed noninformative + evenized skew + whiten
test_blender_model()









    



train auc: 1.0
train accuracy: 1.0
train logloss: 1.0
test auc: 0.676509920186
test accuracy: 0.905821376651
test logloss: 0.00134048257373



In [91]:

    
## find mutual redundante features
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)



In [92]:

    
## test - imputed features + removed noninformative + evenized skew + whiten + removed redundant
test_blender_model()









    



train auc: 1.0
train accuracy: 1.0
train logloss: 1.0
test auc: 0.679251169965
test accuracy: 0.905821376651
test logloss: 0.00134048257373



In [93]:

    
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)



In [96]:

    
## find mutual redundante features again
redundant_feats = dsession.find_redundant_features()
print len(redundant_feats)
remover = dsession.remove_features(feature_names = redundant_feats)
transformers.append(remover)



In [95]:

    
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break









    



f422_IS_IMPUTED_0_NUMERIZED 0.538561648678



In [97]:

    
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)



In [98]:

    
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:100])









    Out[98]:





<munging.session.Session at 0x7f34915648d0>



In [100]:

    
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[100:200])









    Out[100]:





<munging.session.Session at 0x7f34915648d0>



In [101]:

    
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[200:])









    Out[101]:





<munging.session.Session at 0x7f34915648d0>



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

due to the design of this data - a lot of artifical noise, a lot of artifical features such as arithmetic of original features - the winning solutions are mostly based on ARTIFICAL feature learning



In [16]:

    
## load submission data
submission_data = pd.read_csv("data/loan_default_prediction/test_v2.csv")
submission_data = submission_data.set_index("id", drop = True)
submission_data.head(3)









    Out[16]:






  
    
      
      f1
      f2
      f3
      f4
      f5
      f6
      f7
      f8
      f9
      f10
      ...
      f769
      f770
      f771
      f772
      f773
      f774
      f775
      f776
      f777
      f778
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      105472
       147
       6
       0.487058
       1100
       17
       75506
        964
       12686
       152.63
       115.91
      ...
       -8.71
       19
       3.30
      -9.37
       0.50
       0.0539
      -1.0733
       0
       1
       1079
    
    
      105473
       156
       6
       0.728518
       5400
        3
       79754
       1455
        4803
       153.95
       155.50
      ...
      -13.26
       24
       9.53
      -7.55
       6.22
       0.3030
       0.6087
       0
       1
         36
    
    
      105474
       132
       9
       0.898133
       2200
       16
         113
       5735
        2387
       130.61
       131.96
      ...
       -4.99
        9
       3.25
      -2.33
       1.69
       0.2317
       0.4184
       0
       0
        393
    
  

3 rows × 769 columns



In [41]:

    
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)









    



(210944, 1282)
         f1  f2        f3    f4  f5     f6      f9     f10  f13       f16  \
id                                                                          
105472  147   6  0.487058  1100  17  75506  152.63  115.91   12  25503199   
105473  156   6  0.728518  5400   3  79754  153.95  155.50   12   1679764   
105474  132   9  0.898133  2200  16    113  130.61  131.96   13  15984334   

          ...     f79_IMPUTED  f79_IS_IMPUTED  f75_IMPUTED  f75_IS_IMPUTED  \
id        ...                                                                
105472    ...          148894           False         8.12           False   
105473    ...           85957           False         7.08           False   
105474    ...          299252           False         9.18           False   

        f76_IMPUTED  f76_IS_IMPUTED  f77_IMPUTED  f77_IS_IMPUTED  f72_IMPUTED  \
id                                                                              
105472         3.64           False         8.22           False     0.913735   
105473         4.79           False         6.12           False    -0.250000   
105474        11.88           False         6.03           False    -0.080000   

        f72_IS_IMPUTED  
id                      
105472            True  
105473           False  
105474           False  

[3 rows x 1282 columns]



In [ ]:

	f1	f2	f3	f4	f5	f6	f7	f8	f9	f10	...	f770	f771	f772	f773	f774	f775	f776	f777	f778	loss
id
1	126	10	0.686842	1100	3	13699	7201	4949	126.75	126.03	...	5	2.14	-1.54	1.18	0.1833	0.7873	1	0	5	0
2	121	10	0.782776	1100	3	84645	240	1625	123.52	121.35	...	6	0.54	-0.24	0.13	0.1926	-0.6787	1	0	5	0
3	126	10	0.500080	1100	3	83607	1800	1527	127.76	126.49	...	13	2.89	-1.73	1.04	0.2521	0.7258	1	0	5	0

	f1	f2	f3	f4	f5	f6	f7	f8	f9	f10	...	f769	f770	f771	f772	f773	f774	f775	f776	f777	f778
id
105472	147	6	0.487058	1100	17	75506	964	12686	152.63	115.91	...	-8.71	19	3.30	-9.37	0.50	0.0539	-1.0733	0	1	1079
105473	156	6	0.728518	5400	3	79754	1455	4803	153.95	155.50	...	-13.26	24	9.53	-7.55	6.22	0.3030	0.6087	0	1	36
105474	132	9	0.898133	2200	16	113	5735	2387	130.61	131.96	...	-4.99	9	3.25	-2.33	1.69	0.2317	0.4184	0	0	393