Resources

  1. Kaggle "Predicting a Biological Response" competition main page (source of `data/predicting_biological_response/train.csv`)
  2. Published competition solutions and write-ups

In [1]:
## Auto-reload local modules so edits to munging/*.py are picked up without a kernel restart
%load_ext autoreload
%autoreload 2
## Render matplotlib figures inline in the notebook
%matplotlib inline

In [4]:
## Project-local data-munging helpers
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
## sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
## train_test_split now lives in sklearn.model_selection (same signature).
from sklearn.model_selection import train_test_split

In [28]:
## load train data
## "Activity" is the binary target; the remaining D1..D1776 columns are
## feature descriptors (3751 rows x 1777 columns total, per the cells below)
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)


Out[28]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1777 columns


In [29]:
## Hold out 30% of the rows as a pseudo-"submission" set for final evaluation;
## the remaining 70% stays in `data` for exploration and modeling.
## np.arange instead of range(): behaves identically here and also works as a
## positional indexer under Python 3 (where range() is not a list).
data_index, submission_index = train_test_split(np.arange(data.shape[0]), test_size = 0.3, random_state = 0)
submission_data = data.iloc[submission_index, :]
## NOTE(review): rebinding `data` makes this cell non-idempotent -- re-running
## it would re-split the already-reduced frame. Restart & Run All is safe.
data = data.iloc[data_index, :]

In [30]:
## Sanity-check the split sizes (70/30 of the 3751 rows)
print data.shape, submission_data.shape


(2625, 1777) (1126, 1777)

In [31]:
## create session for exploration
## Session wraps the frame with the munging utilities; test_frac presumably
## holds out a further 30% internally for metric estimation -- verify against
## munging.session. Target column is "Activity".
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []  ## collects fitted transformers so the pipeline can be replayed on submission_data
print dsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [32]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


942
834

In [33]:
## Know what you are dealing with: class balance of the binary target.
## pd.value_counts is deprecated (pandas >= 2.1); the Series method form is
## identical in every pandas version (counts sorted descending).
data["Activity"].value_counts()


Out[33]:
1    1405
0    1220
dtype: int64

In [34]:
## missing values
## List features containing NAs -- none here (empty list below), so the
## removal/imputation steps that would normally follow are no-ops.
na_feats = dsession.get_features_of(dsession.is_na_feature)
print na_feats


[]

In [16]:
## no need to remove heavy missing value features

In [17]:
## no need to impute missing values

In [35]:
## non-informative features and remove them
## 834 features are flagged (the count matches the categorical feature count
## above -- presumably the near-constant binary columns; verify). Drop them and
## keep the fitted remover for replay on submission_data.
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


834

In [36]:
## skewed features and evenize them
## Evenizing (presumably a log-type transform -- see the *_LOG feature names
## later) reduces the skewed count from 388 to 371; auto_remove drops the
## original columns. Keep the fitted evenizer for replay on submission_data.
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


388
371

In [37]:
## whiten features 
## and if the ranges of features are not too different, it may not be necessary
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
## (kept for reference: min-max scaling as the alternative to whitening)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)

In [21]:
## find mutually redundant features
## (highly correlated pairs per REDUNDANT_FEAT_CORR_THR = 0.95; returns the
## drop candidates -- here a single whitened log feature)
dsession.find_redundant_features()


Out[21]:
['D203_LOG_WHITE']

In [38]:
## numerize categorical features
## Encode the remaining categorical features as numeric (*_NUMERIZED columns
## per the ranking output below); auto_remove drops the categorical originals.
## Keep the fitted numerizer for replay on submission_data.
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)

In [23]:
## find redundant features again, as numerizing the categoricals may introduce some
dsession.find_redundant_features()


Out[23]:
['D195_LOG1_WHITE']

In [39]:
## rank numerized features
## Rank by AUC against target_value = 0 and print from the top down; stop
## once a score drops to <= 0.55 (that first below-threshold feature is
## still printed before the break).
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break


D27_0_NUMERIZED 0.736503093482
D1087_0_NUMERIZED 0.595845801795
D1036_0_NUMERIZED 0.595249271451
D1083_0_NUMERIZED 0.592641488522
D1061_0_NUMERIZED 0.590708469427
D1089_0_NUMERIZED 0.585238644735
D954_0_NUMERIZED 0.583778286295
D993_0_NUMERIZED 0.582823185798
D959_0_NUMERIZED 0.581251996584
D979_0_NUMERIZED 0.576329806307
D1143_0_NUMERIZED 0.573911087641
D995_0_NUMERIZED 0.572486586217
D1180_0_NUMERIZED 0.572007406104
D1109_0_NUMERIZED 0.570798046771
D992_0_NUMERIZED 0.570752410569
D1004_0_NUMERIZED 0.57021455534
D1169_0_NUMERIZED 0.569226857556
D1155_0_NUMERIZED 0.568689002327
D1002_0_NUMERIZED 0.568007719037
D1125_0_NUMERIZED 0.56800119958
D1150_0_NUMERIZED 0.567143890943
D988_0_NUMERIZED 0.566772281875
D1133_0_NUMERIZED 0.565712870061
D1338_0_NUMERIZED 0.565445572311
D996_0_NUMERIZED 0.563515812944
D1390_0_NUMERIZED 0.56255093326
D1193_0_NUMERIZED 0.561126431836
D1005_0_NUMERIZED 0.560569018235
D1341_0_NUMERIZED 0.560497304204
D1196_0_NUMERIZED 0.56011917568
D1281_0_NUMERIZED 0.558769648014
D1066_0_NUMERIZED 0.558596882396
D1190_0_NUMERIZED 0.557022433453
D1168_0_NUMERIZED 0.556892044306
D1106_0_NUMERIZED 0.556474799038
D1077_0_NUMERIZED 0.556197722102
D1128_0_NUMERIZED 0.555620750129
D1160_0_NUMERIZED 0.554998141955
D1113_0_NUMERIZED 0.554049560915
D1434_0_NUMERIZED 0.553397615183
D967_0_NUMERIZED 0.55335523871
D1164_0_NUMERIZED 0.552921694798
D1337_0_NUMERIZED 0.552784786194
D1445_0_NUMERIZED 0.552768487551
D1096_0_NUMERIZED 0.551839464883
D1403_0_NUMERIZED 0.551011493803
D1441_0_NUMERIZED 0.550828948998
D960_0_NUMERIZED 0.550467119117
D1174_0_NUMERIZED 0.548889410445

In [40]:
## Keep the 10 top-ranked numerized features
selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
## (kept for reference: an earlier hand-picked alternative selection)
#selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089', 
#                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print selected_numerized_feats


['D27_0_NUMERIZED', 'D1087_0_NUMERIZED', 'D1036_0_NUMERIZED', 'D1083_0_NUMERIZED', 'D1061_0_NUMERIZED', 'D1089_0_NUMERIZED', 'D954_0_NUMERIZED', 'D993_0_NUMERIZED', 'D959_0_NUMERIZED', 'D979_0_NUMERIZED']

In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


449

In [143]:
## Density plots for the first 60 originally-numerical features
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])


Out[143]:
<munging.session.Session at 0x7fdb73cdb250>

In [144]:
## Density plots for the next batch (features 60..259)
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])


Out[144]:
<munging.session.Session at 0x7fdb73cdb250>