In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
from munging import session
from munging import transform

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

In [28]:
## load train data
data = pd.read_csv("data/predicting_biological_response/train.csv")

Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1777 columns

In [29]:
data_index, submission_index = train_test_split(range(data.shape[0]), test_size = 0.3, random_state = 0)
submission_data = data.iloc[submission_index, :]
data = data.iloc[data_index, :]

In [30]:
print data.shape, submission_data.shape

(2625, 1777) (1126, 1777)

In [31]:
## create session for exploration
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []
print dsession.get_parameters()


In [32]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


In [33]:
## Know what you are dealing with

1    1405
0    1220
dtype: int64

In [34]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print na_feats


In [16]:
## no need to remove heavy missing value features

In [17]:
## no need to impute missing values

In [35]:
## non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)


In [36]:
## skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


In [37]:
## whiten features 
## and if the ranges of features are not too different, it may not be necessary
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
#scaler = dsession.minmax_scale_features(auto_remove=True)

In [21]:
## find mutual redundante features


In [38]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)

In [23]:
## find redudant features as the numerization of categorical may bring some


In [39]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break

D27_0_NUMERIZED 0.736503093482
D1087_0_NUMERIZED 0.595845801795
D1036_0_NUMERIZED 0.595249271451
D1083_0_NUMERIZED 0.592641488522
D1061_0_NUMERIZED 0.590708469427
D1089_0_NUMERIZED 0.585238644735
D954_0_NUMERIZED 0.583778286295
D993_0_NUMERIZED 0.582823185798
D959_0_NUMERIZED 0.581251996584
D979_0_NUMERIZED 0.576329806307
D1143_0_NUMERIZED 0.573911087641
D995_0_NUMERIZED 0.572486586217
D1180_0_NUMERIZED 0.572007406104
D1109_0_NUMERIZED 0.570798046771
D992_0_NUMERIZED 0.570752410569
D1004_0_NUMERIZED 0.57021455534
D1169_0_NUMERIZED 0.569226857556
D1155_0_NUMERIZED 0.568689002327
D1002_0_NUMERIZED 0.568007719037
D1125_0_NUMERIZED 0.56800119958
D1150_0_NUMERIZED 0.567143890943
D988_0_NUMERIZED 0.566772281875
D1133_0_NUMERIZED 0.565712870061
D1338_0_NUMERIZED 0.565445572311
D996_0_NUMERIZED 0.563515812944
D1390_0_NUMERIZED 0.56255093326
D1193_0_NUMERIZED 0.561126431836
D1005_0_NUMERIZED 0.560569018235
D1341_0_NUMERIZED 0.560497304204
D1196_0_NUMERIZED 0.56011917568
D1281_0_NUMERIZED 0.558769648014
D1066_0_NUMERIZED 0.558596882396
D1190_0_NUMERIZED 0.557022433453
D1168_0_NUMERIZED 0.556892044306
D1106_0_NUMERIZED 0.556474799038
D1077_0_NUMERIZED 0.556197722102
D1128_0_NUMERIZED 0.555620750129
D1160_0_NUMERIZED 0.554998141955
D1113_0_NUMERIZED 0.554049560915
D1434_0_NUMERIZED 0.553397615183
D967_0_NUMERIZED 0.55335523871
D1164_0_NUMERIZED 0.552921694798
D1337_0_NUMERIZED 0.552784786194
D1445_0_NUMERIZED 0.552768487551
D1096_0_NUMERIZED 0.551839464883
D1403_0_NUMERIZED 0.551011493803
D1441_0_NUMERIZED 0.550828948998
D960_0_NUMERIZED 0.550467119117
D1174_0_NUMERIZED 0.548889410445

In [40]:
selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
#selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089', 
#                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print selected_numerized_feats

['D27_0_NUMERIZED', 'D1087_0_NUMERIZED', 'D1036_0_NUMERIZED', 'D1083_0_NUMERIZED', 'D1061_0_NUMERIZED', 'D1089_0_NUMERIZED', 'D954_0_NUMERIZED', 'D993_0_NUMERIZED', 'D959_0_NUMERIZED', 'D979_0_NUMERIZED']

In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


In [143]:

<munging.session.Session at 0x7fdb73cdb250>

In [144]:

<munging.session.Session at 0x7fdb73cdb250>

In [145]:

<munging.session.Session at 0x7fdb73cdb250>

In [43]:
selected_numerical_feats = []
selected_numerical_feats += ["D%i_WHITE" % i for i in [6, 7, 10, 17, 46, 70, 126, 152, 177, 659]]
selected_numerical_feats += ["D%i_LOG1_WHITE" % i for i in [130, 75, 88, 911, 32, 47, 56 ]]
selected_numerical_feats += ["D%i_LOG_WHITE" % i for i in [5]]
print selected_numerical_feats

['D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D5_LOG_WHITE']

In [44]:
for f in selected_numerical_feats:
    print f, f in dsession.get_all_input_features()

D10_WHITE True
D17_WHITE True
D46_WHITE True
D70_WHITE True
D126_WHITE True
D152_WHITE True
D177_WHITE True
D659_WHITE True
D130_LOG1_WHITE True
D911_LOG1_WHITE True

In [45]:
seleted_feats = selected_numerized_feats + selected_numerical_feats
print seleted_feats
selected_train, selected_test = dsession.get_data(selected_features=seleted_feats)
print selected_train.shape, selected_test.shape
selected_train_X, selected_train_y = selected_train.iloc[:, :-1], selected_train.iloc[:, -1]
selected_test_X, selected_test_y = selected_test.iloc[:, :-1], selected_test.iloc[:, -1]

['D27_0_NUMERIZED', 'D1087_0_NUMERIZED', 'D1036_0_NUMERIZED', 'D1083_0_NUMERIZED', 'D1061_0_NUMERIZED', 'D1089_0_NUMERIZED', 'D954_0_NUMERIZED', 'D993_0_NUMERIZED', 'D959_0_NUMERIZED', 'D979_0_NUMERIZED', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D5_LOG_WHITE']
(1837, 29) (788, 29)

In [50]:
def logloss(ytrue, yhat):
    return -np.mean(np.log(np.where(ytrue == 1, yhat, 1-yhat)))

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
## modify the complexity to make train and test score close enough
tree = DecisionTreeClassifier(max_depth=3), selected_train_y)
print roc_auc_score(selected_train_y, tree.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, tree.predict_proba(selected_test_X)[:, 1])
print tree.score(selected_train_X, selected_train_y)
print tree.score(selected_test_X, selected_test_y)


In [55]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
## modify the complexity to make train and test score close enough
model = SVC(probability=True, kernel = "rbf", gamma = 0.05), selected_train_y)
print roc_auc_score(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, model.predict_proba(selected_test_X)[:, 1])
print model.score(selected_train_X, selected_train_y)
print model.score(selected_test_X, selected_test_y)
print logloss(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print logloss(selected_test_y, model.predict_proba(selected_test_X)[:, 1])


In [78]:
## load submission data
submission_data = pd.read_csv("data/predicting_biological_response/test.csv")

D1 D2 D3 D4 D5 D6 D7 D8 D9 D10 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 0.366667 0.611765 0.05 0 0.110435 0.803973 0.106075 0.473965 0.835617 0.106452 ... 1 1 1 1 0 1 0 0 1 0
1 0.100000 0.758175 0.30 0 0.180128 0.621378 0.287144 0.503919 0.674919 0.403616 ... 0 0 0 0 0 0 0 0 0 0
2 0.100000 0.658812 0.10 0 0.243421 0.640959 0.312765 0.279784 0.686775 0.280301 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1776 columns

In [79]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)

(2501, 1776)
         D1        D2    D3  D4        D5        D6        D7        D8  \
0  0.366667  0.611765  0.05   0  0.110435  0.803973  0.106075  0.473965   
1  0.100000  0.758175  0.30   0  0.180128  0.621378  0.287144  0.503919   
2  0.100000  0.658812  0.10   0  0.243421  0.640959  0.312765  0.279784   

         D9       D10    ...     D1767  D1768  D1769  D1770  D1771  D1772  \
0  0.835617  0.106452    ...         1      1      1      1      0      1   
1  0.674919  0.403616    ...         0      0      0      0      0      0   
2  0.686775  0.280301    ...         0      0      0      0      0      0   

   D1773  D1774  D1775  D1776  
0      0      0      1      0  
1      0      0      0      0  
2      0      0      0      0  

[3 rows x 1776 columns]

In [58]:
selected_submission = transformed_submission.loc[:, seleted_feats]
print selected_submission.shape

(2501, 31)

In [59]:
submission_y = model.predict_proba(selected_submission)[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
                           header = True, index = False)

In [28]:
## svm benchmark
#feats = ['D27', 'D1036', 'D995', 'D1087', 'D1061', 'D979', 'D1002', 'D1169', 'D996', 'D993', 
#         'D6', 'D7', 'D10', 'D17', 'D46', 'D70', 'D126', 'D152', 'D177', 'D659', 
#         'D130', 'D131', 'D75', 'D88', 'D911', 'D103', 'D32', 'D47', 'D56', 'D16', 'D5']
feats = submission_data.columns
X = data.iloc[:, 1:].loc[:, feats]
y = data.iloc[:, 0]
from sklearn.svm import SVC
svc = SVC(probability = True), y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
print roc_auc_score(y, svc.predict_proba(X)[:, 1])
print svc.score(X, y)


In [30]:
submission_yy = svc.predict_proba(submission_data.loc[:, feats])[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_yy
                           header = True, index = False)

In [31]:
logloss(y, svc.predict_proba(X)[:, 1])


In [32]:
from munging import model

In [33]:
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

blender = model.ModelBlender(feature_names = data.columns[1:], 
                             target_name = "Activity", 
                             models = models,
                             blender = LogisticRegression())

In [34]:

<munging.model.ModelBlender at 0x7fb1bb5528d0>

In [35]:
yhat = blender.predict(data)

In [36]:
plt.plot(y, yhat, ".")

[<matplotlib.lines.Line2D at 0x7fb1bb480f50>]

In [46]:
selected_features = seleted_feats
print selected_features
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

blender = dsession.blend_models(models = models, 
                                blender = LogisticRegression(), 
                                feature_names = selected_features)

['D27_0_NUMERIZED', 'D1087_0_NUMERIZED', 'D1036_0_NUMERIZED', 'D1083_0_NUMERIZED', 'D1061_0_NUMERIZED', 'D1089_0_NUMERIZED', 'D954_0_NUMERIZED', 'D993_0_NUMERIZED', 'D959_0_NUMERIZED', 'D979_0_NUMERIZED', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D5_LOG_WHITE']

In [47]:

array([[ 1.84255462,  1.65119793,  1.23298551,  0.40130147]])

In [51]:
from sklearn.metrics import roc_auc_score
train_matrix, test_matrix = dsession.get_data(selected_features)
trainyhat = blender.predict(train_matrix)
testyhat = blender.predict(test_matrix)

train_y, test_y = train_matrix.Activity, test_matrix.Activity

print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)

train auc: 1.0
train accuracy: 1.0
train logloss: 0.155293915676
test auc: 0.85355994967
test accuracy: 0.779187817259
test logloss: 0.47765519017

In [53]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)

submission_matrix = transformed_submission.loc[:, selected_features]
submission_y = transformed_submission.Activity
submission_yhat = blender.predict(submission_matrix)

print 'submission auc:', roc_auc_score(submission_y, submission_yhat)
print 'submission accuracy:', np.mean(submission_y == (submission_yhat>=0.5))
print 'submission logloss:', logloss(submission_y, submission_yhat)

(1126, 943)
   Activity  D2_WHITE  D6_WHITE  D7_WHITE  D8_WHITE  D9_WHITE  D10_WHITE  \
0         1 -0.000711 -0.976777  1.039241 -0.063431 -0.784041   0.622562   
1         1 -0.000711 -0.997865  1.054865  0.102651 -0.788162   0.624411   
2         0  0.038867 -0.977859  1.306818  0.706703 -0.445959   0.771932   

   D15_WHITE  D16_WHITE  D17_WHITE    ...      D1440_0_NUMERIZED  \
0   0.971653   0.178595  -0.767552    ...               0.455594   
1   0.971475   0.179414  -0.777047    ...               0.455594   
2   1.716197   1.010969  -0.787602    ...               0.455594   

   D1441_0_NUMERIZED  D1442_0_NUMERIZED  D1443_0_NUMERIZED  D1444_0_NUMERIZED  \
0           0.465247           0.459459           0.452613           0.446243   
1           0.465247           0.459459           0.452613           0.446243   
2           0.465247           0.459459           0.452613           0.446243   

   D1445_0_NUMERIZED  D1446_0_NUMERIZED  D1447_0_NUMERIZED  D1448_0_NUMERIZED  \
0            0.46683           0.457091           0.465978            0.46089   
1            0.46683           0.457091           0.465978            0.46089   
2            0.46683           0.457091           0.465978            0.46089   

0           0.456697  
1           0.456697  
2           0.456697  

[3 rows x 943 columns]
submission auc: 0.843659732641
submission accuracy: 0.768206039076
submission logloss: 0.487532549337

In [258]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)

(2501, 943)
   D27  D28  D51  D94  D952  D953  D954  D955  D956  D957  ...   \
0    0    0    1    1     1     0     0     1     1     0  ...    
1    1    1    0    1     0     0     1     0     0     1  ...    
2    0    1    0    1     1     0     0     0     0     1  ...    

   D558_LOG1_WHITE  D559_LOG1_WHITE  D556_LOG1_WHITE  D557_LOG1_WHITE  \
0         6.756420         5.597796         3.652312         7.913760   
1        -0.215938        -0.200025        -0.177183        -0.199862   
2        -0.215938        -0.200025        -0.177183        -0.199862   

   D554_LOG1_WHITE  D555_LOG1_WHITE  D223_LOG1_WHITE  D389_LOG1_WHITE  \
0         7.057772        -0.186079        -0.418874         4.006732   
1        -0.238913        -0.186079        -0.418874        -0.291488   
2        -0.238913        -0.186079        -0.418874        -0.291488   

   D388_LOG1_WHITE  D386_LOG1_WHITE  
0         2.141151        -0.242842  
1         4.296623        -0.242842  
2        -0.268462         3.833368  

[3 rows x 943 columns]

In [162]:
submission_y = blender.predict(transformed_submission.loc[:, selected_features])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
                           header = True, index = False)

Experimental Results

blender models are tree models

models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
  1. All features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.107362817617
    • test auc: 0.87296746996
    • test accuracy: 0.790408525755
    • test logloss: 0.445716523203
    • submission logloss: 0.41044
  2. Removed noninformative + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0980945362982
    • test auc: 0.864868063708
    • test accuracy: 0.794849023091
    • test logloss: 0.455761545362
    • submission logloss: 0.41474
  3. Removed noninformative + evenized skewed features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0973121344046
    • test auc: 0.864688928483
    • test accuracy: 0.793960923623
    • test logloss: 0.45576819434
    • submission logloss: 0.41373
  4. Removed noninformative + evenized skewed features + whitened features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.100104966799
    • test auc: 0.863962791055
    • test accuracy: 0.794849023091
    • test logloss: 0.456526266183
    • submission logloss: 0.41375
  5. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.106581348311
    • test auc: 0.876126068811
    • test accuracy: 0.801065719361
    • test logloss: 0.441049104199
    • submission logloss: 1.02392
  6. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0929543788381
    • test auc: 0.879660143526
    • test accuracy: 0.809058614565
    • test logloss: 0.434795974692
    • submission logloss: 1.02953
  7. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + selected numerical features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.13909023595
    • test auc: 0.855300157777
    • test accuracy: 0.781527531083
    • test logloss: 0.473516181104
    • submission logloss: 0.97276
  8. Removed noninformative + evenized skewed features + whitened features + selected categorical features based on their numerized values + selected numerical features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.135945958172
    • test auc: 0.85161205708
    • test accuracy: 0.775310834813
    • test logloss: 0.474685622346
    • submission logloss: 0.44639

Some observations:

1. there is a high risk to numerize categorical features, specially when there are many levels in the categorical feature, because what is observed in the training data (and even validation data) may not well apply to the new data (e.g., the submission data) - this can be seen by observing that the logloss, auc, and accuracy of train and test not change too much along different data transformation, but the submission accuracy drops a lot
2. feature selection based on numerized categorical, and numerical density, however, seems to improve the generality
3. considerting the blender model used here is a combination of tree-based models, it should be the most robust to the data transforms

As a trend in modern data mining activities, a lot of people prefer using an ensemble model to manual feature selection

In [170]:
from IPython.display import display_html
for f in ['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176']:
    print '='*20, f, '='*20

==================== D27 ====================
1    1953
0    1798
dtype: int64
1    1324
0    1177
dtype: int64
==================== D1036 ====================
1    1981
0    1770
dtype: int64
1    1318
0    1183
dtype: int64
==================== D1004 ====================
1    2069
0    1682
dtype: int64
1    1403
0    1098
dtype: int64
==================== D979 ====================
1    2378
0    1373
dtype: int64
1    1568
0     933
dtype: int64
==================== D1089 ====================
0    2403
1    1348
dtype: int64
0    1643
1     858
dtype: int64
==================== D1109 ====================
1    2112
0    1639
dtype: int64
1    1381
0    1120
dtype: int64
==================== D1125 ====================
0    2068
1    1683
dtype: int64
0    1421
1    1080
dtype: int64
==================== D1061 ====================
0    2805
1     946
dtype: int64
0    1819
1     682
dtype: int64
==================== D954 ====================
0    2509
1    1242
dtype: int64
0    1649
1     852
dtype: int64
==================== D1176 ====================
1    2173
0    1578
dtype: int64
1    1400
0    1101
dtype: int64

In [179]:

train_0 test_0 overall_0
0 0.518051 0.491489 0.510139
All 0.464762 0.441385 0.457745
1 0.425840 0.405488 0.419696

In [252]:
selected_features = seleted_feats
print selected_features

blender = SVC(probability = True)

['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D131_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D103_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D16_LOG1_WHITE', 'D5_LOG_WHITE']

In [253]:
train_matrix, test_matrix = dsession.get_data(selected_features)
train_y, test_y = train_matrix.Activity, test_matrix.Activity[:, :-1], train_y)

trainyhat = blender.predict_proba(train_matrix.iloc[:, :-1])[:, 1]
testyhat = blender.predict_proba(test_matrix.iloc[:, :-1])[:, 1]

print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)

train auc: 0.871670264279
train accuracy: 0.786285714286
train logloss: 0.466823652957
test auc: 0.826993759057
test accuracy: 0.766429840142
test logloss: 0.509194934026

In [254]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)

(2501, 943)
   D27  D28  D51  D94  D952  D953  D954  D955  D956  D957  ...   \
0    0    0    1    1     1     0     0     1     1     0  ...    
1    1    1    0    1     0     0     1     0     0     1  ...    
2    0    1    0    1     1     0     0     0     0     1  ...    

   D558_LOG1_WHITE  D559_LOG1_WHITE  D556_LOG1_WHITE  D557_LOG1_WHITE  \
0         6.756420         5.597796         3.652312         7.913760   
1        -0.215938        -0.200025        -0.177183        -0.199862   
2        -0.215938        -0.200025        -0.177183        -0.199862   

   D554_LOG1_WHITE  D555_LOG1_WHITE  D223_LOG1_WHITE  D389_LOG1_WHITE  \
0         7.057772        -0.186079        -0.418874         4.006732   
1        -0.238913        -0.186079        -0.418874        -0.291488   
2        -0.238913        -0.186079        -0.418874        -0.291488   

   D388_LOG1_WHITE  D386_LOG1_WHITE  
0         2.141151        -0.242842  
1         4.296623        -0.242842  
2        -0.268462         3.833368  

[3 rows x 943 columns]

In [255]:
submission_y = (blender.predict_proba(transformed_submission)[:, 1] 
                if selected_features is None 
                else blender.predict_proba(transformed_submission.loc[:, selected_features])[:, 1])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
                           header = True, index = False)

Experimental Results

blender models are SVM

  1. All features + blender model:
    • train auc: 0.833257394551
    • train accuracy: 0.758476190476
    • train logloss: 0.505849445063
    • test auc: 0.792279911584
    • test accuracy: 0.722912966252
    • test logloss: 0.547326593429
    • submission logloss: 0.52190
  2. Removed noninformative + blender model:
    • train auc: 0.857131439239
    • train accuracy: 0.776380952381
    • train logloss: 0.473122160467
    • test auc: 0.815580286169
    • test accuracy: 0.751332149201
    • test logloss: 0.521406446231
    • submission logloss: 0.49046
  3. Removed noninformative + evenized skewed features + blender model:
    • train auc: 0.856609299341
    • train accuracy: 0.775619047619
    • train logloss: 0.473663198953
    • test auc: 0.815685847997
    • test accuracy: 0.753108348135
    • test logloss: 0.520802184118
    • submission logloss: 0.48935
  4. Removed noninformative + evenized skewed features + whitened features + blender model:
    • train auc: 0.925573770492
    • train accuracy: 0.843428571429
    • train logloss: 0.366437909086
    • test auc: 0.83826648284
    • test accuracy: 0.771758436945
    • test logloss: 0.493848022056
    • submission logloss: 0.44420
  5. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + blender model:
    • train auc: 0.903969376425
    • train accuracy: 0.82780952381
    • train logloss: 0.404721858603
    • test auc: 0.851303249695
    • test accuracy: 0.789520426288
    • test logloss: 0.48051661294
    • submission logloss: 0.96709
  6. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + blender model:
    • train auc: 0.938893694115
    • train accuracy: 0.866285714286
    • train logloss: 0.335992067354
    • test auc: 0.857180120114
    • test accuracy: 0.790408525755
    • test logloss: 0.471006110469
    • submission logloss: 1.03103
  7. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + selected numerical features + blender model:
    • train auc: 0.85434048273
    • train accuracy: 0.789714285714
    • train logloss: 0.477136359629
    • test auc: 0.822459665106
    • test accuracy: 0.764653641208
    • test logloss: 0.516716042329
    • submission logloss: 0.93550
  8. Removed noninformative + evenized skewed features + whitened features + selected categorical features based on their numerized values + selected numerical features + blender model:
    • train auc: 0.871670264279
    • train accuracy: 0.786285714286
    • train logloss: 0.466823652957
    • test auc: 0.826993759057
    • test accuracy: 0.766429840142
    • test logloss: 0.509194934026
    • submission logloss: 0.48284

Some observations:

1. similiar pattern to the tree-based blender model
2. it seems that the power of the ensemble model is greater than manual featuer selection

