Resources

  1. main page
  2. solutions

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [60]:
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [189]:
## load train data
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)


Out[189]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1777 columns


In [238]:
## create session for exploration
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []
print dsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [239]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


942
834

In [240]:
## know what you are dealing with: the target class distribution
pd.value_counts(data.Activity)


Out[240]:
1    2034
0    1717
dtype: int64

In [241]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print na_feats


[]

In [242]:
## no features are dominated by missing values, so there is nothing to remove

In [243]:
## no need to impute missing values

In [244]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


833

In [245]:
## find skewed numerical features and evenize (de-skew) them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


396
380

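The _LOG1/_LOG suffixes that appear in feature names later suggest that evenizing applies log-style transforms. A stand-alone sketch of the idea (not necessarily what evenize_skew_features does internally):

In [ ]:
## illustrative only: measure skewness before and after a log1p transform on one column
from scipy.stats import skew

col = data["D5"]               # D5 later shows up as D5_LOG_WHITE, so it was treated as skewed
print skew(col)                # skewness of the raw column
print skew(np.log1p(col))      # log1p typically pulls in a long right tail
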
In [246]:
## whiten features
## if the ranges of the features are already similar, this step may not be necessary
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)

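For comparison, here are the two scaling options above in plain scikit-learn; the session's whiten_features / minmax_scale_features presumably wrap something similar, though that is an assumption:

In [ ]:
## illustrative only: per-feature standardization ("whitening" here) vs. min-max scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

sample = data[["D5", "D6"]].values
print StandardScaler().fit_transform(sample)[:3]   # zero mean, unit variance per column
print MinMaxScaler().fit_transform(sample)[:3]     # rescaled into [0, 1] per column
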
In [247]:
## find mutually redundant features
dsession.find_redundant_features()


Out[247]:
[]

In [218]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)

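The exact scheme behind numerize_categorical_features is not shown in this notebook; one common scheme it may resemble is target (mean) encoding, sketched below purely as an illustration (an assumption, not the library's implementation):

In [ ]:
## hypothetical target (mean) encoding of one binary categorical column (illustration only)
col, target = "D27", "Activity"
encoding = data.groupby(col)[target].mean()   # mean Activity per level of D27, learned on this data
print encoding
print data[col].map(encoding).head()          # each level replaced by its learned mean
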
In [219]:
## find redundant features again, since numerizing the categorical features may introduce some
dsession.find_redundant_features()


Out[219]:
[]

In [224]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break


D27_0_NUMERIZED 0.745680221906
D1036_0_NUMERIZED 0.614941724349
D1004_0_NUMERIZED 0.599488497557
D979_0_NUMERIZED 0.587629784202
D1089_0_NUMERIZED 0.58216803237
D1109_0_NUMERIZED 0.579139734324
D1125_0_NUMERIZED 0.576009644748
D1061_0_NUMERIZED 0.573824307818
D954_0_NUMERIZED 0.571689866653
D1176_0_NUMERIZED 0.5716580568
D1087_0_NUMERIZED 0.570242518322
D1133_0_NUMERIZED 0.56965721702
D1143_0_NUMERIZED 0.568833341816
D993_0_NUMERIZED 0.567907675081
D1169_0_NUMERIZED 0.567796340594
D1168_0_NUMERIZED 0.567500508958
D996_0_NUMERIZED 0.566924750611
D1002_0_NUMERIZED 0.566698900651
D1106_0_NUMERIZED 0.564071406759
D1160_0_NUMERIZED 0.563944167345
D981_0_NUMERIZED 0.563606982899
D959_0_NUMERIZED 0.563400218852
D1083_0_NUMERIZED 0.562309140879
D1066_0_NUMERIZED 0.561809726181
D1281_0_NUMERIZED 0.561580695236
D1164_0_NUMERIZED 0.561478903705
D1163_0_NUMERIZED 0.560985850977
D1196_0_NUMERIZED 0.560467350366
D1180_0_NUMERIZED 0.559159965391
D1309_0_NUMERIZED 0.557184573493
D978_0_NUMERIZED 0.555788120928
D967_0_NUMERIZED 0.555050132329
D961_0_NUMERIZED 0.554741576751
D1403_0_NUMERIZED 0.554452107085
D1041_0_NUMERIZED 0.553828633958
D1059_0_NUMERIZED 0.553520078379
D1190_0_NUMERIZED 0.552756641897
D964_0_NUMERIZED 0.552559420806
D1285_0_NUMERIZED 0.552336751832
D51_0_NUMERIZED 0.551691011808
D1338_0_NUMERIZED 0.549063517915

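Whatever numerized_feature_auc_metric computes exactly, the ranking above can be read as "how well does this single column separate the two classes on its own". A minimal sketch of that idea with plain roc_auc_score (an approximation of the metric, not its definition):

In [ ]:
## illustrative only: score single columns by their stand-alone ROC AUC against the target
from sklearn.metrics import roc_auc_score

y_all = data["Activity"]
for f in ["D27", "D1036", "D1004"]:
    auc = roc_auc_score(y_all, data[f])
    print f, max(auc, 1 - auc)    # orientation-free; 0.5 means uninformative
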
In [248]:
#selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089', 
                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print selected_numerized_feats


['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176']

In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


449

In [143]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])


Out[143]:
<munging.session.Session at 0x7fdb73cdb250>

In [144]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])


Out[144]:
<munging.session.Session at 0x7fdb73cdb250>

In [145]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[260:])


Out[145]:
<munging.session.Session at 0x7fdb73cdb250>

In [250]:
selected_numerical_feats = []
selected_numerical_feats += ["D%i_WHITE" % i for i in [6, 7, 10, 17, 46, 70, 126, 152, 177, 659]]
selected_numerical_feats += ["D%i_LOG1_WHITE" % i for i in [130, 131, 75, 88, 911, 103, 32, 47, 56, 16]]
selected_numerical_feats += ["D%i_LOG_WHITE" % i for i in [5]]
print selected_numerical_feats


['D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D131_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D103_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D16_LOG1_WHITE', 'D5_LOG_WHITE']

In [251]:
selected_feats = selected_numerized_feats + selected_numerical_feats
print selected_feats
selected_train, selected_test = dsession.get_data(selected_features=selected_feats)
print selected_train.shape, selected_test.shape
selected_train_X, selected_train_y = selected_train.iloc[:, :-1], selected_train.iloc[:, -1]
selected_test_X, selected_test_y = selected_test.iloc[:, :-1], selected_test.iloc[:, -1]


['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D131_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D103_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D16_LOG1_WHITE', 'D5_LOG_WHITE']
(2625, 32) (1126, 32)

In [53]:
def logloss(ytrue, yhat):
    return -np.mean(np.log(np.where(ytrue == 1, yhat, 1-yhat)))

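As a quick sanity check, the hand-rolled logloss above can be compared against scikit-learn's log_loss on a small made-up example:

In [ ]:
## sanity check: the custom logloss should agree with sklearn's log_loss
from sklearn.metrics import log_loss

yt = np.array([1, 0, 1, 1, 0])
yp = np.array([0.9, 0.2, 0.7, 0.6, 0.4])
print logloss(yt, yp)
print log_loss(yt, yp)    # both compute -mean(log of the probability assigned to the true class)
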
In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
## tune the model complexity so that the train and test scores stay close
tree = DecisionTreeClassifier(max_depth=3) 
tree.fit(selected_train_X, selected_train_y)
print roc_auc_score(selected_train_y, tree.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, tree.predict_proba(selected_test_X)[:, 1])
print tree.score(selected_train_X, selected_train_y)
print tree.score(selected_test_X, selected_test_y)


0.803888101967
0.765287990196
0.766476190476
0.735346358792

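The note above about keeping the train and test scores close can be made concrete with a small scan over max_depth, watching where the two AUC values start to diverge (illustrative; it reuses the selected_* matrices defined earlier):

In [ ]:
## illustrative: scan the tree depth and compare train vs. test AUC to spot overfitting
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

for depth in [2, 3, 5, 8, 12]:
    t = DecisionTreeClassifier(max_depth=depth).fit(selected_train_X, selected_train_y)
    train_auc = roc_auc_score(selected_train_y, t.predict_proba(selected_train_X)[:, 1])
    test_auc = roc_auc_score(selected_test_y, t.predict_proba(selected_test_X)[:, 1])
    print depth, train_auc, test_auc
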
In [55]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
## tune the model complexity so that the train and test scores stay close
model = SVC(probability=True, kernel = "rbf", gamma = 0.05) 
model.fit(selected_train_X, selected_train_y)
print roc_auc_score(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, model.predict_proba(selected_test_X)[:, 1])
print model.score(selected_train_X, selected_train_y)
print model.score(selected_test_X, selected_test_y)
print logloss(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print logloss(selected_test_y, model.predict_proba(selected_test_X)[:, 1])


0.882925072046
0.807723813209
0.810285714286
0.747779751332
0.436796220876
0.533893538368

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [78]:
## load submission data
submission_data = pd.read_csv("data/predicting_biological_response/test.csv")
submission_data.head(3)


Out[78]:
D1 D2 D3 D4 D5 D6 D7 D8 D9 D10 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 0.366667 0.611765 0.05 0 0.110435 0.803973 0.106075 0.473965 0.835617 0.106452 ... 1 1 1 1 0 1 0 0 1 0
1 0.100000 0.758175 0.30 0 0.180128 0.621378 0.287144 0.503919 0.674919 0.403616 ... 0 0 0 0 0 0 0 0 0 0
2 0.100000 0.658812 0.10 0 0.243421 0.640959 0.312765 0.279784 0.686775 0.280301 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1776 columns


In [79]:
## combine the accumulated transform steps and apply them to the submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)


(2501, 1776)
         D1        D2    D3  D4        D5        D6        D7        D8  \
0  0.366667  0.611765  0.05   0  0.110435  0.803973  0.106075  0.473965   
1  0.100000  0.758175  0.30   0  0.180128  0.621378  0.287144  0.503919   
2  0.100000  0.658812  0.10   0  0.243421  0.640959  0.312765  0.279784   

         D9       D10    ...     D1767  D1768  D1769  D1770  D1771  D1772  \
0  0.835617  0.106452    ...         1      1      1      1      0      1   
1  0.674919  0.403616    ...         0      0      0      0      0      0   
2  0.686775  0.280301    ...         0      0      0      0      0      0   

   D1773  D1774  D1775  D1776  
0      0      0      1      0  
1      0      0      0      0  
2      0      0      0      0  

[3 rows x 1776 columns]

In [58]:
selected_submission = transformed_submission.loc[:, selected_feats]
print selected_submission.shape


(2501, 31)

In [59]:
submission_y = model.predict_proba(selected_submission)[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
submission_solution.to_csv("data/predicting_biological_response/submission1.csv", 
                           header = True, index = False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [28]:
## svm benchmark
#feats = ['D27', 'D1036', 'D995', 'D1087', 'D1061', 'D979', 'D1002', 'D1169', 'D996', 'D993', 
#         'D6', 'D7', 'D10', 'D17', 'D46', 'D70', 'D126', 'D152', 'D177', 'D659', 
#         'D130', 'D131', 'D75', 'D88', 'D911', 'D103', 'D32', 'D47', 'D56', 'D16', 'D5']
feats = submission_data.columns
X = data.iloc[:, 1:].loc[:, feats]
y = data.iloc[:, 0]
from sklearn.svm import SVC
svc = SVC(probability = True)
svc.fit(X, y)


Out[28]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
print roc_auc_score(y, svc.predict_proba(X)[:, 1])
print svc.score(X, y)


0.841063739378
0.763529725407

In [30]:
submission_yy = svc.predict_proba(submission_data.loc[:, feats])[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_yy
submission_solution.to_csv("data/predicting_biological_response/benchmark_submission.csv", 
                           header = True, index = False)

In [31]:
logloss(y, svc.predict_proba(X)[:, 1])


Out[31]:
0.49333113650297228

In [ ]:


In [ ]:


In [32]:
from munging import model

In [33]:
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

blender = model.ModelBlender(feature_names = data.columns[1:], 
                             target_name = "Activity", 
                             models = models,
                             blender = LogisticRegression())

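ModelBlender comes from the accompanying munging package; its internals are not shown here, but the four coefficients printed by blender.blender.coef_ further down suggest the usual stacking recipe: fit the base models, then fit the blender on their predicted probabilities. A generic sketch of that recipe in plain scikit-learn (an assumed scheme for illustration, not ModelBlender's actual implementation):

In [ ]:
## generic stacking sketch (assumed scheme, not ModelBlender's internals):
## base models -> per-model P(class = 1) -> logistic regression on top
from sklearn.linear_model import LogisticRegression

X, y = data.iloc[:, 1:], data["Activity"]
base_models = [m.fit(X, y) for m in models]    # reuse the models list defined above
meta_X = np.column_stack([m.predict_proba(X)[:, 1] for m in base_models])
meta = LogisticRegression().fit(meta_X, y)
print meta.coef_    # one weight per base model
## note: proper stacking would use out-of-fold predictions here to avoid leaking the target
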
In [34]:
blender.fit(data)


Out[34]:
<munging.model.ModelBlender at 0x7fb1bb5528d0>

In [35]:
yhat = blender.predict(data)

In [36]:
plt.plot(y, yhat, ".")


Out[36]:
[<matplotlib.lines.Line2D at 0x7fb1bb480f50>]

In [ ]:


In [256]:
selected_features = selected_feats
print selected_features
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]

blender = dsession.blend_models(models = models, 
                                blender = LogisticRegression(), 
                                feature_names = selected_features)


['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D131_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D103_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D16_LOG1_WHITE', 'D5_LOG_WHITE']

In [260]:
blender.blender.coef_


Out[260]:
array([[ 1.88251145,  1.50094923,  0.65155139,  1.45427915]])

In [257]:
train_matrix, test_matrix = dsession.get_data(selected_features)
trainyhat = blender.predict(train_matrix)
testyhat = blender.predict(test_matrix)

train_y, test_y = train_matrix.Activity, test_matrix.Activity

print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)


train auc: 1.0
train accuracy: 1.0
train logloss: 0.131399582994
test auc: 0.852990758542
test accuracy: 0.779751332149
test logloss: 0.472583482101

In [258]:
## combine the accumulated transform steps and apply them to the submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)


(2501, 943)
   D27  D28  D51  D94  D952  D953  D954  D955  D956  D957  ...   \
0    0    0    1    1     1     0     0     1     1     0  ...    
1    1    1    0    1     0     0     1     0     0     1  ...    
2    0    1    0    1     1     0     0     0     0     1  ...    

   D558_LOG1_WHITE  D559_LOG1_WHITE  D556_LOG1_WHITE  D557_LOG1_WHITE  \
0         6.756420         5.597796         3.652312         7.913760   
1        -0.215938        -0.200025        -0.177183        -0.199862   
2        -0.215938        -0.200025        -0.177183        -0.199862   

   D554_LOG1_WHITE  D555_LOG1_WHITE  D223_LOG1_WHITE  D389_LOG1_WHITE  \
0         7.057772        -0.186079        -0.418874         4.006732   
1        -0.238913        -0.186079        -0.418874        -0.291488   
2        -0.238913        -0.186079        -0.418874        -0.291488   

   D388_LOG1_WHITE  D386_LOG1_WHITE  
0         2.141151        -0.242842  
1         4.296623        -0.242842  
2        -0.268462         3.833368  

[3 rows x 943 columns]

In [162]:
submission_y = blender.predict(transformed_submission.loc[:, selected_features])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
submission_solution.to_csv("data/predicting_biological_response/submission2.csv", 
                           header = True, index = False)

Experimental Results

The base models fed to the blender are tree ensembles:

models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
  1. All features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.107362817617
    • test auc: 0.87296746996
    • test accuracy: 0.790408525755
    • test logloss: 0.445716523203
    • submission logloss: 0.41044
  2. Removed noninformative + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0980945362982
    • test auc: 0.864868063708
    • test accuracy: 0.794849023091
    • test logloss: 0.455761545362
    • submission logloss: 0.41474
  3. Removed noninformative + evenized skewed features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0973121344046
    • test auc: 0.864688928483
    • test accuracy: 0.793960923623
    • test logloss: 0.45576819434
    • submission logloss: 0.41373
  4. Removed noninformative + evenized skewed features + whitened features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.100104966799
    • test auc: 0.863962791055
    • test accuracy: 0.794849023091
    • test logloss: 0.456526266183
    • submission logloss: 0.41375
  5. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.106581348311
    • test auc: 0.876126068811
    • test accuracy: 0.801065719361
    • test logloss: 0.441049104199
    • submission logloss: 1.02392
  6. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.0929543788381
    • test auc: 0.879660143526
    • test accuracy: 0.809058614565
    • test logloss: 0.434795974692
    • submission logloss: 1.02953
  7. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + selected numerical features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.13909023595
    • test auc: 0.855300157777
    • test accuracy: 0.781527531083
    • test logloss: 0.473516181104
    • submission logloss: 0.97276
  8. Removed noninformative + evenized skewed features + whitened features + selected categorical features based on their numerized values + selected numerical features + blender model:
    • train auc: 1.0
    • train accuracy: 1.0
    • train logloss: 0.135945958172
    • test auc: 0.85161205708
    • test accuracy: 0.775310834813
    • test logloss: 0.474685622346
    • submission logloss: 0.44639

Some observations:

1. Numerizing categorical features carries a high risk, especially when a feature has many levels, because the encoding learned from the training data (and even the validation data) may not transfer to new data such as the submission data. This shows up above: the train and test logloss, AUC, and accuracy barely change across the different transformations, yet the submission logloss degrades sharply once numerized features are used (see the sketch after these notes).
2. Feature selection based on the numerized categorical features and on the numerical density plots, however, seems to improve generalization.
3. Since the blender used here is a combination of tree-based models, it should be the most robust to these data transforms.

As a general trend in modern data mining, many practitioners now prefer an ensemble model over manual feature selection.

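One way to reduce the risk described in observation 1 is to compute the numerized values out-of-fold, so that no row's encoding ever sees its own target. A hypothetical sketch (the helper below is made up for illustration, is not part of the munging session, and uses the older sklearn.cross_validation API consistent with this notebook):

In [ ]:
## hypothetical out-of-fold target encoding for one categorical column (illustration only)
from sklearn.cross_validation import KFold

def oof_target_encode(df, col, target, n_folds=5):
    encoded = pd.Series(np.nan, index=df.index)
    prior = df[target].mean()
    for fit_idx, enc_idx in KFold(len(df), n_folds=n_folds, shuffle=True, random_state=0):
        means = df.iloc[fit_idx].groupby(col)[target].mean()   # encoding learned without the enc_idx rows
        encoded.iloc[enc_idx] = df.iloc[enc_idx][col].map(means).fillna(prior).values
    return encoded

print oof_target_encode(data, "D27", "Activity").head()
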

In [170]:
from IPython.display import display_html
for f in ['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176']:
    print '='*20, f, '='*20
    print(pd.value_counts(data[f]))
    print(pd.value_counts(submission_data[f]))


==================== D27 ====================
1    1953
0    1798
dtype: int64
1    1324
0    1177
dtype: int64
==================== D1036 ====================
1    1981
0    1770
dtype: int64
1    1318
0    1183
dtype: int64
==================== D1004 ====================
1    2069
0    1682
dtype: int64
1    1403
0    1098
dtype: int64
==================== D979 ====================
1    2378
0    1373
dtype: int64
1    1568
0     933
dtype: int64
==================== D1089 ====================
0    2403
1    1348
dtype: int64
0    1643
1     858
dtype: int64
==================== D1109 ====================
1    2112
0    1639
dtype: int64
1    1381
0    1120
dtype: int64
==================== D1125 ====================
0    2068
1    1683
dtype: int64
0    1421
1    1080
dtype: int64
==================== D1061 ====================
0    2805
1     946
dtype: int64
0    1819
1     682
dtype: int64
==================== D954 ====================
0    2509
1    1242
dtype: int64
0    1649
1     852
dtype: int64
==================== D1176 ====================
1    2173
0    1578
dtype: int64
1    1400
0    1101
dtype: int64

In [179]:
dsession.print_categorial_crosstable(feature_names=["D1176"])


Out[179]:
        train_0    test_0  overall_0
D1176
0      0.518051  0.491489   0.510139
All    0.464762  0.441385   0.457745
1      0.425840  0.405488   0.419696

In [ ]:


In [ ]:


In [252]:
selected_features = selected_feats
print selected_features


blender = SVC(probability = True)


['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176', 'D6_WHITE', 'D7_WHITE', 'D10_WHITE', 'D17_WHITE', 'D46_WHITE', 'D70_WHITE', 'D126_WHITE', 'D152_WHITE', 'D177_WHITE', 'D659_WHITE', 'D130_LOG1_WHITE', 'D131_LOG1_WHITE', 'D75_LOG1_WHITE', 'D88_LOG1_WHITE', 'D911_LOG1_WHITE', 'D103_LOG1_WHITE', 'D32_LOG1_WHITE', 'D47_LOG1_WHITE', 'D56_LOG1_WHITE', 'D16_LOG1_WHITE', 'D5_LOG_WHITE']

In [253]:
train_matrix, test_matrix = dsession.get_data(selected_features)
train_y, test_y = train_matrix.Activity, test_matrix.Activity

blender.fit(train_matrix.iloc[:, :-1], train_y)

trainyhat = blender.predict_proba(train_matrix.iloc[:, :-1])[:, 1]
testyhat = blender.predict_proba(test_matrix.iloc[:, :-1])[:, 1]


print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)


train auc: 0.871670264279
train accuracy: 0.786285714286
train logloss: 0.466823652957
test auc: 0.826993759057
test accuracy: 0.766429840142
test logloss: 0.509194934026

In [254]:
## combine the accumulated transform steps and apply them to the submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)


(2501, 943)
   D27  D28  D51  D94  D952  D953  D954  D955  D956  D957  ...   \
0    0    0    1    1     1     0     0     1     1     0  ...    
1    1    1    0    1     0     0     1     0     0     1  ...    
2    0    1    0    1     1     0     0     0     0     1  ...    

   D558_LOG1_WHITE  D559_LOG1_WHITE  D556_LOG1_WHITE  D557_LOG1_WHITE  \
0         6.756420         5.597796         3.652312         7.913760   
1        -0.215938        -0.200025        -0.177183        -0.199862   
2        -0.215938        -0.200025        -0.177183        -0.199862   

   D554_LOG1_WHITE  D555_LOG1_WHITE  D223_LOG1_WHITE  D389_LOG1_WHITE  \
0         7.057772        -0.186079        -0.418874         4.006732   
1        -0.238913        -0.186079        -0.418874        -0.291488   
2        -0.238913        -0.186079        -0.418874        -0.291488   

   D388_LOG1_WHITE  D386_LOG1_WHITE  
0         2.141151        -0.242842  
1         4.296623        -0.242842  
2        -0.268462         3.833368  

[3 rows x 943 columns]

In [255]:
submission_y = (blender.predict_proba(transformed_submission)[:, 1] 
                if selected_features is None 
                else blender.predict_proba(transformed_submission.loc[:, selected_features])[:, 1])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
submission_solution.to_csv("data/predicting_biological_response/submission2.csv", 
                           header = True, index = False)

Experimental Results

Here the blender model is a single SVM (an SVC with probability estimates) fit on the selected features:

  1. All features + blender model:
    • train auc: 0.833257394551
    • train accuracy: 0.758476190476
    • train logloss: 0.505849445063
    • test auc: 0.792279911584
    • test accuracy: 0.722912966252
    • test logloss: 0.547326593429
    • submission logloss: 0.52190
  2. Removed noninformative + blender model:
    • train auc: 0.857131439239
    • train accuracy: 0.776380952381
    • train logloss: 0.473122160467
    • test auc: 0.815580286169
    • test accuracy: 0.751332149201
    • test logloss: 0.521406446231
    • submission logloss: 0.49046
  3. Removed noninformative + evenized skewed features + blender model:
    • train auc: 0.856609299341
    • train accuracy: 0.775619047619
    • train logloss: 0.473663198953
    • test auc: 0.815685847997
    • test accuracy: 0.753108348135
    • test logloss: 0.520802184118
    • submission logloss: 0.48935
  4. Removed noninformative + evenized skewed features + whitened features + blender model:
    • train auc: 0.925573770492
    • train accuracy: 0.843428571429
    • train logloss: 0.366437909086
    • test auc: 0.83826648284
    • test accuracy: 0.771758436945
    • test logloss: 0.493848022056
    • submission logloss: 0.44420
  5. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + blender model:
    • train auc: 0.903969376425
    • train accuracy: 0.82780952381
    • train logloss: 0.404721858603
    • test auc: 0.851303249695
    • test accuracy: 0.789520426288
    • test logloss: 0.48051661294
    • submission logloss: 0.96709
  6. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + blender model:
    • train auc: 0.938893694115
    • train accuracy: 0.866285714286
    • train logloss: 0.335992067354
    • test auc: 0.857180120114
    • test accuracy: 0.790408525755
    • test logloss: 0.471006110469
    • submission logloss: 1.03103
  7. Removed noninformative + evenized skewed features + whitened features + numerized categorical feats + selected numerized features + selected numerical features + blender model:
    • train auc: 0.85434048273
    • train accuracy: 0.789714285714
    • train logloss: 0.477136359629
    • test auc: 0.822459665106
    • test accuracy: 0.764653641208
    • test logloss: 0.516716042329
    • submission logloss: 0.93550
  8. Removed noninformative + evenized skewed features + whitened features + selected categorical features based on their numerized values + selected numerical features + blender model:
    • train auc: 0.871670264279
    • train accuracy: 0.786285714286
    • train logloss: 0.466823652957
    • test auc: 0.826993759057
    • test accuracy: 0.766429840142
    • test logloss: 0.509194934026
    • submission logloss: 0.48284

Some observations:

1. The pattern is similar to the one seen with the tree-based blender.
2. Again, the ensemble model appears more powerful than manual feature selection.

In [ ]: