Resources

  1. Kaggle "Predicting a Biological Response" competition main page (source of `data/predicting_biological_response/train.csv`)
  2. Published competition solutions and write-ups

In [1]:
## Auto-reload local modules so edits to munging/*.py are picked up without a kernel restart
%load_ext autoreload
%autoreload 2
## Render matplotlib figures inline in the notebook
%matplotlib inline

In [4]:
## Project-local data-munging helpers
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
## sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
## train_test_split now lives in sklearn.model_selection (same signature).
from sklearn.model_selection import train_test_split

In [28]:
## load train data
## "Activity" is the binary target; the remaining D1..D1776 columns are
## feature descriptors (3751 rows x 1777 columns total, per the cells below)
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)


Out[28]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1777 columns


In [29]:
## Hold out 30% of the rows as a pseudo-"submission" set for final evaluation;
## the remaining 70% stays in `data` for exploration and modeling.
## np.arange instead of range(): behaves identically here and also works as a
## positional indexer under Python 3 (where range() is not a list).
data_index, submission_index = train_test_split(np.arange(data.shape[0]), test_size = 0.3, random_state = 0)
submission_data = data.iloc[submission_index, :]
## NOTE(review): rebinding `data` makes this cell non-idempotent -- re-running
## it would re-split the already-reduced frame. Restart & Run All is safe.
data = data.iloc[data_index, :]

In [30]:
## Sanity-check the split sizes (70/30 of the 3751 rows)
print data.shape, submission_data.shape


(2625, 1777) (1126, 1777)

In [31]:
## create session for exploration
## Session wraps the frame with the munging utilities; test_frac presumably
## holds out a further 30% internally for metric estimation -- verify against
## munging.session. Target column is "Activity".
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []  ## collects fitted transformers so the pipeline can be replayed on submission_data
print dsession.get_parameters()


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [32]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)


942
834

In [33]:
## Know what you are dealing with: class balance of the binary target.
## pd.value_counts is deprecated (pandas >= 2.1); the Series method form is
## identical in every pandas version (counts sorted descending).
data["Activity"].value_counts()


Out[33]:
1    1405
0    1220
dtype: int64

In [34]:
## missing values
## List features containing NAs -- none here (empty list below), so the
## removal/imputation steps that would normally follow are no-ops.
na_feats = dsession.get_features_of(dsession.is_na_feature)
print na_feats


[]

In [16]:
## no need to remove heavy missing value features

In [17]:
## no need to impute missing values

In [35]:
## non-informative features and remove them
## 834 features are flagged (the count matches the categorical feature count
## above -- presumably the near-constant binary columns; verify). Drop them and
## keep the fitted remover for replay on submission_data.
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


834

In [36]:
## skewed features and evenize them
## Evenizing (presumably a log-type transform -- see the *_LOG feature names
## later) reduces the skewed count from 388 to 371; auto_remove drops the
## original columns. Keep the fitted evenizer for replay on submission_data.
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))


388
371

In [37]:
## whiten features 
## and if the ranges of features are not too different, it may not be necessary
## alternatively, you can use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
## (kept for reference: min-max scaling as the alternative to whitening)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)

In [21]:
## find mutually redundant features
## (highly correlated pairs per REDUNDANT_FEAT_CORR_THR = 0.95; returns the
## drop candidates -- here a single whitened log feature)
dsession.find_redundant_features()


Out[21]:
['D203_LOG_WHITE']

In [38]:
## numerize categorical features
## Encode the remaining categorical features as numeric (*_NUMERIZED columns
## per the ranking output below); auto_remove drops the categorical originals.
## Keep the fitted numerizer for replay on submission_data.
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)

In [23]:
## find redundant features again, as numerizing the categoricals may introduce some
dsession.find_redundant_features()


Out[23]:
['D195_LOG1_WHITE']

In [39]:
## rank numerized features
## Rank by AUC against target_value = 0 and print from the top down; stop
## once a score drops to <= 0.55 (that first below-threshold feature is
## still printed before the break).
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print f, s
    if s <= 0.55: break


D27_0_NUMERIZED 0.736503093482
D1087_0_NUMERIZED 0.595845801795
D1036_0_NUMERIZED 0.595249271451
D1083_0_NUMERIZED 0.592641488522
D1061_0_NUMERIZED 0.590708469427
D1089_0_NUMERIZED 0.585238644735
D954_0_NUMERIZED 0.583778286295
D993_0_NUMERIZED 0.582823185798
D959_0_NUMERIZED 0.581251996584
D979_0_NUMERIZED 0.576329806307
D1143_0_NUMERIZED 0.573911087641
D995_0_NUMERIZED 0.572486586217
D1180_0_NUMERIZED 0.572007406104
D1109_0_NUMERIZED 0.570798046771
D992_0_NUMERIZED 0.570752410569
D1004_0_NUMERIZED 0.57021455534
D1169_0_NUMERIZED 0.569226857556
D1155_0_NUMERIZED 0.568689002327
D1002_0_NUMERIZED 0.568007719037
D1125_0_NUMERIZED 0.56800119958
D1150_0_NUMERIZED 0.567143890943
D988_0_NUMERIZED 0.566772281875
D1133_0_NUMERIZED 0.565712870061
D1338_0_NUMERIZED 0.565445572311
D996_0_NUMERIZED 0.563515812944
D1390_0_NUMERIZED 0.56255093326
D1193_0_NUMERIZED 0.561126431836
D1005_0_NUMERIZED 0.560569018235
D1341_0_NUMERIZED 0.560497304204
D1196_0_NUMERIZED 0.56011917568
D1281_0_NUMERIZED 0.558769648014
D1066_0_NUMERIZED 0.558596882396
D1190_0_NUMERIZED 0.557022433453
D1168_0_NUMERIZED 0.556892044306
D1106_0_NUMERIZED 0.556474799038
D1077_0_NUMERIZED 0.556197722102
D1128_0_NUMERIZED 0.555620750129
D1160_0_NUMERIZED 0.554998141955
D1113_0_NUMERIZED 0.554049560915
D1434_0_NUMERIZED 0.553397615183
D967_0_NUMERIZED 0.55335523871
D1164_0_NUMERIZED 0.552921694798
D1337_0_NUMERIZED 0.552784786194
D1445_0_NUMERIZED 0.552768487551
D1096_0_NUMERIZED 0.551839464883
D1403_0_NUMERIZED 0.551011493803
D1441_0_NUMERIZED 0.550828948998
D960_0_NUMERIZED 0.550467119117
D1174_0_NUMERIZED 0.548889410445

In [40]:
## Keep the 10 top-ranked numerized features
selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
## (kept for reference: an earlier hand-picked alternative selection)
#selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089', 
#                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print selected_numerized_feats


['D27_0_NUMERIZED', 'D1087_0_NUMERIZED', 'D1036_0_NUMERIZED', 'D1083_0_NUMERIZED', 'D1061_0_NUMERIZED', 'D1089_0_NUMERIZED', 'D954_0_NUMERIZED', 'D993_0_NUMERIZED', 'D959_0_NUMERIZED', 'D979_0_NUMERIZED']

In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print len(original_numerical_feats)


449

In [143]:
## Density plots for the first 60 originally-numerical features
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])


Out[143]:
<munging.session.Session at 0x7fdb73cdb250>

In [144]:
## Density plots for the next batch (features 60..259)
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])


Out[144]:
<munging.session.Session at 0x7fdb73cdb250>