Resources

  1. Kaggle "Predicting a Biological Response" competition main page
  2. solutions shared by other competitors

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [60]:
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [189]:
## load train data
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)


Out[189]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1777 columns
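
Before any munging, it is worth confirming the raw dimensions and column types. A quick sanity check on the `data` frame loaded above:

In [ ]:
print(data.shape)                   # expect (3751, 1777): the Activity target plus 1776 descriptors
print(data.dtypes.value_counts())   # a mix of float and integer columns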


In [238]:
## create session for exploration
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []
print(dsession.get_parameters())


{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}

In [239]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print(len(numerical_feats))
print(len(categorical_feats))


942
834
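
A rough sketch of how such a split could be made in plain pandas, given MIN_NUM_VALUES_FOR_NUMERICAL = 5 in the session parameters above: my guess (not the library's actual code) is that a feature with fewer than 5 distinct values is treated as categorical. Counts may differ slightly because the session works on its train split.

In [ ]:
## hypothetical re-implementation of the numerical/categorical split
MIN_NUM_VALUES_FOR_NUMERICAL = 5    # from dsession.get_parameters() above
nuniq = data.drop("Activity", axis=1).nunique()
print((nuniq >= MIN_NUM_VALUES_FOR_NUMERICAL).sum())   # approx. numerical
print((nuniq < MIN_NUM_VALUES_FOR_NUMERICAL).sum())    # approx. categorical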

In [240]:
## know what you are dealing with: check the class balance of the target
pd.value_counts(data.Activity)


Out[240]:
1    2034
0    1717
dtype: int64
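
The classes are close to balanced, so a majority-class baseline gives only about 54% accuracy; any model worth keeping should clear that comfortably:

In [ ]:
counts = pd.value_counts(data.Activity)
print(counts.max() / float(counts.sum()))   # ~0.542, the majority-class baseline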

In [241]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print(na_feats)


[]

In [242]:
## no features with heavy missing values, so nothing to remove

In [243]:
## no need to impute missing values

In [244]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print(len(noninformative_feats))
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)


833
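
Given FRAC_OF_FEAT_TO_BE_NONINFORMATIVE = 0.96 in the parameters, "non-informative" presumably means a single value covers at least 96% of a feature's rows, which is why so many of the near-constant binary D-columns get dropped. A hedged plain-pandas approximation of that test (the session computes it on the train split, so the count will not match exactly):

In [ ]:
## hypothetical approximation of the non-informative test
FRAC_THR = 0.96   # FRAC_OF_FEAT_TO_BE_NONINFORMATIVE from the session parameters
top_frac = data.drop("Activity", axis=1).apply(
    lambda col: col.value_counts(normalize=True).iloc[0])
print((top_frac >= FRAC_THR).sum())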

In [245]:
## find skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print(len(skewed_feats))
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print(len(dsession.get_features_of(dsession.is_skewed_numerical_feature)))


396
380
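
Note that 380 of the 396 skewed features remain skewed even after evenizing; a monotone transform cannot help a column that is, say, mostly zeros. "Evenizing" is very likely a log-style transform triggered when sample skewness exceeds SKEWNESS_THR = 20; a minimal sketch of that idea (the library may use a different transform):

In [ ]:
from scipy.stats import skew

SKEWNESS_THR = 20   # from dsession.get_parameters() above
col = data[skewed_feats[0]]   # assuming the raw column is still in data
if abs(skew(col)) > SKEWNESS_THR and (col >= 0).all():
    col = np.log1p(col)       # compress the long tail
print(skew(col))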

In [246]:
## whiten features
## if the ranges of the features are not too different, whitening may not be necessary;
## alternatively, use the min-max scaler

whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
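
In scikit-learn terms, whitening corresponds to standardization (zero mean, unit variance) and the commented-out alternative to min-max scaling; a sketch of both on the raw frame:

In [ ]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X = data.drop("Activity", axis=1)
X_white = StandardScaler().fit_transform(X)    # zero mean, unit variance
#X_minmax = MinMaxScaler().fit_transform(X)    # alternative: squeeze into [0, 1]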

In [247]:
## find mutually redundant features
dsession.find_redundant_features()


Out[247]:
[]
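
Redundancy here is presumably pairwise absolute correlation above REDUNDANT_FEAT_CORR_THR = 0.95; the empty list means no such pairs survive at this point. A hedged pandas sketch of that check (quadratic in the number of features, so slow on wide frames):

In [ ]:
CORR_THR = 0.95   # REDUNDANT_FEAT_CORR_THR from the session parameters
corr = data[numerical_feats].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print([c for c in upper.columns if (upper[c] > CORR_THR).any()])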

In [218]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)

In [219]:
## find redundant features again, as numerizing categorical features may introduce some
dsession.find_redundant_features()


Out[219]:
[]
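
The `_0_NUMERIZED` names and the AUC ranking against target_value = 0 below suggest that numerization replaces each categorical level with a per-level statistic of class 0 (a form of target encoding), though this is a guess at the library's behavior. A hypothetical sketch for a single feature:

In [ ]:
## hypothetical target-rate encoding of one categorical feature against class 0
rates = data.groupby("D27")["Activity"].apply(lambda y: (y == 0).mean())
d27_numerized = data["D27"].map(rates)
print(d27_numerized.head())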

In [224]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features, 
                                                 by = dsession.numerized_feature_auc_metric, 
                                                 target_value = 0)
for f, s in numerized_features_rank:
    print(f, s)
    if s <= 0.55: break


D27_0_NUMERIZED 0.745680221906
D1036_0_NUMERIZED 0.614941724349
D1004_0_NUMERIZED 0.599488497557
D979_0_NUMERIZED 0.587629784202
D1089_0_NUMERIZED 0.58216803237
D1109_0_NUMERIZED 0.579139734324
D1125_0_NUMERIZED 0.576009644748
D1061_0_NUMERIZED 0.573824307818
D954_0_NUMERIZED 0.571689866653
D1176_0_NUMERIZED 0.5716580568
D1087_0_NUMERIZED 0.570242518322
D1133_0_NUMERIZED 0.56965721702
D1143_0_NUMERIZED 0.568833341816
D993_0_NUMERIZED 0.567907675081
D1169_0_NUMERIZED 0.567796340594
D1168_0_NUMERIZED 0.567500508958
D996_0_NUMERIZED 0.566924750611
D1002_0_NUMERIZED 0.566698900651
D1106_0_NUMERIZED 0.564071406759
D1160_0_NUMERIZED 0.563944167345
D981_0_NUMERIZED 0.563606982899
D959_0_NUMERIZED 0.563400218852
D1083_0_NUMERIZED 0.562309140879
D1066_0_NUMERIZED 0.561809726181
D1281_0_NUMERIZED 0.561580695236
D1164_0_NUMERIZED 0.561478903705
D1163_0_NUMERIZED 0.560985850977
D1196_0_NUMERIZED 0.560467350366
D1180_0_NUMERIZED 0.559159965391
D1309_0_NUMERIZED 0.557184573493
D978_0_NUMERIZED 0.555788120928
D967_0_NUMERIZED 0.555050132329
D961_0_NUMERIZED 0.554741576751
D1403_0_NUMERIZED 0.554452107085
D1041_0_NUMERIZED 0.553828633958
D1059_0_NUMERIZED 0.553520078379
D1190_0_NUMERIZED 0.552756641897
D964_0_NUMERIZED 0.552559420806
D1285_0_NUMERIZED 0.552336751832
D51_0_NUMERIZED 0.551691011808
D1338_0_NUMERIZED 0.549063517915
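
The score is presumably the ROC AUC of ranking class 0 by the numerized column alone; D27 stands out at 0.75, and the loop cuts the list off once features fall to roughly coin-flip territory (0.55). With the hypothetical encoding sketched above, the metric would be:

In [ ]:
from sklearn.metrics import roc_auc_score

## hypothetical single-feature AUC against class 0; the session scores on its
## train split, so this will not match the 0.7457 above exactly
print(roc_auc_score((data.Activity == 0).astype(int), d27_numerized))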

In [248]:
#selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089', 
                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print(selected_numerized_feats)


['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176']
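
The hardcoded list is simply the top ten of the ranking with the `_0_NUMERIZED` suffix stripped back to the original column names, so it can also be derived programmatically:

In [ ]:
selected_numerized_feats = [f.split("_")[0] for f, s in numerized_features_rank[:10]]
print(selected_numerized_feats)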

In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                                if f not in numerized_features]
print(len(original_numerical_feats))


449

In [143]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])


Out[143]:
<munging.session.Session at 0x7fdb73cdb250>

In [144]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])


Out[144]:
<munging.session.Session at 0x7fdb73cdb250>
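
The payoff of accumulating every fitted step in `transformers` is that the same pipeline can be replayed, in order, on held-out data. Assuming each transformer exposes a pandas-in/pandas-out `transform` method and that a matching test.csv exists (both assumptions about the munging API and data layout, not confirmed here), the replay would look like:

In [ ]:
## hypothetical replay of the munging pipeline on unseen data
test_data = pd.read_csv("data/predicting_biological_response/test.csv")
for t in transformers:
    test_data = t.transform(test_data)   # assumed API: each step is pandas-in/pandas-out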