In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [60]:
from munging import session
from munging import transform
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
In [189]:
## load train data
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)
Out[189]:
In [238]:
## create session for exploration
dsession = session.Session(data, target_feature="Activity", test_frac=0.3, random_state=0)
transformers = []
print(dsession.get_parameters())
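For readers without the munging package, the split below sketches what a Session created with test_frac=0.3 presumably does internally. The use of sklearn's train_test_split and the stratified split are assumptions, not the package's documented behavior.

from sklearn.model_selection import train_test_split

# Hypothetical stand-in for Session(..., test_frac=0.3, random_state=0):
# hold out 30% of the rows, preserving the Activity class balance (assumed).
train_df, test_df = train_test_split(data, test_size=0.3, random_state=0,
                                     stratify=data["Activity"])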
In [239]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print(len(numerical_feats))
print(len(categorical_feats))
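Without the Session helpers, a rough dtype-based approximation of the same split is shown below; the Session's own heuristics (e.g. treating low-cardinality integer columns as categorical) may well differ.

# Approximate the numerical/categorical detection using dtypes alone.
feature_df = data.drop("Activity", axis=1)
numerical_cols = feature_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = feature_df.select_dtypes(exclude=[np.number]).columns.tolist()
print(len(numerical_cols), len(categorical_cols))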
In [240]:
## know what you are dealing with: check the target class balance
pd.value_counts(data.Activity)
Out[240]:
In [241]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print(na_feats)
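For reference, a plain-pandas version of the same missing-value scan:

# Fraction of missing values per column; print only the affected columns.
na_frac = data.isnull().mean()
print(na_frac[na_frac > 0])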
In [242]:
## no features are dominated by missing values, so none need removing
In [243]:
## and no missing values to impute
In [244]:
## find non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print(len(noninformative_feats))
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)
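The Session's criterion for "non-informative" is not spelled out here. One common proxy, sketched below, flags (near-)constant columns; the 99% dominance cutoff is an assumption.

# Flag columns whose most frequent value covers almost every row.
dominant_frac = data.apply(lambda col: col.value_counts(normalize=True).iloc[0])
near_constant = dominant_frac[dominant_frac > 0.99].index.tolist()
print(len(near_constant))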
In [245]:
## find skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print(len(skewed_feats))
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
print(len(dsession.get_features_of(dsession.is_skewed_numerical_feature)))
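As a stand-in for evenize_skew_features, one can flag high-skew columns with scipy and log-transform them; the skew cutoff of 1.0 and the choice of log1p (valid for non-negative data) are assumptions.

from scipy.stats import skew

num_cols = data.select_dtypes(include=[np.number]).columns.drop("Activity")
col_skews = data[num_cols].apply(lambda col: skew(col.dropna()))
for col in col_skews[col_skews.abs() > 1.0].index:
    # compress the heavy right tail; assumes non-negative values
    data[col + "_log1p"] = np.log1p(data[col])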
In [246]:
## whiten features
## if the feature ranges are already similar, this may be unnecessary;
## alternatively, use the min-max scaler
whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
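In plain sklearn terms, whitening each feature to zero mean and unit variance looks like the sketch below; whether the Session does exactly this internally is an assumption.

from sklearn.preprocessing import StandardScaler, MinMaxScaler

feature_cols = [c for c in data.columns if c != "Activity"]
X_white = StandardScaler().fit_transform(data[feature_cols])    # zero mean, unit variance
# X_minmax = MinMaxScaler().fit_transform(data[feature_cols])   # or map each feature to [0, 1]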
In [247]:
## find mutually redundant features
dsession.find_redundant_features()
Out[247]:
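One plausible implementation of the redundancy check is pairwise absolute correlation with a high cutoff; the 0.98 threshold below is an assumption, not the Session's actual rule.

# Mark each feature that is almost perfectly correlated with an earlier one.
feature_cols = [c for c in data.columns if c != "Activity"]
corr = data[feature_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
redundant = [c for c in upper.columns if (upper[c] > 0.98).any()]
print(len(redundant))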
In [218]:
## numerize categorical features
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)
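Numerizing a categorical feature usually means encoding it as one or more numeric columns; the one-hot sketch below is one way to do it, though the Session's actual scheme is not shown here.

# Hypothetical one-hot numerization with pandas, applied to the
# categorical features found earlier by the Session.
cat_cols = categorical_feats
data_num = pd.get_dummies(data, columns=cat_cols, prefix=cat_cols)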
In [219]:
## find redundant features again, as numerizing categorical features may introduce some
dsession.find_redundant_features()
Out[219]:
In [224]:
## rank numerized features
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features,
                                                 by=dsession.numerized_feature_auc_metric,
                                                 target_value=0)
for f, s in numerized_features_rank:
    print(f, s)
    if s <= 0.55:
        break
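A guess at what numerized_feature_auc_metric measures: the AUC of each numerized feature used directly as a score for the target, folded so that below-chance values map above 0.5 and 0.5 always means "uninformative". This sketch assumes the numerized columns are available in a dataframe; it is not the metric's actual definition.

from sklearn.metrics import roc_auc_score

def single_feature_auc(df, feature, target="Activity"):
    # AUC of the raw feature as a classifier score; fold below-chance AUCs.
    auc = roc_auc_score(df[target], df[feature])
    return max(auc, 1.0 - auc)

ranked = sorted(((f, single_feature_auc(data, f)) for f in numerized_features),
                key=lambda fs: fs[1], reverse=True)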
In [248]:
# selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089',
                            'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print(selected_numerized_feats)
In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
                            if f not in numerized_features]
print(len(original_numerical_feats))
In [143]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])
Out[143]:
[density plots for the first 60 original numerical features]
In [144]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])
Out[144]:
[density plots for the remaining original numerical features]
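The plotting helper presumably overlays per-class densities for each feature; a bare pandas/matplotlib stand-in could look like this (grid shape and styling are assumptions).

fig, axes = plt.subplots(6, 10, figsize=(20, 12))
for ax, col in zip(axes.ravel(), original_numerical_feats[:60]):
    # one density curve per Activity class, overlaid on the same axes
    data.groupby("Activity")[col].plot(kind="density", ax=ax, legend=False)
    ax.set_title(col, fontsize=8)
plt.tight_layout()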