In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
In [2]:
import pandas as pd
Getting the data from Kaggle first:
In [3]:
import pkg_resources
In [4]:
raw_data = pd.read_csv(pkg_resources.resource_stream('deepforest', 'data/train.csv'))
In [5]:
# Drop columns that are not directly usable as features (identifiers and free text)
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)
In [6]:
# One-hot encode the categorical columns and replace missing values with -1
clean_data = pd.get_dummies(clean_data).fillna(-1)
In [7]:
train, test = train_test_split(clean_data)
In [8]:
def split_x_y(dataframe, target):
    return dataframe.drop(target, axis=1), dataframe[target]
In [9]:
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")
In [10]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)
Out[10]:
In [11]:
y_pred = rf.predict_proba(X_test)
In [12]:
auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])
In [13]:
auc
Out[13]:
In [14]:
from sklearn.model_selection import StratifiedKFold
In [15]:
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4)
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10)
In [16]:
rf1.fit(X_train, y_train)
# Use rf1's class probabilities as new features, on both the train and test sets
y_pred_train_1 = rf1.predict_proba(X_train)
y_pred_test_1 = rf1.predict_proba(X_test)
y_pred_train_1 = pd.DataFrame(y_pred_train_1, columns=["rf1_0", "rf1_1"], index=X_train.index)
y_pred_test_1 = pd.DataFrame(y_pred_test_1, columns=["rf1_0", "rf1_1"], index=X_test.index)
In [17]:
rf2.fit(X_train, y_train)
y_pred_train_2 = rf2.predict_proba(X_train)
y_pred_test_2 = rf2.predict_proba(X_test)
y_pred_train_2 = pd.DataFrame(y_pred_train_2, columns=["rf2_0", "rf2_1"], index=X_train.index)
y_pred_test_2 = pd.DataFrame(y_pred_test_2, columns=["rf2_0", "rf2_1"], index=X_test.index)
In [18]:
# Hidden representation: the original features augmented with both forests' class probabilities
hidden_train_1 = pd.concat([X_train, y_pred_train_1, y_pred_train_2], axis=1)
hidden_test_1 = pd.concat([X_test, y_pred_test_1, y_pred_test_2], axis=1)
In [19]:
hidden_train_1.head()
Out[19]:
In [20]:
rf3 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
rf3.fit(hidden_train_1, y_train)
Out[20]:
In [21]:
y_pred3 = rf3.predict_proba(hidden_test_1)
In [22]:
roc_auc_score(y_test, y_pred3[:, 1])
Out[22]:
This is not very handy at all. There is already a lot of code duplication, and one may feel that the logic happening here could be abstracted away in a form that is more flexible and powerful than all this boilerplate code.
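Before switching to the library, here is a rough sketch of how the logic above could be wrapped into a reusable layer abstraction. This is only an illustration of the idea, not the actual deepforest implementation; the SketchLayer class and its internals are hypothetical.

import numpy as np

class SketchLayer:
    """Hypothetical illustration of the idea -- not the deepforest API."""

    def __init__(self, previous_layer=None, *estimators):
        self.previous_layer = previous_layer   # None for an input layer
        self.estimators = estimators

    def _inputs(self, X):
        # Raw features for an input layer, otherwise the previous layer's output
        if self.previous_layer is None:
            return np.asarray(X, dtype=float)
        return self.previous_layer.predict(X)

    def fit(self, X, y):
        # Fitting a layer recursively fits everything below it
        if self.previous_layer is not None:
            self.previous_layer.fit(X, y)
        inputs = self._inputs(X)
        for estimator in self.estimators:
            estimator.fit(inputs, y)
        return self

    def predict(self, X):
        # The layer's input augmented with each forest's class probabilities,
        # mirroring hidden_train_1 / hidden_test_1 above
        inputs = self._inputs(X)
        probas = [estimator.predict_proba(inputs) for estimator in self.estimators]
        return np.hstack([inputs] + probas)

With something like this, SketchLayer(None, rf1, rf2) followed by SketchLayer(previous_layer, rf3) would replace most of the manual bookkeeping above; the InputLayer and Layer classes from deepforest, used below, play that role.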
In [23]:
from deepforest.layer import Layer, InputLayer
In [24]:
input_layer = InputLayer(RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10))
In [25]:
hidden_layer = Layer(input_layer,
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=4),
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=10))
In [26]:
hidden_layer.fit(X_train, y_train)
Out[26]:
In [27]:
pd.DataFrame(hidden_layer.predict(X_test), index=X_test.index)
Out[27]:
In [28]:
def random_forest_generator():
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     min_samples_leaf=5,
                                     max_depth=i)
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     max_features=1,
                                     min_samples_leaf=5,
                                     max_depth=i)
In [29]:
def paper_like_generator():
    for i in range(2):
        yield RandomForestClassifier(n_estimators=1000,
                                     n_jobs=-1,
                                     min_samples_leaf=10)
    for i in range(2):
        yield RandomForestClassifier(n_estimators=1000,
                                     n_jobs=-1,
                                     max_features=1,
                                     min_samples_leaf=10)
In [30]:
def build_input_layer():
    return InputLayer(*paper_like_generator())
In [31]:
def build_hidden_layer(layer):
    return Layer(layer, *paper_like_generator())
In [32]:
def build_output_layer(layer):
    return Layer(layer,
                 RandomForestClassifier(n_estimators=500,
                                        n_jobs=-1,
                                        min_samples_leaf=5,
                                        max_depth=10))
In [33]:
input_l = build_input_layer()
hidden_1 = build_hidden_layer(input_l)
hidden_2 = build_hidden_layer(hidden_1)
hidden_3 = build_hidden_layer(hidden_2)
hidden_4 = build_hidden_layer(hidden_3)
output_l = build_output_layer(hidden_4)
In [34]:
output_l.fit(X_train, y_train)
Out[34]:
In [35]:
y_pred = output_l.predict(X_test)
In [36]:
y_pred
Out[36]:
In [37]:
roc_auc_score(y_test, y_pred[:, 1])
Out[37]:
Well, the result is not that satisfactory yet, but let's not lose hope: there is still a lot of room for improvement. First item on my todo list: make sure all the intermediate models are trained using cross-validation techniques, to reduce overfitting.
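For reference, a minimal sketch of that idea with scikit-learn's cross_val_predict: the probabilities fed to the next layer are produced out-of-fold, so no forest scores the rows it was trained on. It reuses the X_train / y_train from above; the 5-fold split and the rf_a name are just for illustration.

from sklearn.model_selection import StratifiedKFold, cross_val_predict

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
rf_a = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4)

# Out-of-fold class probabilities, used as features by the next layer during training
oof_probas = cross_val_predict(rf_a, X_train, y_train, cv=cv, method="predict_proba")
oof_features = pd.DataFrame(oof_probas, columns=["rf_a_0", "rf_a_1"], index=X_train.index)

# The forest itself is then refit on the full training set for use at prediction time
rf_a.fit(X_train, y_train)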