In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import math

In [12]:
data = pd.read_csv('resources/gbm-data.csv')
data.head(10)


Out[12]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
0 1 0.000000 0.497009 0.10 0.00 0.132956 0.678031 0.273166 0.585445 0.743663 ... 0 0 0 0 0 0 0 0 0 0
1 1 0.366667 0.606291 0.05 0.00 0.111209 0.803455 0.106105 0.411754 0.836582 ... 1 1 1 1 0 1 0 0 1 0
2 1 0.033300 0.480124 0.00 0.00 0.209791 0.610350 0.356453 0.517720 0.679051 ... 0 0 0 0 0 0 0 0 0 0
3 1 0.000000 0.538825 0.00 0.50 0.196344 0.724230 0.235606 0.288764 0.805110 ... 0 0 0 0 0 0 0 0 0 0
4 0 0.100000 0.517794 0.00 0.00 0.494734 0.781422 0.154361 0.303809 0.812646 ... 0 0 0 0 0 0 0 0 0 0
5 0 0.133333 0.771035 0.20 0.25 0.122153 0.677398 0.267224 0.611112 0.701421 ... 0 0 0 0 0 0 0 0 0 0
6 1 0.066700 0.567401 0.10 0.00 0.116578 0.689802 0.274886 0.548509 0.785429 ... 0 0 0 0 0 0 0 0 0 0
7 1 0.133333 0.610057 0.15 0.00 0.105176 0.759602 0.313393 0.491790 0.756010 ... 0 0 1 0 1 0 0 0 0 0
8 1 0.000000 0.776816 0.05 0.00 0.458446 0.738631 0.246347 0.067800 0.751861 ... 0 0 0 0 0 0 0 0 0 0
9 0 0.100000 0.768150 0.10 0.25 0.262299 0.598972 0.377065 0.394514 0.673797 ... 0 0 0 0 0 0 0 0 0 0

10 rows × 1777 columns


In [16]:
X = data.loc[:, "D1":].values
y = data['Activity'].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state=241)

In [22]:
gbc = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241)

In [27]:
def sigmoid(y_pred):
    return 1.0 / (1.0 + math.exp(-y_pred))


def log_loss_results(model, X, y):
    results = []
    for pred in model.staged_decision_function(X):
        results.append(log_loss(y, [sigmoid(y_pred) for y_pred in pred]))

    return results


def plot_loss(learning_rate, test_loss, train_loss):
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig('plots/rate_' + str(learning_rate) + '.png')

    min_loss_value = min(test_loss)
    min_loss_index = test_loss.index(min_loss_value)
    return min_loss_value, min_loss_index


def test_model(rate):
    model = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250, verbose=True, random_state=241)
    model.fit(X_train, y_train)

    train_loss = log_loss_results(model, X_train, y_train)
    test_loss = log_loss_results(model, X_test, y_test)
    return plot_loss(learning_rate, test_loss, train_loss)


min_loss_results = {}
for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    min_loss_results[learning_rate] = test_model(learning_rate)


      Iter       Train Loss   Remaining Time 
         1           1.0190           16.20s
         2           0.9192           16.31s
         3           0.8272           16.01s
         4           0.7834           16.15s
         5           0.7109           16.24s
         6           0.6368           16.30s
         7           0.5797           16.51s
         8           0.5610           16.39s
         9           0.5185           16.26s
        10           0.4984           15.94s
        20           0.1999           14.87s
        30           0.1313           13.86s
        40           0.0790           13.74s
        50           0.0511           13.34s
        60           0.0352           12.60s
        70           0.0245           11.81s
        80           0.0162           11.18s
        90           0.0114           10.60s
       100           0.0077            9.94s
       200           0.0004            2.94s
      Iter       Train Loss   Remaining Time 
         1           1.1255           18.40s
         2           1.0035           18.29s
         3           0.9386           18.28s
         4           0.8844           17.69s
         5           0.8381           17.37s
         6           0.7995           17.03s
         7           0.7559           16.98s
         8           0.7205           16.65s
         9           0.6958           16.37s
        10           0.6725           16.16s
        20           0.4672           15.90s
        30           0.3179           14.98s
        40           0.2274           14.23s
        50           0.1774           13.32s
        60           0.1394           12.51s
        70           0.1050           11.81s
        80           0.0805           11.10s
        90           0.0650           10.43s
       100           0.0511            9.87s
       200           0.0058            3.28s
      Iter       Train Loss   Remaining Time 
         1           1.2095           17.74s
         2           1.1006           17.64s
         3           1.0240           17.76s
         4           0.9729           17.64s
         5           0.9387           17.33s
         6           0.8948           17.46s
         7           0.8621           17.19s
         8           0.8360           17.03s
         9           0.8171           16.78s
        10           0.7883           16.70s
        20           0.6164           15.47s
        30           0.4933           14.45s
        40           0.4248           14.02s
        50           0.3345           13.53s
        60           0.2760           12.94s
        70           0.2263           12.35s
        80           0.1971           11.62s
        90           0.1693           10.91s
       100           0.1388           10.17s
       200           0.0294            3.24s
      Iter       Train Loss   Remaining Time 
         1           1.2613           18.23s
         2           1.1715           19.29s
         3           1.1009           18.95s
         4           1.0529           18.64s
         5           1.0130           18.45s
         6           0.9740           18.28s
         7           0.9475           17.77s
         8           0.9197           17.64s
         9           0.8979           17.24s
        10           0.8730           17.10s
        20           0.7207           15.85s
        30           0.6055           15.25s
        40           0.5244           14.57s
        50           0.4501           14.35s
        60           0.3908           13.81s
        70           0.3372           13.09s
        80           0.3009           12.19s
        90           0.2603           11.56s
       100           0.2327           10.74s
       200           0.0835            3.44s
      Iter       Train Loss   Remaining Time 
         1           1.3199           23.98s
         2           1.2645           20.77s
         3           1.2170           19.96s
         4           1.1775           19.40s
         5           1.1404           18.96s
         6           1.1106           18.77s
         7           1.0844           18.61s
         8           1.0617           18.37s
         9           1.0411           18.30s
        10           1.0223           18.09s
        20           0.8864           16.62s
        30           0.7844           15.66s
        40           0.7176           14.53s
        50           0.6590           13.75s
        60           0.6120           12.92s
        70           0.5599           12.23s
        80           0.5242           11.48s
        90           0.4829           10.78s
       100           0.4473           10.02s
       200           0.2379            3.39s

In [29]:
min_loss_value, min_loss_index = min_loss_results[0.2]

In [34]:
model = RandomForestClassifier(n_estimators=min_loss_index, random_state=241)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
test_loss = log_loss(y_test, y_pred)
print(test_loss)


0.5413812861804069