In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss

In [2]:
folder_id = 1
fff = 'tmpsubs/' + str(folder_id) + '/'  # folder holding the per-split prediction files

In [3]:
def convert_class_label_to_int(class_label):
    # 'Class_7' -> 6: strip the 'Class_' prefix and shift to 0-based
    return int(class_label[6:]) - 1
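
As a quick sanity check (a minimal sketch, assuming Otto-style labels of the form 'Class_N'), the conversion maps the nine class names onto 0..8:

In [ ]:
# hypothetical check: 'Class_1' -> 0, 'Class_9' -> 8
assert convert_class_label_to_int('Class_1') == 0
assert convert_class_label_to_int('Class_9') == 8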

In [6]:
# an earlier candidate list, kept for reference but overridden below:
# base_names = ['nn_garf1_', 'nn_katz1_', 'nnlasagna3_', 'cb_rf5_',
#               'nn_dbn2_', 'gb_lr05_4_', 'xgb1_', 'xgb3_']
base_names = ['garfs1_', 'katzs1_', 'lass1_', 'cb_rf5_',
              'nn_dbn2_', 'gb_lr05_4_', 'xgb1_', 'kittys1_', 'xgb3_']
# these are the prefixes of the files to be loaded; there are
# 3 files (one per shuffle split) for each classifier


tot_subs = len(base_names)
# one list of file names per shuffle split (suffixes 1-3)
data_names = [[fff + name + str(split) + '.csv' for name in base_names]
              for split in (1, 2, 3)]

# with a 0.05 step, 7 grid points let each submission's weight
# (except the last one) range from 0.00 to 0.30:
# w_shape = [7] * (tot_subs - 1)
# ...or we can manually specify the number of grid points for each one:
w_shape = [5, 6, 5, 5, 5, 5, 7, 7]

w_shape.append(3)  # the last axis indexes the 3 stratified shuffle splits

start_vals = [0.] * (tot_subs - 1)  # lower bound of each weight's grid

step_size = 0.05
weight_results_array = np.ones(w_shape)  # we store the log-loss results in this array
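
To make the grid explicit: each multi-index drawn from w_shape[:-1] is turned into one candidate weight vector via start_vals and step_size, and the last submission gets whatever weight is left over. A minimal sketch (the index values here are just for illustration):

In [ ]:
# e.g. the grid point (0, 2, 3, 2, 0, 2, 1, 2) corresponds to these weights
example_index = (0, 2, 3, 2, 0, 2, 1, 2)
example_weights = [start_vals[axis] + idx * step_size
                   for axis, idx in enumerate(example_index)]
print(example_weights)            # weights of the first tot_subs - 1 submissions
print(1. - sum(example_weights))  # leftover weight of the last submission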

In [7]:
data_dfs = []
for data_fold in data_names:
    # one tuple of DataFrames per shuffle split, in base_names order
    data_dfs.append(tuple(pd.read_csv(name) for name in data_fold))
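
Before searching, it is worth checking that every file loaded cleanly and that all predictions have the same shape (a minimal sketch; it assumes each CSV holds only the 9 class-probability columns, with no id column):

In [ ]:
for fold_dfs in data_dfs:
    shapes = {df.values.shape for df in fold_dfs}
    assert len(shapes) == 1, 'prediction files disagree on shape: %s' % shapes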

In [8]:
df_train = pd.read_csv('raw_data/train.csv')
renamed_labels = [convert_class_label_to_int(i) for i in df_train['target'].values]
df_train['renamed_labels'] = renamed_labels

In [9]:
curr_id = 0
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(np.zeros(len(df_train)), df_train['renamed_labels']):
    print(curr_id)
    it = np.nditer(weight_results_array[..., curr_id], flags=['multi_index'], op_flags=['writeonly'])
    labels_test = df_train['renamed_labels'].values[test_index]
    while not it.finished:
        # turn the grid point into a weight for each submission but the last
        weights = [start_vals[axis] + idx * step_size for axis, idx in enumerate(it.multi_index)]
        if sum(weights) <= 1.:
            # the last submission gets the leftover weight, so the blend sums to 1
            pred = data_dfs[curr_id][0].values * weights[0]
            for di, w in enumerate(weights[1:]):
                pred += w * data_dfs[curr_id][di + 1].values
            pred += (1. - sum(weights)) * data_dfs[curr_id][-1].values
            it[0] = log_loss(labels_test, pred)
        it.iternext()
    curr_id += 1


0
1
2

In [10]:
summed_w = np.mean(weight_results_array, axis=-1)  # average log-loss over the 3 splits
print(summed_w.shape)


(5, 6, 5, 5, 5, 5, 7, 7)

In [11]:
print(np.min(summed_w))
min_arg = np.argmin(summed_w)
min_point = np.unravel_index(min_arg, summed_w.shape)
print(np.min(weight_results_array[...,0]), np.min(weight_results_array[...,1]), np.min(weight_results_array[...,2]))


0.430940155762
0.427100601626 0.440500659128 0.424503060685

In [13]:
print(min_point)


(0, 2, 3, 2, 0, 2, 1, 2)
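
The coarse optimum can be turned back into actual weights before refining (a minimal sketch using the variables above; start_vals were all 0.0 in the coarse pass):

In [ ]:
coarse_weights = [idx * 0.05 for idx in min_point]
print(coarse_weights)
print(1. - sum(coarse_weights))  # implied weight of the last submission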

In [16]:
min_point = (0, 2, 3, 2, 0, 2, 1, 2)
step_size = 0.05
for j in range(tot_subs-1):
    start_vals[j] = max(0., min_point[j] * step_size - 0.03)
weights = [0.] * (tot_subs - 1)
step_size = 0.01
print(start_vals)
w_shape = [7] * (tot_subs - 1)
w_shape.append(3)
weight_results_array = np.ones(w_shape)


[0.0, 0.07, 0.12000000000000002, 0.07, 0.0, 0.07, 0.020000000000000004, 0.07]
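
Each axis of the refinement grid has 7 points at 0.01 spacing, so it covers roughly ±0.03 around the coarse optimum. A quick way to see the interval each weight can now take (a minimal sketch):

In [ ]:
for j, s in enumerate(start_vals):
    print('weight %d: %.2f .. %.2f' % (j, s, s + 6 * step_size))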

In [17]:
curr_id = 0
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(np.zeros(len(df_train)), df_train['renamed_labels']):
    print(curr_id)
    it = np.nditer(weight_results_array[..., curr_id], flags=['multi_index'], op_flags=['writeonly'])
    labels_test = df_train['renamed_labels'].values[test_index]
    while not it.finished:
        # same search as before, now on the finer grid
        weights = [start_vals[axis] + idx * step_size for axis, idx in enumerate(it.multi_index)]
        if sum(weights) <= 1.:
            pred = data_dfs[curr_id][0].values * weights[0]
            for di, w in enumerate(weights[1:]):
                pred += w * data_dfs[curr_id][di + 1].values
            pred += (1. - sum(weights)) * data_dfs[curr_id][-1].values
            it[0] = log_loss(labels_test, pred)
        it.iternext()
    curr_id += 1


0
1
2

In [18]:
summed_w = np.mean(weight_results_array, axis=-1)  # average log-loss over the 3 splits
print(summed_w.shape)


(7, 7, 7, 7, 7, 7, 7, 7)

In [20]:
print(np.min(summed_w))
min_arg = np.argmin(summed_w)
min_point = np.unravel_index(min_arg, summed_w.shape)
last_w = 0.
for j in range(tot_subs - 1):
    # recover and print each refined weight
    last_w += start_vals[j] + min_point[j] * step_size
    print(start_vals[j] + min_point[j] * step_size)
print(1. - last_w)  # leftover weight of the last submission


0.43090004549
0.0
0.08
0.17
0.1
0.0
0.11
0.06
0.08
0.4
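
With the refined weights in hand, the natural next step is to blend the full test-set predictions the same way. A minimal sketch (the 'test' file names are hypothetical; the actual test-set prediction files are not shown in this notebook):

In [ ]:
# hypothetical: one full test-set prediction CSV per classifier,
# named like the per-split files but with a 'test' suffix
final_weights = [start_vals[j] + min_point[j] * step_size for j in range(tot_subs - 1)]
final_weights.append(1. - sum(final_weights))  # last submission's leftover weight
test_preds = [pd.read_csv(fff + name + 'test.csv') for name in base_names]
blend = sum(w * df.values for w, df in zip(final_weights, test_preds))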

In [ ]: