In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import datetime

In [2]:
def now():
    """Return the current local time as a whitespace-free timestamp string.

    The result has the form ``YYYY-MM-DD_HH:MM:SS.ffffff``: the usual string
    form of a naive ``datetime`` with the space between date and time
    replaced by an underscore. The fractional part is omitted when the
    microsecond field happens to be zero.
    """
    return datetime.datetime.now().isoformat(sep='_')

In [3]:
# Lookup table: model label -> (submission CSV file name, score).
# It exists only to keep track of which submission files are being averaged.
# NOTE(review): the float looks like the score each submission obtained, and
# the 0.0 for 'dbn_sum' looks like a placeholder -- confirm.
mapping = {
    label: (csv_name, score)
    for label, csv_name, score in [
        ('nn_garf1_', '2015-04-13_21:39:17.310471.csv', 0.46976),
        ('nn_katz1_', '2015-04-29_12:08:50.306024.csv', 0.47205),
        ('nnlasagna3_', '2015-04-05_16:54:09.136787.csv', 0.47092),
        ('cb_rf5_', '2015-04-26_05:25:53.739000.csv', 0.46996),
        ('nn_dbn2_', '2015-04-07_23:17:13.445133.csv', 0.47606),
        ('gb_lr05_4_', '2015-04-13_00:37:50.424466.csv', 0.46114),
        ('xgb1_', '2015-04-26_14:08:54.748001.csv', 0.44231),
        ('xgb3_', '2015-04-28_16:32:26.187556.csv', 0.43306),
        ('garf_sum', '2015-05-06_10:43:18.827734.csv', 0.46414),
        ('katz_sum', '2015-05-06_10:50:46.827548.csv', 0.46282),
        ('lasagna_sum', '2015-05-05_06:55:53.636235.csv', 0.46267),
        ('dbn_sum', '2015-05-06_16:13:10.400575.csv', 0.0),
        ('kitty_sum', '2015-05-09_14:48:49.741998.csv', 0.45835),
    ]
}

In [18]:
# Labels (keys of `mapping`) of the submissions that go into the blend.
to_avg = ('dbn_sum', 'katz_sum', 'lasagna_sum', 'cb_rf5_', 'gb_lr05_4_', 'xgb1_', 'kitty_sum', 'xgb3_')

# Blend weights, positionally aligned with `to_avg`
# (first weight belongs to the first label, and so on).
w = (0.03, 0.07, 0.16, 0.1, 0.1, 0.06, 0.08, 0.4)

# zip() stops at the shorter input, so a missing label or weight would
# silently drop a model from the blend; fail loudly instead.
assert len(to_avg) == len(w), 'to_avg and w must have the same number of entries'
# The blend is a weighted average only if the weights add up to one.
assert abs(sum(w) - 1.0) < 1e-9, 'blend weights must sum to 1'

print(sum(w))

# (path to the submission CSV, blend weight) for every model in the blend.
# The loop variables are named so that they do not shadow the `w` tuple.
names = [('submissions/' + mapping[label][0], weight) for label, weight in zip(to_avg, w)]


1.0

In [19]:
# Load every submission CSV of the blend, in the same order as `names`.
all_pds = [pd.read_csv(csv_path) for csv_path, _weight in names]

In [20]:
# Number of rows in the first loaded submission (length of its 'id' column).
test_size = len(all_pds[0]['id'])

In [21]:
# Build the blended prediction: every 'Class_k' column is the weighted sum of
# that column over all loaded submissions, using the weights stored in `names`.
ids = all_pds[0]['id'].values

# Rows are combined purely by position, so every submission must list the
# same ids in the same order; otherwise predictions for different rows would
# be mixed together without any error.
for entry, frame in zip(names, all_pds):
    if not np.array_equal(frame['id'].values, ids):
        raise ValueError("'id' column of " + entry[0] + ' does not match ' + names[0][0])

predict_dict = {'id': ids}
for class_col in ['Class_' + str(k + 1) for k in range(9)]:
    blended = np.zeros(test_size)
    # Accumulate in the order of `names`, one submission at a time.
    for entry, frame in zip(names, all_pds):
        blended += frame[class_col].values * entry[1]
    predict_dict[class_col] = blended

In [22]:
# Turn the dict of equally long arrays into a frame, one column per key.
df_sub = pd.DataFrame(data=predict_dict)

In [23]:
# Display the column labels of the blended frame as a quick sanity check.
df_sub.columns


Out[23]:
Index(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9', 'id'], dtype='object')

In [24]:
# Write the blend to a file named after the current timestamp, so each run
# produces its own file name; the frame's index is not written.
fname = ''.join(['submissions/', now(), '.csv'])
df_sub.to_csv(fname, index=False)

In [ ]: