In [1]:
import pandas as pd
import numpy as np
# Load the training set into a DataFrame; per the summary shown below it has
# 3751 rows with an `Activity` column followed by descriptors D1..D1776.
data = pd.read_csv('data/train.csv')

In [2]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()


Out[2]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
count 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 ... 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000
mean 0.542255 0.076948 0.592436 0.068142 0.038990 0.212112 0.686653 0.274713 0.455133 0.749517 ... 0.026926 0.014663 0.013863 0.021861 0.015196 0.016796 0.012263 0.011730 0.020261 0.011197
std 0.498278 0.079989 0.105860 0.078414 0.115885 0.102592 0.078702 0.090017 0.162731 0.071702 ... 0.161889 0.120215 0.116938 0.146249 0.122348 0.128522 0.110074 0.107683 0.140911 0.105236
min 0.000000 0.000000 0.282128 0.000000 0.000000 0.002630 0.137873 0.006130 0.000000 0.275590 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.033300 0.517811 0.000000 0.000000 0.138118 0.625627 0.207374 0.378062 0.707339 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.066700 0.585989 0.050000 0.000000 0.190926 0.674037 0.277845 0.499942 0.738961 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 0.100000 0.668395 0.100000 0.000000 0.261726 0.740663 0.335816 0.569962 0.788177 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 0.964381 0.950000 1.000000 1.000000 0.994735 0.790831 0.989870 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 1777 columns


In [3]:
import columns as c_spec

# Target vector: the `Activity` column as a plain list of per-row labels.
# `.values` is used instead of `DataFrame.as_matrix`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; the resulting objects are the same.
y = list(data['Activity'].values)

# Names of the descriptor columns used as features. The `d+1` shift suggests
# `c_spec.collective` stores zero-based indices while the CSV columns start at
# D1 -- TODO confirm against columns.py.
# COL_NAMES = ['D%i' % (d+1) for d in (c_spec.mt50 + c_spec.bi)]
COL_NAMES = ['D%i' % (d+1) for d in c_spec.collective]

# Feature matrix restricted to the selected descriptor columns.
# x = data[['D%i' % d for d in range(1,1777)]].values
x = data[COL_NAMES].values

In [4]:
def writeout(y, filename='./result.csv'):
    """Write predictions to a two-column CSV file.

    Rows are numbered from 1 in the order of `y`. Each value is clamped to
    the interval [0, 1] and written with `%f` formatting.

    Args:
        y: iterable of numeric predictions, one per row.
        filename: path of the CSV file to create or overwrite.
    """
    header = 'MoleculeId,PredictedProbability\n'
    with open(filename, 'w') as out:
        out.write(header)
        for molecule_id, score in enumerate(y, start=1):
            # Clamp out-of-range scores; in-range values pass through as-is.
            clamped = 1 if score > 1 else (0 if score < 0 else score)
            out.write('%i,%f\n' % (molecule_id, clamped))

In [5]:
import scipy as sp
def llfun(act, pred):
    """Return the mean binary log loss of predictions against true labels.

    Uses NumPy directly rather than the NumPy aliases in the top-level
    `scipy` namespace (`sp.log`, `sp.maximum`, ...), which SciPy deprecated.

    Args:
        act: sequence or array of true labels (0 or 1).
        pred: sequence or array of predicted probabilities, same length
            as `act`.

    Returns:
        The negative mean log-likelihood as a float; lower is better.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return ll * -1.0 / len(act)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# `sklearn.cross_validation` was removed in scikit-learn 0.20. Prefer the
# `model_selection` API and fall back to the legacy module on old installs.
# Both forms yield (train_indices, test_indices) pairs for 5 unshuffled folds.
try:
    from sklearn.model_selection import KFold
    cv = KFold(n_splits=5).split(x)
except ImportError:
    from sklearn.cross_validation import KFold
    cv = KFold(len(x), n_folds=5)

rf = RandomForestClassifier(n_estimators=100)
y_arr = np.asarray(y)  # array form so folds can be selected by index arrays

# Cross-validation: fit on the training folds, score log loss on the held-out fold.
result_rf = []
for traincv, testcv in cv:
    rf.fit(x[traincv], y_arr[traincv])
    probas = rf.predict_proba(x[testcv])
    # Column 1 of predict_proba is the probability of the second class.
    result_rf.append(llfun(y_arr[testcv], probas[:, 1]))

# NOTE(review): after the loop `rf` is fitted on the last training split only,
# not on the full training set -- confirm that is intended before reusing it.
print(result_rf)


[0.46580941094379763, 0.47968132443929418, 0.46388064182428795]

In [7]:
# Load the test set and keep the same feature columns used for training.
# `.values` is used instead of `DataFrame.as_matrix`, which was removed in
# pandas 1.0; the resulting array is the same.
test_data = pd.read_csv('data/test.csv')[COL_NAMES].values

# Second column of predict_proba for every test row, as a plain list.
result_forest = list(rf.predict_proba(test_data)[:, 1])

In [8]:
# Save the random-forest predictions for the test set.
writeout(result_forest, filename='./result_forest.csv')

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosting regressor with 20 estimators, fitted on all training rows.
# `fit` returns the estimator itself, so construction and fitting are chained.
gbr = GradientBoostingRegressor(n_estimators=20).fit(x, y)

In [10]:
# Gradient-boosting regressor predictions for the test rows.
result_boost = gbr.predict(test_data)

In [13]:
# Save the gradient-boosting predictions; the path is writeout's default,
# spelled out here so the destination is visible at the call site.
writeout(result_boost, filename='./result.csv')

In [14]:
from sklearn.svm import SVC
# Support vector classifier; probability=True is required for predict_proba.
# `fit` returns the estimator itself, so construction and fitting are chained.
svc = SVC(probability=True).fit(x, y)

# Class-probability matrix for the test rows (one column per class).
result_svc = svc.predict_proba(test_data)

In [15]:
# Take the second probability column straight from the fitted model rather
# than re-slicing `result_svc` in place. The old self-referential form failed
# on a second run of this cell, because `result_svc` was by then a flat list
# of floats and `r[1]` is no longer valid.
result_svc = [r[1] for r in svc.predict_proba(test_data)]
writeout(result_svc, filename='./result_svc.csv')

In [ ]:


In [16]:
# Ensemble: base-model outputs on the training rows, used as stacking features.
# The two classifiers contribute second-column probabilities rather than hard
# 0/1 labels, so the features the meta-model is trained on have the same
# meaning as the probabilities (`result_svc`, `result_forest`) it is given
# for the test set.
# NOTE(review): these are in-sample predictions of already-fitted models;
# out-of-fold predictions would avoid leaking training labels -- confirm.
r_boost = gbr.predict(x)
r_svm = svc.predict_proba(x)[:, 1]
r_forest = rf.predict_proba(x)[:, 1]

In [17]:
# Meta-model training features: one [svm, forest] row per training sample.
# Three-model variant: [r_boost[i], r_svm[i], r_forest[i]]
x_en = [[r_svm[i], r_forest[i]] for i in range(len(r_svm))]

In [21]:
# Meta-model fitted on the stacked base-model outputs.
# Alternatives left disabled:
#   en_model = SVC(probability=True)
#   from sklearn.linear_model import LinearRegression
#   en_model = LinearRegression()
en_model = GradientBoostingRegressor(n_estimators=10).fit(x_en, y)

In [23]:
# Test-set features for the meta-model: one [svm, forest] row per test sample.
# Three-model variant: [result_boost[i], result_svc[i], result_forest[i]]
result_combined = [
    [result_svc[i], result_forest[i]] for i in range(len(result_forest))
]

# For a classifier meta-model, the disabled alternative is:
#   result_ensemble = en_model.predict_proba(result_combined)
#   result_ensemble = [el[1] for el in result_ensemble]
result_ensemble = en_model.predict(result_combined)

In [24]:
# Save the ensemble predictions; writeout clamps each value to [0, 1].
writeout(result_ensemble, filename='./ensemble.csv')

In [ ]:


In [ ]: