In [1]:
import pandas as pd
import numpy as np
# Load the labelled training set into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/train.csv')
In [2]:
# Show per-column summary statistics of the training frame as the cell output.
data.describe()
Out[2]:
In [3]:
# Target labels as a flat list of scalars, one per training row.
# `DataFrame.as_matrix` was removed in pandas 1.0; `.values` works on old and new pandas.
y = list(data['Activity'].values)
import columns as c_spec
# Feature columns are named 'D<n>'; the indices in `c_spec` are shifted by +1 to
# build those names (presumably they are zero-based -- confirm against columns.py).
# Alternative subset tried earlier:
# COL_NAMES = ['D%i' % (d+1) for d in (c_spec.mt50 + c_spec.bi)]
COL_NAMES = ['D%i' % (d + 1) for d in c_spec.collective]

# Feature matrix restricted to the selected columns.
# `DataFrame.as_matrix` was removed in pandas 1.0; select the columns and take `.values`.
# To use every descriptor column instead:
# x = data[['D%i' % d for d in range(1, 1777)]].values
x = data[COL_NAMES].values
In [4]:
def writeout(y, filename='./result.csv'):
    """Write predictions to a two-column CSV submission file.

    The file starts with the header ``MoleculeId,PredictedProbability``. Rows are
    numbered from 1 in iteration order, and every value is clamped into [0, 1]
    before being formatted with ``%f``.

    Args:
        y: iterable of numeric predictions.
        filename: path of the file to create (overwritten if it already exists).
    """
    with open(filename, 'w') as out:
        out.write('MoleculeId,PredictedProbability\n')
        for molecule_id, score in enumerate(y, start=1):
            # Regressor outputs can fall outside the probability range; clamp them.
            clamped = 1 if score > 1 else (0 if score < 0 else score)
            out.write('%i,%f\n' % (molecule_id, clamped))
In [5]:
import scipy as sp
def llfun(act, pred):
    """Return the mean binary log loss (cross-entropy) of `pred` against `act`.

    Predictions are clipped to [1e-15, 1 - 1e-15] so that log(0) never occurs.

    Args:
        act: sequence of true labels (0 or 1).
        pred: sequence of predicted probabilities for the positive class,
            the same length as `act`.

    Returns:
        The average negative log-likelihood as a float.

    Raises:
        ValueError: if `act` is empty (the mean would be undefined).
    """
    epsilon = 1e-15
    # Use NumPy directly: the `scipy.maximum` / `scipy.log` style aliases on the
    # SciPy root namespace are deprecated.
    act = np.asarray(act, dtype=float)
    if act.size == 0:
        raise ValueError('llfun requires at least one observation')
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    log_likelihood = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return -log_likelihood / len(act)
In [6]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` was removed in scikit-learn 0.20; KFold now lives in
# `sklearn.model_selection` and yields index pairs from `.split(...)`.
from sklearn.model_selection import KFold

# 5-fold cross-validation of a 100-tree random forest, scored with log loss.
rf = RandomForestClassifier(n_estimators=100)
cv = KFold(n_splits=5)
x_arr = np.asarray(x)
y_arr = np.asarray(y)
result_rf = []
for traincv, testcv in cv.split(x_arr):
    rf.fit(x_arr[traincv], y_arr[traincv])
    # Column 1 of predict_proba is the probability of the second class
    # (class 1 when the labels are 0/1).
    probas = rf.predict_proba(x_arr[testcv])
    result_rf.append(llfun(y_arr[testcv], probas[:, 1]))
print(result_rf)

# Refit on every training row: later cells reuse `rf` for test-set predictions, and
# without this it would still hold the model trained on only the last fold's split.
rf.fit(x_arr, y_arr)
In [7]:
# Load the test set and keep only the selected feature columns as a matrix.
# `DataFrame.as_matrix` was removed in pandas 1.0; select the columns and take `.values`.
# Note: `test_data` is deliberately rebound from DataFrame to ndarray because later
# cells pass it straight to the fitted models.
test_data = pd.read_csv('data/test.csv')
test_data = test_data[COL_NAMES].values

# Positive-class probability from the random forest, one entry per test row.
result_forest = list(rf.predict_proba(test_data)[:, 1])
In [8]:
# Save the random-forest probabilities as a submission file.
writeout(result_forest, filename='./result_forest.csv')
In [9]:
from sklearn.ensemble import GradientBoostingRegressor
# Fit a 20-stage gradient-boosting regressor on the full training set
# (`fit` returns the estimator itself, so construction and fitting can be chained).
gbr = GradientBoostingRegressor(n_estimators=20).fit(x, y)
In [10]:
# Regression outputs for the test matrix; these are not bounded probabilities and
# may fall outside [0, 1] (writeout clamps them when saving).
result_boost = gbr.predict(test_data)
In [13]:
# Save the boosting predictions to writeout's default path ('./result.csv').
writeout(result_boost)
In [14]:
from sklearn.svm import SVC
# Fit an SVM with probability estimates enabled, then score the test matrix.
# `predict_proba` returns one column per class.
svc = SVC(probability=True).fit(x, y)
result_svc = svc.predict_proba(test_data)
In [15]:
# Keep only the second probability column (index 1) as a flat list, then save it.
# This rebinds `result_svc`, so the cell cannot be re-run without re-running the
# previous one first.
result_svc = list(result_svc[:, 1])
writeout(result_svc, filename='./result_svc.csv')
In [ ]:
In [16]:
# Ensemble: base-model outputs on the training rows, used as meta-features.
# The SVM and forest features must be positive-class probabilities, not hard class
# labels from `predict`, because the stacked model is later applied to
# probabilities (`result_svc`, `result_forest`) on the test set.
# NOTE(review): these are in-sample predictions from models fitted on the same rows,
# so the meta-features are optimistic; out-of-fold predictions would be safer.
r_boost = gbr.predict(x)
r_svm = svc.predict_proba(x)[:, 1]
r_forest = rf.predict_proba(x)[:, 1]
In [17]:
# Meta-feature rows for the stacked model: one [svm, forest] pair per training row.
# (A three-feature variant with r_boost[i] as the first entry was tried and dropped.)
x_en = [[r_svm[i], r_forest[i]] for i in range(len(r_svm))]
In [21]:
# Stacked model: a 10-stage gradient-boosting regressor over the base-model outputs.
# Alternatives considered earlier: SVC(probability=True) and
# sklearn.linear_model.LinearRegression().
en_model = GradientBoostingRegressor(n_estimators=10).fit(x_en, y)
In [23]:
# Test-set meta-features in the same [svm, forest] order used to train `en_model`.
# (The three-feature variant would put result_boost[i] first.)
result_combined = [
    [result_svc[i], result_forest[i]]
    for i in range(len(result_forest))
]
# `en_model` is a regressor, so use `predict`; a classifier stacker would instead
# need predict_proba and its second column.
result_ensemble = en_model.predict(result_combined)
In [24]:
# Save the stacked-model predictions; writeout clamps each value into [0, 1].
writeout(result_ensemble, filename='./ensemble.csv')
In [ ]:
In [ ]: