In [2]:
import pandas as pd
import numpy as np
# Read the training set into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('data/train.csv')
In [3]:
# Show pandas' per-column summary statistics for the loaded training frame.
data.describe()
Out[3]:
In [21]:
# Split the training frame into the target (`y`) and the feature matrix (`x`).
#
# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0, so
# the columns are selected explicitly and converted with `.to_numpy()`.
feature_columns = ['D%i' % d for d in range(1, 1777)]  # 'D1' .. 'D1776'

# Double brackets keep `y` two-dimensional, (n_samples, 1), exactly as the
# former `as_matrix(['Activity'])` call returned it.
y = data[['Activity']].to_numpy()
x = data[feature_columns].to_numpy()
print(y)
print(x)
In [22]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest's random draws are the same on every
# Restart & Run All.
clf = RandomForestRegressor(random_state=0)

# `y` is extracted as a column vector (n_samples, 1); scikit-learn expects a
# 1-D target for single-output regression and emits a DataConversionWarning
# otherwise. `np.ravel` is a no-op if `y` is already 1-D.
clf = clf.fit(x, np.ravel(y))
In [23]:
# Load the held-out set and score it with the random forest (`clf`).
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.to_numpy()` is the
# supported way to get the underlying array.
# NOTE(review): assumes test.csv holds exactly the feature columns used for
# training, in the same order -- confirm against the data files.
test_data = pd.read_csv('data/test.csv')
test_data = test_data.to_numpy()
result = clf.predict(test_data)
In [28]:
def writeout(y, path='./result.csv'):
    """Write predictions to a two-column CSV file.

    The file starts with the header ``MoleculeId,PredictedProbability``.
    Each following row holds a 1-based row number and the corresponding
    prediction, clamped to the closed interval [0, 1].

    Parameters
    ----------
    y : iterable of numbers
        Predicted values, in row order.
    path : str, optional
        Destination file. Defaults to ``'./result.csv'``, the location that
        was previously hard-coded. An existing file is overwritten.
    """
    with open(path, 'w') as f:
        f.write('MoleculeId,PredictedProbability\n')
        # Row ids are 1-based, hence start=1.
        for index, verdict in enumerate(y, start=1):
            # A regressor's output is unbounded; clamp it into [0, 1].
            if verdict > 1:
                verdict = 1
            elif verdict < 0:
                verdict = 0
            f.write('%i,%f\n' % (index, verdict))
In [32]:
from sklearn.ensemble import GradientBoostingRegressor

# A deliberately small ensemble (5 boosting stages). The seed is pinned so
# re-running the notebook gives the same model.
gbr = GradientBoostingRegressor(n_estimators=5, random_state=0)

# `y` is extracted as a column vector (n_samples, 1); flatten it to the 1-D
# target scikit-learn expects, which avoids a DataConversionWarning.
# `np.ravel` is a no-op if `y` is already 1-D.
gbr = gbr.fit(x, np.ravel(y))
In [33]:
result_boost = gbr.predict(test_data)
In [34]:
writeout(result_boost)
In [ ]: