In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv('data/train.csv')

In [3]:
data.describe()


Out[3]:
Activity D1 D2 D3 D4 D5 D6 D7 D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 D1774 D1775 D1776
count 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 ... 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000 3751.000000
mean 0.542255 0.076948 0.592436 0.068142 0.038990 0.212112 0.686653 0.274713 0.455133 0.749517 ... 0.026926 0.014663 0.013863 0.021861 0.015196 0.016796 0.012263 0.011730 0.020261 0.011197
std 0.498278 0.079989 0.105860 0.078414 0.115885 0.102592 0.078702 0.090017 0.162731 0.071702 ... 0.161889 0.120215 0.116938 0.146249 0.122348 0.128522 0.110074 0.107683 0.140911 0.105236
min 0.000000 0.000000 0.282128 0.000000 0.000000 0.002630 0.137873 0.006130 0.000000 0.275590 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.033300 0.517811 0.000000 0.000000 0.138118 0.625627 0.207374 0.378062 0.707339 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.066700 0.585989 0.050000 0.000000 0.190926 0.674037 0.277845 0.499942 0.738961 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 0.100000 0.668395 0.100000 0.000000 0.261726 0.740663 0.335816 0.569962 0.788177 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 0.964381 0.950000 1.000000 1.000000 0.994735 0.790831 0.989870 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 1777 columns


In [21]:
y = data.as_matrix(['Activity'])
x = data.as_matrix(['D%i' % d for d in range(1,1777)])
print(y)
print(x)


[[1]
 [1]
 [1]
 ..., 
 [0]
 [1]
 [0]]
[[ 0.          0.49700901  0.1        ...,  0.          0.          0.        ]
 [ 0.36666667  0.60629148  0.05       ...,  0.          1.          0.        ]
 [ 0.0333      0.48012427  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.2         0.52056377  0.         ...,  0.          0.          0.        ]
 [ 0.1         0.7656462   0.         ...,  0.          0.          0.        ]
 [ 0.13333333  0.53395198  0.         ...,  0.          0.          0.        ]]

In [22]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf = clf.fit(x, y)


/Users/yoshi/.pyenv/versions/anaconda3-2.3.0/lib/python3.4/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  app.launch_new_instance()

In [23]:
test_data = pd.read_csv('data/test.csv')
test_data = test_data.as_matrix()
result = clf.predict(test_data)

In [28]:
def writeout(y):
    with open('./result.csv', 'w') as f:
        f.write('MoleculeId,PredictedProbability\n')
        for index, verdict in enumerate(y):
            if verdict > 1:
                verdict = 1
            elif verdict < 0:
                verdict = 0
            f.write('%i,%f\n' % (index+1,verdict))

In [32]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=5)
gbr = gbr.fit(x, y)


/Users/yoshi/.pyenv/versions/anaconda3-2.3.0/lib/python3.4/site-packages/sklearn/utils/validation.py:449: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [33]:
result_boost = gbr.predict(test_data)

In [34]:
writeout(result_boost)

In [ ]: