notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
data = pd.read_csv('data/train.csv')



In [3]:

    
data.describe()









    Out[3]:






  
    
      
      Activity
      D1
      D2
      D3
      D4
      D5
      D6
      D7
      D8
      D9
      ...
      D1767
      D1768
      D1769
      D1770
      D1771
      D1772
      D1773
      D1774
      D1775
      D1776
    
  
  
    
      count
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      ...
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
    
    
      mean
      0.542255
      0.076948
      0.592436
      0.068142
      0.038990
      0.212112
      0.686653
      0.274713
      0.455133
      0.749517
      ...
      0.026926
      0.014663
      0.013863
      0.021861
      0.015196
      0.016796
      0.012263
      0.011730
      0.020261
      0.011197
    
    
      std
      0.498278
      0.079989
      0.105860
      0.078414
      0.115885
      0.102592
      0.078702
      0.090017
      0.162731
      0.071702
      ...
      0.161889
      0.120215
      0.116938
      0.146249
      0.122348
      0.128522
      0.110074
      0.107683
      0.140911
      0.105236
    
    
      min
      0.000000
      0.000000
      0.282128
      0.000000
      0.000000
      0.002630
      0.137873
      0.006130
      0.000000
      0.275590
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.033300
      0.517811
      0.000000
      0.000000
      0.138118
      0.625627
      0.207374
      0.378062
      0.707339
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      1.000000
      0.066700
      0.585989
      0.050000
      0.000000
      0.190926
      0.674037
      0.277845
      0.499942
      0.738961
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      1.000000
      0.100000
      0.668395
      0.100000
      0.000000
      0.261726
      0.740663
      0.335816
      0.569962
      0.788177
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      0.964381
      0.950000
      1.000000
      1.000000
      0.994735
      0.790831
      0.989870
      1.000000
      ...
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
    
  

8 rows × 1777 columns



In [21]:

    
y = data.as_matrix(['Activity'])
x = data.as_matrix(['D%i' % d for d in range(1,1777)])
print(y)
print(x)









    



[[1]
 [1]
 [1]
 ..., 
 [0]
 [1]
 [0]]
[[ 0.          0.49700901  0.1        ...,  0.          0.          0.        ]
 [ 0.36666667  0.60629148  0.05       ...,  0.          1.          0.        ]
 [ 0.0333      0.48012427  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.2         0.52056377  0.         ...,  0.          0.          0.        ]
 [ 0.1         0.7656462   0.         ...,  0.          0.          0.        ]
 [ 0.13333333  0.53395198  0.         ...,  0.          0.          0.        ]]



In [22]:

    
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf = clf.fit(x, y)









    



/Users/yoshi/.pyenv/versions/anaconda3-2.3.0/lib/python3.4/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  app.launch_new_instance()



In [23]:

    
test_data = pd.read_csv('data/test.csv')
test_data = test_data.as_matrix()
result = clf.predict(test_data)



In [28]:

    
def writeout(y):
    with open('./result.csv', 'w') as f:
        f.write('MoleculeId,PredictedProbability\n')
        for index, verdict in enumerate(y):
            if verdict > 1:
                verdict = 1
            elif verdict < 0:
                verdict = 0
            f.write('%i,%f\n' % (index+1,verdict))



In [32]:

    
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=5)
gbr = gbr.fit(x, y)









    



/Users/yoshi/.pyenv/versions/anaconda3-2.3.0/lib/python3.4/site-packages/sklearn/utils/validation.py:449: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)



In [33]:

    
result_boost = gbr.predict(test_data)



In [34]:

    
writeout(result_boost)



In [ ]:

	Activity	D1	D2	D3	D4	D5	D6	D7	D8	D9	...	D1767	D1768	D1769	D1770	D1771	D1772	D1773	D1774	D1775	D1776
count	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	...	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000
mean	0.542255	0.076948	0.592436	0.068142	0.038990	0.212112	0.686653	0.274713	0.455133	0.749517	...	0.026926	0.014663	0.013863	0.021861	0.015196	0.016796	0.012263	0.011730	0.020261	0.011197
std	0.498278	0.079989	0.105860	0.078414	0.115885	0.102592	0.078702	0.090017	0.162731	0.071702	...	0.161889	0.120215	0.116938	0.146249	0.122348	0.128522	0.110074	0.107683	0.140911	0.105236
min	0.000000	0.000000	0.282128	0.000000	0.000000	0.002630	0.137873	0.006130	0.000000	0.275590	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.033300	0.517811	0.000000	0.000000	0.138118	0.625627	0.207374	0.378062	0.707339	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.066700	0.585989	0.050000	0.000000	0.190926	0.674037	0.277845	0.499942	0.738961	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	1.000000	0.100000	0.668395	0.100000	0.000000	0.261726	0.740663	0.335816	0.569962	0.788177	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	0.964381	0.950000	1.000000	1.000000	0.994735	0.790831	0.989870	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000