notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
# Load the training set into a DataFrame; later cells read the `Activity`
# column as the target and a subset of the `D<n>` columns as features.
data = pd.read_csv('data/train.csv')



In [2]:

    
# Per-column summary statistics; left as the last expression so the notebook
# renders the resulting table as this cell's output.
data.describe()









    Out[2]:






  
    
      
      Activity
      D1
      D2
      D3
      D4
      D5
      D6
      D7
      D8
      D9
      ...
      D1767
      D1768
      D1769
      D1770
      D1771
      D1772
      D1773
      D1774
      D1775
      D1776
    
  
  
    
      count
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      ...
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
      3751.000000
    
    
      mean
      0.542255
      0.076948
      0.592436
      0.068142
      0.038990
      0.212112
      0.686653
      0.274713
      0.455133
      0.749517
      ...
      0.026926
      0.014663
      0.013863
      0.021861
      0.015196
      0.016796
      0.012263
      0.011730
      0.020261
      0.011197
    
    
      std
      0.498278
      0.079989
      0.105860
      0.078414
      0.115885
      0.102592
      0.078702
      0.090017
      0.162731
      0.071702
      ...
      0.161889
      0.120215
      0.116938
      0.146249
      0.122348
      0.128522
      0.110074
      0.107683
      0.140911
      0.105236
    
    
      min
      0.000000
      0.000000
      0.282128
      0.000000
      0.000000
      0.002630
      0.137873
      0.006130
      0.000000
      0.275590
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.033300
      0.517811
      0.000000
      0.000000
      0.138118
      0.625627
      0.207374
      0.378062
      0.707339
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      1.000000
      0.066700
      0.585989
      0.050000
      0.000000
      0.190926
      0.674037
      0.277845
      0.499942
      0.738961
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      1.000000
      0.100000
      0.668395
      0.100000
      0.000000
      0.261726
      0.740663
      0.335816
      0.569962
      0.788177
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      0.964381
      0.950000
      1.000000
      1.000000
      0.994735
      0.790831
      0.989870
      1.000000
      ...
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
    
  

8 rows × 1777 columns



In [3]:

    
import columns as c_spec

# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0.
# Plain column selection plus `.values` yields the same arrays and works on
# both old and current pandas releases.

# Target labels as a flat list, one entry per training row.
y = data['Activity'].tolist()

# Names of the feature columns to train on. The CSV columns are named
# D1, D2, ...; the +1 shifts each index from `c_spec.collective` onto that
# naming (presumably the indices are zero-based — confirm in columns.py).
# Alternatives tried earlier: `c_spec.mt50 + c_spec.bi`, or every column
# D1..D1776.
COL_NAMES = ['D%i' % (d + 1) for d in c_spec.collective]

# Feature matrix, one row per molecule and one column per selected descriptor.
x = data[COL_NAMES].values



In [4]:

    
def writeout(y, filename='./result.csv'):
    """Write predictions to a two-column CSV submission file.

    Args:
        y: iterable of numeric predictions, one per molecule, in order.
        filename: path of the CSV file to create or overwrite.

    Each row is ``<1-based position>,<prediction>``; predictions above 1 are
    written as 1 and predictions below 0 are written as 0.
    """
    with open(filename, 'w') as out:
        out.write('MoleculeId,PredictedProbability\n')
        for molecule_id, prob in enumerate(y, start=1):
            # Explicit comparisons (rather than min/max) keep values that
            # compare as neither >1 nor <0 untouched.
            clipped = 1 if prob > 1 else (0 if prob < 0 else prob)
            out.write('%i,%f\n' % (molecule_id, clipped))



In [5]:

    
import scipy as sp  # kept so any cell relying on the `sp` alias still works
def llfun(act, pred):
    """Mean binary log loss (cross-entropy) between labels and probabilities.

    Args:
        act: sequence of true labels (0 or 1).
        pred: sequence of predicted probabilities for label 1, same length.

    Returns:
        The average negative log-likelihood as a float; lower is better.

    Raises:
        ZeroDivisionError: if ``act`` is empty.

    Predictions are clipped to [1e-15, 1 - 1e-15] so that a confident wrong
    answer gives a large finite penalty instead of ``log(0)``.
    """
    # The NumPy aliases in the scipy namespace (sp.log, sp.maximum, ...) are
    # deprecated, so use numpy (imported as `np` at the top of the notebook).
    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    # Vectorised sum instead of the builtin `sum`, which iterates element by
    # element in Python.
    total = float(np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred)))
    return -total / len(act)



In [6]:

    
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` was removed in scikit-learn 0.20. KFold now lives
# in `sklearn.model_selection`, takes `n_splits`, and yields index arrays
# from `.split(X)`.
from sklearn.model_selection import KFold

# Fixed seed so the fold scores and the fitted forest are reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
cv = KFold(n_splits=5)

# ndarray views allow selecting each fold with its index array directly.
x_arr = np.asarray(x)
y_arr = np.asarray(y)

# Log loss of the forest on each held-out fold.
result_rf = []
for train_idx, test_idx in cv.split(x_arr):
    rf.fit(x_arr[train_idx], y_arr[train_idx])
    # Column 1 is taken as P(Activity == 1); assumes rf.classes_ is [0, 1].
    fold_proba = rf.predict_proba(x_arr[test_idx])[:, 1]
    result_rf.append(llfun(y_arr[test_idx], fold_proba))

# Later cells use `rf` to score the test set. Without this refit it would
# only have seen the training part of the last fold (4/5 of the rows).
rf.fit(x_arr, y_arr)

print(result_rf)









    



[0.46580941094379763, 0.47968132443929418, 0.46388064182428795]



In [7]:

    
# Feature matrix for the test set, restricted to the same columns as training.
# (`DataFrame.as_matrix` was removed in pandas 1.0; column selection plus
# `.values` is the equivalent that works on old and new pandas.)
test_data = pd.read_csv('data/test.csv')[COL_NAMES].values

# Keep only column 1 of predict_proba, taken as P(Activity == 1)
# (assumes rf.classes_ is [0, 1]). Stored as a list, one value per test row.
result_forest = list(rf.predict_proba(test_data)[:, 1])



In [8]:

    
# Save the random-forest probabilities as a submission file.
writeout(result_forest, filename='./result_forest.csv')



In [9]:

    
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted regressor on the 0/1 labels. fit() returns the estimator
# itself, so construction and training are chained.
gbr = GradientBoostingRegressor(n_estimators=20).fit(x, y)



In [10]:

    
# Regression scores for the test rows. These are not bounded probabilities;
# writeout() clamps anything outside [0, 1] when the file is written.
result_boost = gbr.predict(test_data)



In [13]:

    
# Save the boosting scores; the target path is writeout()'s default, spelled
# out here so the destination is visible at the call site.
writeout(result_boost, filename='./result.csv')



In [14]:

    
from sklearn.svm import SVC
# probability=True enables predict_proba on the fitted SVC.
svc = SVC(probability=True).fit(x, y)
# Per-class probabilities for every test row (one column per class).
result_svc = svc.predict_proba(test_data)



In [15]:

    
# Derive the positive-class column from the fitted model rather than slicing
# `result_svc` in place. Overwriting a variable with a slice of itself made
# this cell fail when run a second time, because the rows were scalars by then.
# Column 1 is taken as P(Activity == 1); assumes svc.classes_ is [0, 1].
result_svc = list(svc.predict_proba(test_data)[:, 1])
writeout(result_svc, filename='./result_svc.csv')



In [ ]:



In [16]:

    
# Ensemble: base-model outputs on the training rows, used as the stacking
# model's input features.
#
# At test time the stacker is fed result_svc / result_forest, which are
# positive-class probabilities. The training features must be the same kind
# of value: `predict` returns hard 0/1 labels, so the stacker would be fit on
# inputs unlike the ones it later scores.
# Column 1 is taken as P(Activity == 1); assumes classes_ is [0, 1].
#
# NOTE(review): these are in-sample predictions from models that were trained
# on `x`, so they are optimistic; out-of-fold predictions would be a sounder
# basis for stacking.
r_boost = gbr.predict(x)
r_svm = svc.predict_proba(x)[:, 1]
r_forest = rf.predict_proba(x)[:, 1]



In [17]:

    
# Training features for the ensemble: one [svm, forest] pair per training row.
# The boosting output (r_boost) is deliberately left out of the pair.
x_en = [[r_svm[i], r_forest[i]] for i in range(len(r_svm))]



In [21]:

    
# Second-stage model fitted on the stacked base-model outputs.
# Alternatives considered here: SVC(probability=True) and
# sklearn.linear_model.LinearRegression.
en_model = GradientBoostingRegressor(n_estimators=10).fit(x_en, y)



In [23]:

    
# Test-time features for the ensemble, in the same [svm, forest] order that
# was used to build x_en (the boosting output is left out here as well).
result_combined = [
    [result_svc[i], result_forest[i]]
    for i in range(len(result_forest))
]
# en_model is a regressor, so predict() gives the score directly
# (no predict_proba column selection needed).
result_ensemble = en_model.predict(result_combined)



In [24]:

    
# Save the ensemble scores as a submission file.
writeout(result_ensemble, './ensemble.csv')



In [ ]:



In [ ]:

	Activity	D1	D2	D3	D4	D5	D6	D7	D8	D9	...	D1767	D1768	D1769	D1770	D1771	D1772	D1773	D1774	D1775	D1776
count	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	...	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000	3751.000000
mean	0.542255	0.076948	0.592436	0.068142	0.038990	0.212112	0.686653	0.274713	0.455133	0.749517	...	0.026926	0.014663	0.013863	0.021861	0.015196	0.016796	0.012263	0.011730	0.020261	0.011197
std	0.498278	0.079989	0.105860	0.078414	0.115885	0.102592	0.078702	0.090017	0.162731	0.071702	...	0.161889	0.120215	0.116938	0.146249	0.122348	0.128522	0.110074	0.107683	0.140911	0.105236
min	0.000000	0.000000	0.282128	0.000000	0.000000	0.002630	0.137873	0.006130	0.000000	0.275590	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.033300	0.517811	0.000000	0.000000	0.138118	0.625627	0.207374	0.378062	0.707339	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.066700	0.585989	0.050000	0.000000	0.190926	0.674037	0.277845	0.499942	0.738961	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	1.000000	0.100000	0.668395	0.100000	0.000000	0.261726	0.740663	0.335816	0.569962	0.788177	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	0.964381	0.950000	1.000000	1.000000	0.994735	0.790831	0.989870	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000