In [1]:
import numpy as np
import pandas as pd
import os,sys
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

from scipy import stats as stats

In [2]:
# load lb, test and CV CIDs

# load LB CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt")) as f: 
    content = f.readlines()
lb_CIDs = list(content)  
lb_CIDs = [int(x) for x in lb_CIDs]

# load test CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt")) as f: 
    content = f.readlines()
test_CIDs = list(content)  
test_CIDs = [int(x) for x in test_CIDs]

In [3]:
# load morgan matrix to use them as weights in training
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/morgan_sim.csv"), index_col=0)
weights = morgan[morgan.index.astype(str)]
print weights.shape
weights.head()


(476, 476)
Out[3]:
126 176 177 180 196 239 240 241 243 244 ... 5366244 5367698 5367706 5368076 5371102 6114390 6429333 6999977 10857465 16220109
0
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
177 0.171429 0.285714 1.000000 0.285714 0.054054 0.095238 0.187500 0.000000 0.058824 0.000000 ... 0.041667 0.080000 0.086957 0.111111 0.083333 0.084211 0.238095 0.108108 0.028169 0.142857
180 0.054054 0.625000 0.285714 1.000000 0.153846 0.260870 0.058824 0.000000 0.166667 0.000000 ... 0.081633 0.103896 0.112676 0.178571 0.108108 0.103093 0.136364 0.256410 0.054795 0.206897
196 0.066667 0.256410 0.054054 0.153846 1.000000 0.434783 0.035088 0.000000 0.169492 0.070175 ... 0.148760 0.140000 0.127660 0.227848 0.144330 0.066667 0.059701 0.258065 0.000000 0.172840

5 rows × 476 columns


In [4]:
#load the features
features = pd.read_csv('features.csv')
features.head()


Out[4]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
1 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
2 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932
3 180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 ... 0.000124 0.000205 0.003729 0.000930 0.000641 0.000094 0.000607 0.001961 0.000229 0.001850
4 196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 ... 0.001029 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702

5 rows × 14613 columns


In [5]:
# give a number for each descriptor
descriptor = {}
for idx, desc in enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', 
                       u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    descriptor[idx] = desc

In [6]:
# load the targets
all_targets = pd.read_csv('target.csv')
all_targets.head()


Out[6]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 176 25 NaN 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
2 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118
3 196 25 6.877551 51.000000 2.000000 15.000000 1.500000 0.000000 2.178571 7.035714 ... 3.678571 7.642857 7.750000 6.964286 3.178571 5.071429 1.357143 2.535714 11.000000 7.392857
4 239 25 21.163265 51.411765 0.735294 8.705882 2.558824 1.088235 2.558824 2.617647 ... 4.235294 4.264706 4.705882 0.588235 0.205882 3.617647 5.323529 7.470588 6.294118 11.411765

5 rows × 23 columns


In [7]:
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]# remove test data 
features = features[~features.CID.isin(test_CIDs)] # remove test data

In [8]:
features['has_int'] = 1- all_targets['INTENSITY/STRENGTH'].isnull().values
features.head()


Out[8]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 has_int
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1
1 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 0
2 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 1
4 196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 ... 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702 1
5 239 0.102724 0.184880 0.020081 0.173157 0.187816 0.169183 0.193197 0.108919 0.503546 ... 0.000353 0.013443 0.002281 0.001322 0.000230 0.000844 0.001863 0.000439 0.006400 1

5 rows × 14614 columns


In [9]:
#load splits
trainsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv"),header=None)
testsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv"),header=None)

In [ ]:
rfc = RandomForestRegressor(n_estimators=10,max_features='auto',
                                    oob_score=False,n_jobs=1,random_state=0) 

regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')

# predict LB with different numbe r of features
for k in range(10): # iterate the splits
    print k
    # set a cv split as holdout data
    lb_CIDs = testsplits.ix[k,:].values

    for feature_number in [1,2,4,3,5,10,33,100,333,1000,3333,10000]: # iterate the number of features
        

            
        print('feature number',feature_number)
        sys.stdout.flush()

        

        train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
        train_features = features[~features.CID.isin(lb_CIDs)] # exclude lb features from training
        test_features = features[features.CID.isin(lb_CIDs)] 

        # set the regressor
        


        result = []
        result_RF = []
        for idx in range(21): #iterate through descriptors

            #print(descriptor[idx])

            # load the scores for the descriptor
            scores = pd.read_csv('scores/LB_scores_morgan' + str(k) + '/scores_' + str(idx) + '.csv',index_col=0)

           
            X_all = train_features[scores.sort_values(by='0',ascending=0)[:feature_number].index].copy() # set X values with the best features
            
            X_all['CID'] = train_features.CID # add the CIDs as a column
            

            for CID in lb_CIDs: #iterate through all crossvalidation CID and predict them one by one

                Y_train = train_targets[['#oID',descriptor[idx]]]

                Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
                
                X = X_all[X_all.CID.isin(Y_train['#oID'])]
                weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)] # set the weights for the given CID
                
                
                if idx == 0: # if predicting intensity, use 1/1000 dilutions has_int == 1
                    test_data = test_features[test_features.has_int == 1]
                    test_data = test_data[test_data.CID == CID]
                    test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]

                else: # otherwise use high dilution data (already what we have in the matrix)
                    #test_data = test_features[test_features.Intensity == 1]
                    test_data = test_features[test_features.CID == CID]
                    test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]


                # in case the data frame lenght is zero, dont try to predict (the molecules with no 1/1000 dilutions)
                if len(test_data) == 0:
                    print 'no target at CID ',CID
                else:
                    
                    #linear regression
                    regr.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)
                    Y_test = regr.predict(test_data)
                    std = -(Y_test**2)/2500.0+Y_test/25.0
                    result.append([CID, descriptor[idx], Y_test,std])
                    
 
                    #random forest regression
    
                    if (CID ==807) and (idx==0): # CID 807 (I2) has only one molecule with similarity above 0, weights screw up the random forest prediction
                        print k,idx, CID,'is 807, weights set to zero for random forest intensity prediction for this molecule'
                        weight.values[:] = 1
                
                    rfc.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)
                    Y_test = rfc.predict(test_data)
                    std = -(Y_test**2)/2500.0+Y_test/25.0
                    result_RF.append([CID, descriptor[idx], Y_test,std])
                    
        print 'length ', len(result)
        result = pd.DataFrame(result)
        result.columns = ['#oID', 'descriptor', 'value', 'sigma']
        result.value = result.value.astype(float)
        result.sigma = result.sigma.astype(float)

       
        result_RF = pd.DataFrame(result_RF)
        result_RF.columns = ['#oID', 'descriptor', 'value', 'sigma']
        result_RF.value = result_RF.value.astype(float)
        result_RF.sigma = result_RF.sigma.astype(float)

        # remove negative data and data above 100

        result.loc[result.value < 0,'value'] = 0 
        result.loc[result.value > 100,'value'] = 100
        result.loc[result.sigma < 0,'sigma'] = 0

        result_RF.loc[result_RF.value < 0,'value'] = 0 
        result_RF.loc[result_RF.value > 100,'value'] = 100
        result_RF.loc[result_RF.sigma < 0,'sigma'] = 0

        #result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
        result.to_csv('results_morgan/' + str(k) + '/subchallenge2_' +str(feature_number) + '.txt',sep='\t',index =0)
        result_RF.to_csv('results_morgan_RF/' + str(k) + '/subchallenge2_' +str(feature_number) + '.txt',sep='\t',index =0)

In [ ]: