notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import os,sys
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

from scipy import stats as stats



In [2]:

    
# load lb, test and CV CIDs

# load LB CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt")) as f: 
    content = f.readlines()
lb_CIDs = list(content)  
lb_CIDs = [int(x) for x in lb_CIDs]

# load test CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt")) as f: 
    content = f.readlines()
test_CIDs = list(content)  
test_CIDs = [int(x) for x in test_CIDs]



In [3]:

    
# load morgan matrix to use them as weights in training
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/morgan_sim.csv"), index_col=0)
weights = morgan[morgan.index.astype(str)]
print weights.shape
weights.head()









    



(476, 476)






    Out[3]:






  
    
      
      126
      176
      177
      180
      196
      239
      240
      241
      243
      244
      ...
      5366244
      5367698
      5367706
      5368076
      5371102
      6114390
      6429333
      6999977
      10857465
      16220109
    
    
      0
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      1.000000
      0.108108
      0.171429
      0.054054
      0.066667
      0.090909
      0.509091
      0.166667
      0.315789
      0.290909
      ...
      0.033613
      0.183673
      0.195652
      0.051948
      0.252632
      0.237288
      0.307692
      0.066667
      0.000000
      0.050633
    
    
      176
      0.108108
      1.000000
      0.285714
      0.625000
      0.256410
      0.434783
      0.058824
      0.000000
      0.277778
      0.058824
      ...
      0.081633
      0.103896
      0.112676
      0.142857
      0.135135
      0.082474
      0.136364
      0.256410
      0.027397
      0.172414
    
    
      177
      0.171429
      0.285714
      1.000000
      0.285714
      0.054054
      0.095238
      0.187500
      0.000000
      0.058824
      0.000000
      ...
      0.041667
      0.080000
      0.086957
      0.111111
      0.083333
      0.084211
      0.238095
      0.108108
      0.028169
      0.142857
    
    
      180
      0.054054
      0.625000
      0.285714
      1.000000
      0.153846
      0.260870
      0.058824
      0.000000
      0.166667
      0.000000
      ...
      0.081633
      0.103896
      0.112676
      0.178571
      0.108108
      0.103093
      0.136364
      0.256410
      0.054795
      0.206897
    
    
      196
      0.066667
      0.256410
      0.054054
      0.153846
      1.000000
      0.434783
      0.035088
      0.000000
      0.169492
      0.070175
      ...
      0.148760
      0.140000
      0.127660
      0.227848
      0.144330
      0.066667
      0.059701
      0.258065
      0.000000
      0.172840
    
  

5 rows × 476 columns



In [4]:

    
#load the features
features = pd.read_csv('features.csv')
features.head()









    Out[4]:






  
    
      
      CID
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      ...
      91305518_2
      91411526_2
      91541756_2
      91552833_2
      91563027_2
      91595028_2
      91614181_2
      91617014_2
      91617930_2
      91618238_2
    
  
  
    
      0
      126
      0.181128
      0.270753
      0.030587
      0.262264
      0.219126
      0.253846
      0.214989
      0.216981
      0.425532
      ...
      0.000013
      0.000331
      0.014024
      0.000296
      0.021098
      0.000186
      0.003159
      0.002299
      0.000138
      0.011080
    
    
      1
      176
      0.060311
      0.109331
      0.025411
      0.096943
      0.105579
      0.090940
      0.107335
      0.125214
      0.659574
      ...
      0.000124
      0.000205
      0.008391
      0.000930
      0.001442
      0.000094
      0.000607
      0.001362
      0.000229
      0.004162
    
    
      2
      177
      0.020039
      0.067721
      0.015501
      0.075556
      0.083688
      0.078074
      0.089782
      0.106346
      0.382979
      ...
      0.000014
      0.000092
      0.000961
      0.000339
      0.000657
      0.000008
      0.000098
      0.000221
      0.000037
      0.001932
    
    
      3
      180
      0.051167
      0.104208
      0.011542
      0.121231
      0.131248
      0.127898
      0.139362
      0.099485
      0.269504
      ...
      0.000124
      0.000205
      0.003729
      0.000930
      0.000641
      0.000094
      0.000607
      0.001961
      0.000229
      0.001850
    
    
      4
      196
      0.221790
      0.333247
      0.023779
      0.306622
      0.308572
      0.294339
      0.305729
      0.138079
      0.539007
      ...
      0.001029
      0.000737
      0.013662
      0.009383
      0.001954
      0.000820
      0.003130
      0.005600
      0.002189
      0.010702
    
  

5 rows × 14613 columns



In [5]:

    
# give a number for each descriptor
descriptor = {}
for idx, desc in enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', 
                       u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    descriptor[idx] = desc



In [6]:

    
# load the targets
all_targets = pd.read_csv('target.csv')
all_targets.head()









    Out[6]:






  
    
      
      #oID
      individual
      INTENSITY/STRENGTH
      VALENCE/PLEASANTNESS
      BAKERY
      SWEET
      FRUIT
      FISH
      GARLIC
      SPICES
      ...
      ACID
      WARM
      MUSKY
      SWEATY
      AMMONIA/URINOUS
      DECAYED
      WOOD
      GRASS
      FLOWER
      CHEMICAL
    
  
  
    
      0
      126
      25
      24.653061
      49.465116
      0.674419
      25.953488
      6.581395
      0.302326
      1.720930
      3.906977
      ...
      3.046512
      0.790698
      8.023256
      1.604651
      1.209302
      5.069767
      1.348837
      1.441860
      9.906977
      14.813953
    
    
      1
      176
      25
      NaN
      45.944444
      3.666667
      8.166667
      1.777778
      0.000000
      10.388889
      6.055556
      ...
      4.166667
      6.111111
      8.666667
      2.166667
      5.222222
      4.388889
      2.611111
      2.166667
      5.944444
      4.222222
    
    
      2
      177
      25
      33.265306
      45.147059
      9.411765
      22.441176
      1.676471
      0.000000
      0.705882
      2.735294
      ...
      4.970588
      4.470588
      3.823529
      2.176471
      4.235294
      3.558824
      1.147059
      4.470588
      2.441176
      18.794118
    
    
      3
      196
      25
      6.877551
      51.000000
      2.000000
      15.000000
      1.500000
      0.000000
      2.178571
      7.035714
      ...
      3.678571
      7.642857
      7.750000
      6.964286
      3.178571
      5.071429
      1.357143
      2.535714
      11.000000
      7.392857
    
    
      4
      239
      25
      21.163265
      51.411765
      0.735294
      8.705882
      2.558824
      1.088235
      2.558824
      2.617647
      ...
      4.235294
      4.264706
      4.705882
      0.588235
      0.205882
      3.617647
      5.323529
      7.470588
      6.294118
      11.411765
    
  

5 rows × 23 columns



In [7]:

    
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]# remove test data 
features = features[~features.CID.isin(test_CIDs)] # remove test data



In [8]:

    
features['has_int'] = 1- all_targets['INTENSITY/STRENGTH'].isnull().values
features.head()









    Out[8]:






  
    
      
      CID
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      ...
      91411526_2
      91541756_2
      91552833_2
      91563027_2
      91595028_2
      91614181_2
      91617014_2
      91617930_2
      91618238_2
      has_int
    
  
  
    
      0
      126
      0.181128
      0.270753
      0.030587
      0.262264
      0.219126
      0.253846
      0.214989
      0.216981
      0.425532
      ...
      0.000331
      0.014024
      0.000296
      0.021098
      0.000186
      0.003159
      0.002299
      0.000138
      0.011080
      1
    
    
      1
      176
      0.060311
      0.109331
      0.025411
      0.096943
      0.105579
      0.090940
      0.107335
      0.125214
      0.659574
      ...
      0.000205
      0.008391
      0.000930
      0.001442
      0.000094
      0.000607
      0.001362
      0.000229
      0.004162
      0
    
    
      2
      177
      0.020039
      0.067721
      0.015501
      0.075556
      0.083688
      0.078074
      0.089782
      0.106346
      0.382979
      ...
      0.000092
      0.000961
      0.000339
      0.000657
      0.000008
      0.000098
      0.000221
      0.000037
      0.001932
      1
    
    
      4
      196
      0.221790
      0.333247
      0.023779
      0.306622
      0.308572
      0.294339
      0.305729
      0.138079
      0.539007
      ...
      0.000737
      0.013662
      0.009383
      0.001954
      0.000820
      0.003130
      0.005600
      0.002189
      0.010702
      1
    
    
      5
      239
      0.102724
      0.184880
      0.020081
      0.173157
      0.187816
      0.169183
      0.193197
      0.108919
      0.503546
      ...
      0.000353
      0.013443
      0.002281
      0.001322
      0.000230
      0.000844
      0.001863
      0.000439
      0.006400
      1
    
  

5 rows × 14614 columns



In [9]:

    
#load splits
trainsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv"),header=None)
testsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv"),header=None)



In [ ]:

    
rfc = RandomForestRegressor(n_estimators=10,max_features='auto',
                                    oob_score=False,n_jobs=1,random_state=0) 

regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')

# predict LB with different numbe r of features
for k in range(10): # iterate the splits
    print k
    # set a cv split as holdout data
    lb_CIDs = testsplits.ix[k,:].values

    for feature_number in [1,2,4,3,5,10,33,100,333,1000,3333,10000]: # iterate the number of features
        

            
        print('feature number',feature_number)
        sys.stdout.flush()

        

        train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
        train_features = features[~features.CID.isin(lb_CIDs)] # exclude lb features from training
        test_features = features[features.CID.isin(lb_CIDs)] 

        # set the regressor
        


        result = []
        result_RF = []
        for idx in range(21): #iterate through descriptors

            #print(descriptor[idx])

            # load the scores for the descriptor
            scores = pd.read_csv('scores/LB_scores_morgan' + str(k) + '/scores_' + str(idx) + '.csv',index_col=0)

           
            X_all = train_features[scores.sort_values(by='0',ascending=0)[:feature_number].index].copy() # set X values with the best features
            
            X_all['CID'] = train_features.CID # add the CIDs as a column
            

            for CID in lb_CIDs: #iterate through all crossvalidation CID and predict them one by one

                Y_train = train_targets[['#oID',descriptor[idx]]]

                Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
                
                X = X_all[X_all.CID.isin(Y_train['#oID'])]
                weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)] # set the weights for the given CID
                
                
                if idx == 0: # if predicting intensity, use 1/1000 dilutions has_int == 1
                    test_data = test_features[test_features.has_int == 1]
                    test_data = test_data[test_data.CID == CID]
                    test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]

                else: # otherwise use high dilution data (already what we have in the matrix)
                    #test_data = test_features[test_features.Intensity == 1]
                    test_data = test_features[test_features.CID == CID]
                    test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]


                # in case the data frame lenght is zero, dont try to predict (the molecules with no 1/1000 dilutions)
                if len(test_data) == 0:
                    print 'no target at CID ',CID
                else:
                    
                    #linear regression
                    regr.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)
                    Y_test = regr.predict(test_data)
                    std = -(Y_test**2)/2500.0+Y_test/25.0
                    result.append([CID, descriptor[idx], Y_test,std])
                    
 
                    #random forest regression
    
                    if (CID ==807) and (idx==0): # CID 807 (I2) has only one molecule with similarity above 0, weights screw up the random forest prediction
                        print k,idx, CID,'is 807, weights set to zero for random forest intensity prediction for this molecule'
                        weight.values[:] = 1
                
                    rfc.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)
                    Y_test = rfc.predict(test_data)
                    std = -(Y_test**2)/2500.0+Y_test/25.0
                    result_RF.append([CID, descriptor[idx], Y_test,std])
                    
        print 'length ', len(result)
        result = pd.DataFrame(result)
        result.columns = ['#oID', 'descriptor', 'value', 'sigma']
        result.value = result.value.astype(float)
        result.sigma = result.sigma.astype(float)

       
        result_RF = pd.DataFrame(result_RF)
        result_RF.columns = ['#oID', 'descriptor', 'value', 'sigma']
        result_RF.value = result_RF.value.astype(float)
        result_RF.sigma = result_RF.sigma.astype(float)

        # remove negative data and data above 100

        result.loc[result.value < 0,'value'] = 0 
        result.loc[result.value > 100,'value'] = 100
        result.loc[result.sigma < 0,'sigma'] = 0

        result_RF.loc[result_RF.value < 0,'value'] = 0 
        result_RF.loc[result_RF.value > 100,'value'] = 100
        result_RF.loc[result_RF.sigma < 0,'sigma'] = 0

        #result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
        result.to_csv('results_morgan/' + str(k) + '/subchallenge2_' +str(feature_number) + '.txt',sep='\t',index =0)
        result_RF.to_csv('results_morgan_RF/' + str(k) + '/subchallenge2_' +str(feature_number) + '.txt',sep='\t',index =0)



In [ ]:

	126	176	177	180	196	239	240	241	243	244	...	5366244	5367698	5367706	5368076	5371102	6114390	6429333	6999977	10857465	16220109
0
126	1.000000	0.108108	0.171429	0.054054	0.066667	0.090909	0.509091	0.166667	0.315789	0.290909	...	0.033613	0.183673	0.195652	0.051948	0.252632	0.237288	0.307692	0.066667	0.000000	0.050633
176	0.108108	1.000000	0.285714	0.625000	0.256410	0.434783	0.058824	0.000000	0.277778	0.058824	...	0.081633	0.103896	0.112676	0.142857	0.135135	0.082474	0.136364	0.256410	0.027397	0.172414
177	0.171429	0.285714	1.000000	0.285714	0.054054	0.095238	0.187500	0.000000	0.058824	0.000000	...	0.041667	0.080000	0.086957	0.111111	0.083333	0.084211	0.238095	0.108108	0.028169	0.142857
180	0.054054	0.625000	0.285714	1.000000	0.153846	0.260870	0.058824	0.000000	0.166667	0.000000	...	0.081633	0.103896	0.112676	0.178571	0.108108	0.103093	0.136364	0.256410	0.054795	0.206897
196	0.066667	0.256410	0.054054	0.153846	1.000000	0.434783	0.035088	0.000000	0.169492	0.070175	...	0.148760	0.140000	0.127660	0.227848	0.144330	0.066667	0.059701	0.258065	0.000000	0.172840

	CID	complexity from pubmed	MW	AMW	Sv	Se	Sp	Si	Mv	Me	...	91305518_2	91411526_2	91541756_2	91552833_2	91563027_2	91595028_2	91614181_2	91617014_2	91617930_2	91618238_2
0	126	0.181128	0.270753	0.030587	0.262264	0.219126	0.253846	0.214989	0.216981	0.425532	...	0.000013	0.000331	0.014024	0.000296	0.021098	0.000186	0.003159	0.002299	0.000138	0.011080
1	176	0.060311	0.109331	0.025411	0.096943	0.105579	0.090940	0.107335	0.125214	0.659574	...	0.000124	0.000205	0.008391	0.000930	0.001442	0.000094	0.000607	0.001362	0.000229	0.004162
2	177	0.020039	0.067721	0.015501	0.075556	0.083688	0.078074	0.089782	0.106346	0.382979	...	0.000014	0.000092	0.000961	0.000339	0.000657	0.000008	0.000098	0.000221	0.000037	0.001932
3	180	0.051167	0.104208	0.011542	0.121231	0.131248	0.127898	0.139362	0.099485	0.269504	...	0.000124	0.000205	0.003729	0.000930	0.000641	0.000094	0.000607	0.001961	0.000229	0.001850
4	196	0.221790	0.333247	0.023779	0.306622	0.308572	0.294339	0.305729	0.138079	0.539007	...	0.001029	0.000737	0.013662	0.009383	0.001954	0.000820	0.003130	0.005600	0.002189	0.010702

	#oID	individual	INTENSITY/STRENGTH	VALENCE/PLEASANTNESS	BAKERY	SWEET	FRUIT	FISH	GARLIC	SPICES	...	ACID	WARM	MUSKY	SWEATY	AMMONIA/URINOUS	DECAYED	WOOD	GRASS	FLOWER	CHEMICAL
0	126	25	24.653061	49.465116	0.674419	25.953488	6.581395	0.302326	1.720930	3.906977	...	3.046512	0.790698	8.023256	1.604651	1.209302	5.069767	1.348837	1.441860	9.906977	14.813953
1	176	25	NaN	45.944444	3.666667	8.166667	1.777778	0.000000	10.388889	6.055556	...	4.166667	6.111111	8.666667	2.166667	5.222222	4.388889	2.611111	2.166667	5.944444	4.222222
2	177	25	33.265306	45.147059	9.411765	22.441176	1.676471	0.000000	0.705882	2.735294	...	4.970588	4.470588	3.823529	2.176471	4.235294	3.558824	1.147059	4.470588	2.441176	18.794118
3	196	25	6.877551	51.000000	2.000000	15.000000	1.500000	0.000000	2.178571	7.035714	...	3.678571	7.642857	7.750000	6.964286	3.178571	5.071429	1.357143	2.535714	11.000000	7.392857
4	239	25	21.163265	51.411765	0.735294	8.705882	2.558824	1.088235	2.558824	2.617647	...	4.235294	4.264706	4.705882	0.588235	0.205882	3.617647	5.323529	7.470588	6.294118	11.411765