In [2]:
import numpy as np
import pandas as pd
import os
from sklearn import linear_model
from scipy import stats as stats

In [3]:
# load lb and test CIDs

def load_cid_list(relative_path):
    """Read a text file with one CID per line and return them as a list of ints.

    The path is resolved with the '__file__' string trick: abspath collapses
    the '../' components against the current working directory, so the file
    is found relative to this notebook's location.
    """
    with open(os.path.abspath('__file__' + relative_path)) as f:
        # int() tolerates the trailing newline on each line
        return [int(line) for line in f]

# load LB CIDs
lb_CIDs = load_cid_list("/../../../data/CID_leaderboard.txt")

# load test CIDs
test_CIDs = load_cid_list("/../../../data/CID_testset.txt")

In [4]:
# load the Morgan-fingerprint similarity matrix; used below as per-sample
# weights during training (rows/columns indexed by CID)
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../data/morgan_sim.csv"), index_col=0)
weights = morgan[morgan.index.astype(str)]  # keep only the columns whose CID appears in the index
# duplicate each row (the feature matrix has two rows per CID) and realign by CID;
# sort_index() is the non-deprecated equivalent of the argument-less .sort()
weights = pd.concat((weights, weights)).sort_index()
print(weights.shape)
weights.head()


(952, 476)
Out[4]:
126 176 177 180 196 239 240 241 243 244 ... 5366244 5367698 5367706 5368076 5371102 6114390 6429333 6999977 10857465 16220109
0
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
177 0.171429 0.285714 1.000000 0.285714 0.054054 0.095238 0.187500 0.000000 0.058824 0.000000 ... 0.041667 0.080000 0.086957 0.111111 0.083333 0.084211 0.238095 0.108108 0.028169 0.142857

5 rows × 476 columns


In [5]:
#load the features
# molecular-descriptor matrix: one row per (CID, dilution) pair; includes the
# 'CID', 'neglog10d' and 'Intensity' columns used for filtering below
features = pd.read_csv('features.csv', index_col=0)
features.head()


Out[5]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 neglog10d Intensity
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1 1
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 3 0
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 5 1
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 7 0
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 3 1

5 rows × 14615 columns


In [6]:
# map each target column index (0-20) to its perceptual descriptor name
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
    u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR',
    u'BURNT', u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS',
    u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL',
]))

In [7]:
# load the targets
# perceptual ratings: '#oID' holds the CID, 'individual' the subject count,
# and the remaining 21 columns are the descriptor ratings
all_targets = pd.read_csv('target.csv', index_col=0)
all_targets.head()


Out[7]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
2 176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
3 176 25 4.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
4 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118

5 rows × 23 columns


In [8]:
#load the best feature numbers
# cross-validated feature count per descriptor (one row per descriptor,
# same order as the `descriptor` mapping)
best_feature_numbers = pd.read_csv('best_feature_numbers_CV.csv',index_col=0, header=None)
best_feature_numbers


Out[8]:
1
0
INTENSITY/STRENGTH 500
VALENCE/PLEASANTNESS 200
BAKERY 2000
SWEET 300
FRUIT 200
FISH 20
GARLIC 10
SPICES 2000
COLD 300
SOUR 50
BURNT 50
ACID 200
WARM 500
MUSKY 800
SWEATY 2000
AMMONIA/URINOUS 400
DECAYED 400
WOOD 50
GRASS 1000
FLOWER 2000
CHEMICAL 2000

In [9]:
# predict LB
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]# remove test data 
features = features[~features.CID.isin(test_CIDs)] # remove test data 

train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
train_features = features[~features.CID.isin(lb_CIDs)] # exclude lb features from training
test_features = features[features.CID.isin(lb_CIDs)] 

# set the regressor
regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')


result = []
for idx in range(21):

    print(descriptor[idx])
    
    # load the scores for the descriptor
    scores = pd.read_csv('LB_scores/scores_' + str(idx) + '.csv',index_col=0)


    feature_number = int(best_feature_numbers.values[idx]) # set the number of best features to be used 
    
    X_all = train_features[scores.sort('0',ascending=0)[:feature_number].index] # set X values with the best features
    X_all['CID'] = train_features.CID # add the CIDs as a column

    for CID in lb_CIDs:

        Y_train = train_targets[['#oID',descriptor[idx]]]

        Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
        X = X_all[X_all.CID.isin(Y_train['#oID'])]
        weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)]

        regr.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)

        if idx == 0: # if predicting intensity, use 1/1000 dilutions (neglog10 of 1/1000 is 3)
            test_data = test_features[test_features.neglog10d == 3]
            test_data = test_data[test_data.CID == CID]
            test_data = test_data[scores.sort('0',ascending=0)[:feature_number].index]

        else: # otherwise use high dilution data (not that they differ in this target matrix from the low ones)
            test_data = test_features[test_features.Intensity == 1]
            test_data = test_data[test_data.CID == CID]
            test_data = test_data[scores.sort('0',ascending=0)[:feature_number].index]

        Y_test = regr.predict(test_data)
        std = -(Y_test**2)/2500.0+Y_test/25.0
        result.append([CID, descriptor[idx], Y_test,std])

result = pd.DataFrame(result)
result.columns = ['#oID', 'descriptor', 'value', 'sigma']


INTENSITY/STRENGTH
VALENCE/PLEASANTNESS
BAKERY
SWEET
FRUIT
FISH
GARLIC
SPICES
COLD
SOUR
BURNT
ACID
WARM
MUSKY
SWEATY
AMMONIA/URINOUS
DECAYED
WOOD
GRASS
FLOWER
CHEMICAL

In [10]:
result.value = result.value.astype(float)
result.sigma = result.sigma.astype(float)

# remove negative data and data above 100

result.value[result.value < 0] = 0 
result.value[result.value > 100] = 100

result.sigma[result.sigma < 0] = 0


#result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
result.to_csv('subchallenge2.txt',sep='\t',index =0)

In [ ]: