In [2]:
import numpy as np
import pandas as pd
import os
from sklearn import linear_model
from scipy import stats as stats

In [3]:
# load lb and test CIDs

def load_cid_list(relative_path):
    """Read a text file with one CID per line and return them as a list of ints.

    The path is resolved with the '__file__' string trick: abspath collapses
    the '../' components against the current working directory, so the file
    is found relative to this notebook's location.
    """
    with open(os.path.abspath('__file__' + relative_path)) as f:
        # int() tolerates the trailing newline on each line
        return [int(line) for line in f]

# load LB CIDs
lb_CIDs = load_cid_list("/../../../data/CID_leaderboard.txt")

# load test CIDs
test_CIDs = load_cid_list("/../../../data/CID_testset.txt")

In [4]:
# load the Morgan-fingerprint similarity matrix; used below as per-sample
# weights during training (rows/columns indexed by CID)
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../data/morgan_sim.csv"), index_col=0)
weights = morgan[morgan.index.astype(str)]  # keep only the columns whose CID appears in the index
# duplicate each row (the feature matrix has two rows per CID) and realign by CID;
# sort_index() is the non-deprecated equivalent of the argument-less .sort()
weights = pd.concat((weights, weights)).sort_index()
print(weights.shape)
weights.head()


(952, 476)
Out[4]:
126 176 177 180 196 239 240 241 243 244 ... 5366244 5367698 5367706 5368076 5371102 6114390 6429333 6999977 10857465 16220109
0
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
177 0.171429 0.285714 1.000000 0.285714 0.054054 0.095238 0.187500 0.000000 0.058824 0.000000 ... 0.041667 0.080000 0.086957 0.111111 0.083333 0.084211 0.238095 0.108108 0.028169 0.142857

5 rows × 476 columns


In [5]:
#load the features
# molecular-descriptor matrix: one row per (CID, dilution) pair; includes the
# 'CID', 'neglog10d' and 'Intensity' columns used for filtering below
features = pd.read_csv('features.csv', index_col=0)
features.head()


Out[5]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 neglog10d Intensity
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1 1
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 3 0
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 5 1
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 7 0
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 3 1

5 rows × 14615 columns


In [6]:
# map each target column index (0-20) to its perceptual descriptor name
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
    u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR',
    u'BURNT', u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS',
    u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL',
]))

In [7]:
# load the targets
# perceptual ratings: '#oID' holds the CID, 'individual' the subject count,
# and the remaining 21 columns are the descriptor ratings
all_targets = pd.read_csv('target.csv', index_col=0)
all_targets.head()


Out[7]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
2 176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
3 176 25 4.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
4 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118

5 rows × 23 columns


In [8]:
#load the best feature numbers
# cross-validated feature count per descriptor (one row per descriptor,
# same order as the `descriptor` mapping)
best_feature_numbers = pd.read_csv('best_feature_numbers_CV.csv',index_col=0, header=None)
best_feature_numbers


Out[8]:
1
0
INTENSITY/STRENGTH 500
VALENCE/PLEASANTNESS 200
BAKERY 2000
SWEET 300
FRUIT 200
FISH 20
GARLIC 10
SPICES 2000
COLD 300
SOUR 50
BURNT 50
ACID 200
WARM 500
MUSKY 800
SWEATY 2000
AMMONIA/URINOUS 400
DECAYED 400
WOOD 50
GRASS 1000
FLOWER 2000
CHEMICAL 2000

In [9]:
# predict LB
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]# remove test data 
features = features[~features.CID.isin(test_CIDs)] # remove test data 

train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
train_features = features[~features.CID.isin(lb_CIDs)] # exclude lb features from training
test_features = features[features.CID.isin(lb_CIDs)] 

# set the regressor
regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')


result = []
for idx in range(21):

    print(descriptor[idx])
    
    # load the scores for the descriptor
    scores = pd.read_csv('LB_scores/scores_' + str(idx) + '.csv',index_col=0)


    feature_number = int(best_feature_numbers.values[idx]) # set the number of best features to be used 
    
    X_all = train_features[scores.sort('0',ascending=0)[:feature_number].index] # set X values with the best features
    X_all['CID'] = train_features.CID # add the CIDs as a column

    for CID in lb_CIDs:

        Y_train = train_targets[['#oID',descriptor[idx]]]

        Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
        X = X_all[X_all.CID.isin(Y_train['#oID'])]
        weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)]

        regr.fit(X.drop('CID',1),Y_train[descriptor[idx]], sample_weight = weight.values)

        if idx == 0: # if predicting intensity, use 1/1000 dilutions (neglog10 of 1/1000 is 3)
            test_data = test_features[test_features.neglog10d == 3]
            test_data = test_data[test_data.CID == CID]
            test_data = test_data[scores.sort('0',ascending=0)[:feature_number].index]

        else: # otherwise use high dilution data (not that they differ in this target matrix from the low ones)
            test_data = test_features[test_features.Intensity == 1]
            test_data = test_data[test_data.CID == CID]
            test_data = test_data[scores.sort('0',ascending=0)[:feature_number].index]

        Y_test = regr.predict(test_data)
        std = -(Y_test**2)/2500.0+Y_test/25.0
        result.append([CID, descriptor[idx], Y_test,std])

result = pd.DataFrame(result)
result.columns = ['#oID', 'descriptor', 'value', 'sigma']


INTENSITY/STRENGTH
VALENCE/PLEASANTNESS
BAKERY
SWEET
FRUIT
FISH
GARLIC
SPICES
COLD
SOUR
BURNT
ACID
WARM
MUSKY
SWEATY
AMMONIA/URINOUS
DECAYED
WOOD
GRASS
FLOWER
CHEMICAL

In [10]:
result.value = result.value.astype(float)
result.sigma = result.sigma.astype(float)

# remove negative data and data above 100

result.value[result.value < 0] = 0 
result.value[result.value > 100] = 100

result.sigma[result.sigma < 0] = 0


#result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
result.to_csv('subchallenge2.txt',sep='\t',index =0)

In [ ]: