In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os

In [3]:
# Load the leaderboard (LB) and test-set compound IDs (CIDs).

def _load_cids(relative_path):
    """Read one integer CID per line from a text file.

    `relative_path` is appended to the literal string '__file__' (a string,
    not the module attribute) and passed through os.path.abspath, which
    resolves the result against the current working directory and collapses
    the '..' segments.  The first '..' cancels the dummy '__file__' segment,
    so "/../../../data/x.txt" ends up two directories above the working
    directory, in `data/`.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising ValueError from int().
    """
    path = os.path.abspath('__file__' + relative_path)
    with open(path) as f:
        return [int(line) for line in f if line.strip()]

# load LB CIDs
lb_CIDs = _load_cids("/../../../data/CID_leaderboard.txt")

# load test CIDs
test_CIDs = _load_cids("/../../../data/CID_testset.txt")

In [4]:
# Read the feature table from disk; the first column of the CSV is used as
# the row index rather than as a data column.
features = pd.read_csv(filepath_or_buffer='features.csv', index_col=0)

# Preview the first five rows of the table.
features.head(n=5)


Out[4]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 neglog10d Intensity
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1 1
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 3 0
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 5 1
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 7 0
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 3 1

5 rows × 14615 columns


In [5]:
# Dimensions of the feature table as a (rows, columns) tuple.
features.shape


Out[5]:
(952, 14615)

In [6]:
# Map each integer position (0-20) to the name of the descriptor column it
# stands for; the order of the list fixes the numbering.
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH',
    u'VALENCE/PLEASANTNESS',
    u'BAKERY',
    u'SWEET',
    u'FRUIT',
    u'FISH',
    u'GARLIC',
    u'SPICES',
    u'COLD',
    u'SOUR',
    u'BURNT',
    u'ACID',
    u'WARM',
    u'MUSKY',
    u'SWEATY',
    u'AMMONIA/URINOUS',
    u'DECAYED',
    u'WOOD',
    u'GRASS',
    u'FLOWER',
    u'CHEMICAL',
]))

In [7]:
# Read the target table from disk; the first column of the CSV is used as
# the row index rather than as a data column.
all_targets = pd.read_csv(filepath_or_buffer='target.csv', index_col=0)

# Preview the first five rows of the table.
all_targets.head(n=5)


Out[7]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
2 176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
3 176 25 4.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
4 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118

5 rows × 23 columns


In [8]:
# Dimensions of the target table as a (rows, columns) tuple.
all_targets.shape


Out[8]:
(814, 23)

In [ ]:
# Drop every row that belongs to a test-set compound from both tables.
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]
features = features[~features.CID.isin(test_CIDs)]

print(targets.shape,features.shape)

# Additionally drop the leaderboard compounds to obtain the training split.
train_targets = targets[~targets['#oID'].isin(lb_CIDs)]
train_features = features[~features.CID.isin(lb_CIDs)]

print(train_targets.shape,train_features.shape)

# feature selection

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before starting the expensive fits.
if not os.path.isdir('LB_scores'):
    os.makedirs('LB_scores')

# The design matrix is the same for every descriptor: every column except the
# first one (the CID identifier).  `.iloc` replaces the removed `.ix` indexer;
# the slice is purely positional, so the selected columns are unchanged.
# NOTE(review): X and Y are paired by row position - this assumes features.csv
# and target.csv list their rows in the same order; confirm.
X = train_features.iloc[:, 1:]

# NOTE(review): RandomizedLasso was deprecated in scikit-learn 0.19 and removed
# in 0.21, so this cell needs an older scikit-learn release to run.
for idx in range(len(descriptor)):
    print('selection for descriptor: ' + descriptor[idx])
    sys.stdout.flush()  # make the progress line visible before the long fit
    Y = train_targets[descriptor[idx]]
    selector = RandomizedLasso(alpha=0.025,selection_threshold=0.001,verbose=1,n_jobs=2,n_resampling=200,
                               random_state=12).fit(X,Y)

    # One stability score per feature column, written to one file per descriptor.
    scores = pd.DataFrame(selector.scores_,index=X.columns)
    scores.to_csv('LB_scores/scores_'+str(idx)+'.csv')

In [ ]: