Calculates the feature-selection scores for each cross-validation split using RandomizedLasso


In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
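
RandomizedLasso implements stability selection: it repeatedly fits an L1-penalized model on randomly perturbed subsamples of the data and records, for every feature, the fraction of fits in which that feature received a non-zero coefficient. Since RandomizedLasso has been removed from recent scikit-learn releases, here is a minimal sketch of the subsampling part of that idea built on the plain Lasso; the function name and defaults are ours, and it leaves out the random penalty rescaling that RandomizedLasso also applies.

import numpy as np
from sklearn.linear_model import Lasso

def stability_scores(X, y, alpha=0.025, n_resampling=200,
                     sample_fraction=0.75, random_state=12):
    # X and y are NumPy arrays; returns, for each column of X, the fraction
    # of resamples in which a Lasso at this alpha kept a non-zero coefficient
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        idx = rng.choice(n_samples, int(sample_fraction * n_samples), replace=False)
        coef = Lasso(alpha=alpha, max_iter=5000).fit(X[idx], y[idx]).coef_
        counts += (coef != 0)
    return counts / n_resampling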

In [4]:
# load leaderboard (LB) and test-set CIDs

# load LB CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt")) as f: 
    content = f.readlines()
lb_CIDs = list(content)  
lb_CIDs = [int(x) for x in lb_CIDs]

# load test CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt")) as f: 
    content = f.readlines()
test_CIDs = list(content)  
test_CIDs = [int(x) for x in test_CIDs]
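
Both CID files are plain text with one integer CID per line, so the two blocks above repeat the same pattern; a small helper (hypothetical, not part of the original notebook) would capture it:

def load_cids(path):
    # read a text file with one CID per line and return a list of ints
    with open(path) as f:
        return [int(line) for line in f]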

In [5]:
features = pd.read_csv('features.csv')
features.head()


Out[5]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
1 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
2 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932
3 180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 ... 0.000124 0.000205 0.003729 0.000930 0.000641 0.000094 0.000607 0.001961 0.000229 0.001850
4 196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 ... 0.001029 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702

5 rows × 14613 columns


In [9]:
trainsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv"),header=None)
testsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv"),header=None)
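
The selection loop below indexes these frames by row, so the assumed layout is one row per CV split, with the CIDs of that split spread across the columns. A quick check of that assumption (hypothetical, not part of the original run):

print(trainsplits.shape, testsplits.shape)   # expected: 10 rows each, one per split
print(testsplits.iloc[0, :].values[:5])      # a few CIDs held out in split 0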

In [10]:
features.shape


Out[10]:
(476, 14613)

In [11]:
descriptor = {}
for idx, desc in enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', 
                       u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    descriptor[idx] = desc

In [13]:
targets = pd.read_csv('targets_for_feature_selection.csv')
targets.columns = ['CID'] + list(targets.columns.values[1:])
targets.head()


Out[13]:
CID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 37.102041 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
1 176 8.051020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
2 177 22.387755 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000
3 196 14.530612 44.304348 1.304348 9.804348 0.913043 0.500000 3.239130 7.108696 2.152174 ... 5.543478 6.695652 9.043478 7.304348 2.152174 4.217391 1.195652 1.543478 6.695652 7.847826
4 239 24.683673 51.724138 1.362069 13.500000 4.293103 1.482759 4.534483 6.189655 4.965517 ... 3.241379 5.068966 6.534483 0.793103 0.931034 5.413793 3.120690 5.775862 9.396552 10.862069

5 rows × 22 columns


In [ ]:
for k in range(10):
    # use CV split k as the holdout set: row k of testsplits holds the CIDs
    # that play the role of the leaderboard set for this split
    lb_CIDs = testsplits.iloc[k, :].values

    features = features[~features.CID.isin(test_CIDs)]  # remove test-set molecules

    # print(targets.shape, features.shape)

    train_targets = targets[~targets['CID'].isin(lb_CIDs)]    # remove holdout data
    train_features = features[~features.CID.isin(lb_CIDs)]    # remove holdout data

    # print(train_targets.shape, train_features.shape)

    # feature selection
    for idx in range(21):
        if k < 0 and idx < 0:  # if a run was interrupted, raise these bounds to skip the splits/descriptors already done (the selection takes time)
            pass
        else:
            print('split ' + str(k))
            print('selection for descriptor: ' + descriptor[idx])
            sys.stdout.flush()
            Y = train_targets[descriptor[idx]]
            X = train_features.iloc[:, 1:]  # all feature columns, dropping CID

            selector = RandomizedLasso(alpha=0.025, selection_threshold=0.001, verbose=1,
                                       n_resampling=200, random_state=12).fit(X, Y)

            # write one score per feature; create the output directory for this split if needed
            outdir = 'scores/LB_scores_morgan' + str(k)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            scores = pd.DataFrame(selector.scores_, index=X.columns)
            scores.to_csv(outdir + '/scores_' + str(idx) + '.csv')

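The loop above writes one CSV of stability-selection scores per descriptor and per CV split. A natural next step, not part of this notebook, would be to average those scores over the 10 splits to get a single feature ranking per descriptor; a sketch, assuming the directory layout written above and a reasonably recent pandas:

import pandas as pd

def average_scores(descriptor_idx, n_splits=10):
    # average the per-split selection scores for one descriptor and rank features
    per_split = []
    for k in range(n_splits):
        df = pd.read_csv('scores/LB_scores_morgan%d/scores_%d.csv' % (k, descriptor_idx),
                         index_col=0)
        per_split.append(df.iloc[:, 0])
    return pd.concat(per_split, axis=1).mean(axis=1).sort_values(ascending=False)

Features at the top of such a ranking would be natural candidates to keep for later modeling.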