Calculates the feature-selection scores for each cross-validation split using RandomizedLasso


In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
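
RandomizedLasso implements stability selection: it repeatedly fits an L1-penalized model on randomly perturbed subsamples of the data and records, for every feature, the fraction of fits in which that feature received a non-zero coefficient. Since RandomizedLasso has been removed from recent scikit-learn releases, here is a minimal sketch of the subsampling part of that idea built on the plain Lasso; the function name and defaults are ours, and it leaves out the random penalty rescaling that RandomizedLasso also applies.

import numpy as np
from sklearn.linear_model import Lasso

def stability_scores(X, y, alpha=0.025, n_resampling=200,
                     sample_fraction=0.75, random_state=12):
    # X and y are NumPy arrays; returns, for each column of X, the fraction
    # of resamples in which a Lasso at this alpha kept a non-zero coefficient
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        idx = rng.choice(n_samples, int(sample_fraction * n_samples), replace=False)
        coef = Lasso(alpha=alpha, max_iter=5000).fit(X[idx], y[idx]).coef_
        counts += (coef != 0)
    return counts / n_resampling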

In [4]:
# load leaderboard (LB) and test-set CIDs

# load LB CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt")) as f: 
    content = f.readlines()
lb_CIDs = list(content)  
lb_CIDs = [int(x) for x in lb_CIDs]

# load test CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt")) as f: 
    content = f.readlines()
test_CIDs = list(content)  
test_CIDs = [int(x) for x in test_CIDs]
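
Both CID files are plain text with one integer CID per line, so the two blocks above repeat the same pattern; a small helper (hypothetical, not part of the original notebook) would capture it:

def load_cids(path):
    # read a text file with one CID per line and return a list of ints
    with open(path) as f:
        return [int(line) for line in f]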

In [5]:
features = pd.read_csv('features.csv')
features.head()


Out[5]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
1 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
2 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932
3 180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 ... 0.000124 0.000205 0.003729 0.000930 0.000641 0.000094 0.000607 0.001961 0.000229 0.001850
4 196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 ... 0.001029 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702

5 rows × 14613 columns


In [9]:
trainsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv"),header=None)
testsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv"),header=None)
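
The selection loop below indexes these frames by row, so the assumed layout is one row per CV split, with the CIDs of that split spread across the columns. A quick check of that assumption (hypothetical, not part of the original run):

print(trainsplits.shape, testsplits.shape)   # expected: 10 rows each, one per split
print(testsplits.iloc[0, :].values[:5])      # a few CIDs held out in split 0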

In [10]:
features.shape


Out[10]:
(476, 14613)

In [11]:
descriptor = {}
for idx, desc in enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', 
                       u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    descriptor[idx] = desc

In [13]:
targets = pd.read_csv('targets_for_feature_selection.csv')
targets.columns = ['CID'] + list(targets.columns.values[1:])
targets.head()


Out[13]:
CID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 37.102041 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
1 176 8.051020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
2 177 22.387755 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000
3 196 14.530612 44.304348 1.304348 9.804348 0.913043 0.500000 3.239130 7.108696 2.152174 ... 5.543478 6.695652 9.043478 7.304348 2.152174 4.217391 1.195652 1.543478 6.695652 7.847826
4 239 24.683673 51.724138 1.362069 13.500000 4.293103 1.482759 4.534483 6.189655 4.965517 ... 3.241379 5.068966 6.534483 0.793103 0.931034 5.413793 3.120690 5.775862 9.396552 10.862069

5 rows × 22 columns


In [ ]:
for k in range(10):
    # use CV split k as the holdout set: row k of testsplits holds the CIDs
    # that play the role of the leaderboard set for this split
    lb_CIDs = testsplits.iloc[k, :].values

    features = features[~features.CID.isin(test_CIDs)]  # remove test-set molecules

    # print(targets.shape, features.shape)

    train_targets = targets[~targets['CID'].isin(lb_CIDs)]    # remove holdout data
    train_features = features[~features.CID.isin(lb_CIDs)]    # remove holdout data

    # print(train_targets.shape, train_features.shape)

    # feature selection
    for idx in range(21):
        if k < 0 and idx < 0:  # if a run was interrupted, raise these bounds to skip the splits/descriptors already done (the selection takes time)
            pass
        else:
            print('split ' + str(k))
            print('selection for descriptor: ' + descriptor[idx])
            sys.stdout.flush()
            Y = train_targets[descriptor[idx]]
            X = train_features.iloc[:, 1:]  # all feature columns, dropping CID

            selector = RandomizedLasso(alpha=0.025, selection_threshold=0.001, verbose=1,
                                       n_resampling=200, random_state=12).fit(X, Y)

            # write one score per feature; create the output directory for this split if needed
            outdir = 'scores/LB_scores_morgan' + str(k)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            scores = pd.DataFrame(selector.scores_, index=X.columns)
            scores.to_csv(outdir + '/scores_' + str(idx) + '.csv')

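The loop above writes one CSV of stability-selection scores per descriptor and per CV split. A natural next step, not part of this notebook, would be to average those scores over the 10 splits to get a single feature ranking per descriptor; a sketch, assuming the directory layout written above and a reasonably recent pandas:

import pandas as pd

def average_scores(descriptor_idx, n_splits=10):
    # average the per-split selection scores for one descriptor and rank features
    per_split = []
    for k in range(n_splits):
        df = pd.read_csv('scores/LB_scores_morgan%d/scores_%d.csv' % (k, descriptor_idx),
                         index_col=0)
        per_split.append(df.iloc[:, 0])
    return pd.concat(per_split, axis=1).mean(axis=1).sort_values(ascending=False)

Features at the top of such a ranking would be natural candidates to keep for later modeling.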