notebook.community

Edit and run



In [2]:

    
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os



In [3]:

    
# Load the leaderboard (LB) and test-set compound IDs (CIDs).

# The original code built its paths as '__file__' + "/../../../data/<name>".
# '__file__' there is a plain string literal, not the module attribute, so
# after normalisation the path pointed at the `data` directory two levels
# above the current working directory. The same location is spelled out
# explicitly here.
DATA_DIR = os.path.abspath(os.path.join(os.pardir, os.pardir, 'data'))


def _load_cids(filename):
    """Read one integer CID per line from ``filename`` inside ``DATA_DIR``.

    Blank or whitespace-only lines are skipped so that a stray empty line
    in the file does not make ``int()`` fail.

    Parameters
    ----------
    filename : str
        Name of the text file, relative to ``DATA_DIR``.

    Returns
    -------
    list of int
        The CIDs in file order.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line is not a valid integer.
    """
    with open(os.path.join(DATA_DIR, filename)) as f:
        return [int(line) for line in f if line.strip()]


lb_CIDs = _load_cids('CID_leaderboard.txt')
test_CIDs = _load_cids('CID_testset.txt')



In [4]:

    
# Load the feature table from the working directory; index_col=0 makes the
# first CSV column the row index, so 'CID' stays a regular column.
features = pd.read_csv('features.csv', index_col=0)
# Preview the first rows to check that the file was parsed as expected.
features.head()









    Out[4]:






  
    
      
      CID
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      ...
      91541756_2
      91552833_2
      91563027_2
      91595028_2
      91614181_2
      91617014_2
      91617930_2
      91618238_2
      neglog10d
      Intensity
    
  
  
    
      0
       126
       0.181128
       0.270753
       0.030587
       0.262264
       0.219126
       0.253846
       0.214989
       0.216981
       0.425532
      ...
       0.014024
       0.000296
       0.021098
       0.000186
       0.003159
       0.002299
       0.000138
       0.011080
       1
       1
    
    
      1
       126
       0.181128
       0.270753
       0.030587
       0.262264
       0.219126
       0.253846
       0.214989
       0.216981
       0.425532
      ...
       0.014024
       0.000296
       0.021098
       0.000186
       0.003159
       0.002299
       0.000138
       0.011080
       3
       0
    
    
      2
       176
       0.060311
       0.109331
       0.025411
       0.096943
       0.105579
       0.090940
       0.107335
       0.125214
       0.659574
      ...
       0.008391
       0.000930
       0.001442
       0.000094
       0.000607
       0.001362
       0.000229
       0.004162
       5
       1
    
    
      3
       176
       0.060311
       0.109331
       0.025411
       0.096943
       0.105579
       0.090940
       0.107335
       0.125214
       0.659574
      ...
       0.008391
       0.000930
       0.001442
       0.000094
       0.000607
       0.001362
       0.000229
       0.004162
       7
       0
    
    
      4
       177
       0.020039
       0.067721
       0.015501
       0.075556
       0.083688
       0.078074
       0.089782
       0.106346
       0.382979
      ...
       0.000961
       0.000339
       0.000657
       0.000008
       0.000098
       0.000221
       0.000037
       0.001932
       3
       1
    
  

5 rows × 14615 columns



In [5]:

    
# Sanity check: (n_rows, n_columns) of the feature table as loaded.
features.shape









    Out[5]:





(952, 14615)



In [6]:

    
# Map each descriptor's position (0..20) to its column name in the target
# table, so the descriptors can be iterated over by integer index.
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH',
    u'VALENCE/PLEASANTNESS',
    u'BAKERY',
    u'SWEET',
    u'FRUIT',
    u'FISH',
    u'GARLIC',
    u'SPICES',
    u'COLD',
    u'SOUR',
    u'BURNT',
    u'ACID',
    u'WARM',
    u'MUSKY',
    u'SWEATY',
    u'AMMONIA/URINOUS',
    u'DECAYED',
    u'WOOD',
    u'GRASS',
    u'FLOWER',
    u'CHEMICAL',
]))



In [7]:

    
# Load the target table from the working directory; index_col=0 makes the
# first CSV column the row index, so '#oID' stays a regular column.
all_targets = pd.read_csv('target.csv', index_col=0)
# Preview the first rows to check that the file was parsed as expected.
all_targets.head()









    Out[7]:






  
    
      
      #oID
      individual
      INTENSITY/STRENGTH
      VALENCE/PLEASANTNESS
      BAKERY
      SWEET
      FRUIT
      FISH
      GARLIC
      SPICES
      ...
      ACID
      WARM
      MUSKY
      SWEATY
      AMMONIA/URINOUS
      DECAYED
      WOOD
      GRASS
      FLOWER
      CHEMICAL
    
  
  
    
      0
       126
       25
       49.551020
       49.465116
       0.674419
       25.953488
       6.581395
       0.302326
        1.720930
       3.906977
      ...
       3.046512
       0.790698
       8.023256
       1.604651
       1.209302
       5.069767
       1.348837
       1.441860
       9.906977
       14.813953
    
    
      1
       126
       25
       24.653061
       49.465116
       0.674419
       25.953488
       6.581395
       0.302326
        1.720930
       3.906977
      ...
       3.046512
       0.790698
       8.023256
       1.604651
       1.209302
       5.069767
       1.348837
       1.441860
       9.906977
       14.813953
    
    
      2
       176
       25
       11.551020
       45.944444
       3.666667
        8.166667
       1.777778
       0.000000
       10.388889
       6.055556
      ...
       4.166667
       6.111111
       8.666667
       2.166667
       5.222222
       4.388889
       2.611111
       2.166667
       5.944444
        4.222222
    
    
      3
       176
       25
        4.551020
       45.944444
       3.666667
        8.166667
       1.777778
       0.000000
       10.388889
       6.055556
      ...
       4.166667
       6.111111
       8.666667
       2.166667
       5.222222
       4.388889
       2.611111
       2.166667
       5.944444
        4.222222
    
    
      4
       177
       25
       33.265306
       45.147059
       9.411765
       22.441176
       1.676471
       0.000000
        0.705882
       2.735294
      ...
       4.970588
       4.470588
       3.823529
       2.176471
       4.235294
       3.558824
       1.147059
       4.470588
       2.441176
       18.794118
    
  

5 rows × 23 columns



In [8]:

    
# Sanity check: (n_rows, n_columns) of the target table as loaded.
all_targets.shape









    Out[8]:





(814, 23)



In [ ]:

    
# Hold out the test-set compounds first, then the leaderboard (LB)
# compounds, so that feature selection only sees the training rows.
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]  # remove test data
features = features[~features.CID.isin(test_CIDs)]  # remove test data

print(targets.shape, features.shape)

train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # remove LB data
train_features = features[~features.CID.isin(lb_CIDs)]

print(train_targets.shape, train_features.shape)

# Feature selection: one RandomizedLasso run per descriptor.

# Every column except the first one (CID) is used as a predictor. `.ix` has
# been removed from pandas; `.iloc` is the positional equivalent. X does not
# depend on the descriptor, so it is built once instead of on every pass.
X = train_features.iloc[:, 1:]

# NOTE(review): X and Y are paired by row position, which assumes that
# features.csv and target.csv list their rows in the same order - confirm.

# Create the output directory up front so a long fit is not lost to a
# missing-directory error when the scores are written.
if not os.path.isdir('LB_scores'):
    os.makedirs('LB_scores')

for idx in range(len(descriptor)):
    print('selection for descriptor: ' + descriptor[idx])
    sys.stdout.flush()  # show progress immediately; each fit may take a while
    Y = train_targets[descriptor[idx]]

    # NOTE(review): RandomizedLasso was deprecated in scikit-learn 0.19 and
    # removed in 0.21, so this cell needs an older scikit-learn to run.
    # random_state is fixed so the resampling is repeatable.
    selector = RandomizedLasso(alpha=0.025,
                               selection_threshold=0.001,
                               verbose=1,
                               n_jobs=2,
                               n_resampling=200,
                               random_state=12).fit(X, Y)

    # One score per feature, indexed by feature name, saved per descriptor.
    scores = pd.DataFrame(selector.scores_, index=X.columns)
    scores.to_csv('LB_scores/scores_'+str(idx)+'.csv')



In [ ]:

	CID	complexity from pubmed	MW	AMW	Sv	Se	Sp	Si	Mv	Me	...	91541756_2	91552833_2	91563027_2	91595028_2	91614181_2	91617014_2	91617930_2	91618238_2	neglog10d	Intensity
0	126	0.181128	0.270753	0.030587	0.262264	0.219126	0.253846	0.214989	0.216981	0.425532	...	0.014024	0.000296	0.021098	0.000186	0.003159	0.002299	0.000138	0.011080	1	1
1	126	0.181128	0.270753	0.030587	0.262264	0.219126	0.253846	0.214989	0.216981	0.425532	...	0.014024	0.000296	0.021098	0.000186	0.003159	0.002299	0.000138	0.011080	3	0
2	176	0.060311	0.109331	0.025411	0.096943	0.105579	0.090940	0.107335	0.125214	0.659574	...	0.008391	0.000930	0.001442	0.000094	0.000607	0.001362	0.000229	0.004162	5	1
3	176	0.060311	0.109331	0.025411	0.096943	0.105579	0.090940	0.107335	0.125214	0.659574	...	0.008391	0.000930	0.001442	0.000094	0.000607	0.001362	0.000229	0.004162	7	0
4	177	0.020039	0.067721	0.015501	0.075556	0.083688	0.078074	0.089782	0.106346	0.382979	...	0.000961	0.000339	0.000657	0.000008	0.000098	0.000221	0.000037	0.001932	3	1

	#oID	individual	INTENSITY/STRENGTH	VALENCE/PLEASANTNESS	BAKERY	SWEET	FRUIT	FISH	GARLIC	SPICES	...	ACID	WARM	MUSKY	SWEATY	AMMONIA/URINOUS	DECAYED	WOOD	GRASS	FLOWER	CHEMICAL
0	126	25	49.551020	49.465116	0.674419	25.953488	6.581395	0.302326	1.720930	3.906977	...	3.046512	0.790698	8.023256	1.604651	1.209302	5.069767	1.348837	1.441860	9.906977	14.813953
1	126	25	24.653061	49.465116	0.674419	25.953488	6.581395	0.302326	1.720930	3.906977	...	3.046512	0.790698	8.023256	1.604651	1.209302	5.069767	1.348837	1.441860	9.906977	14.813953
2	176	25	11.551020	45.944444	3.666667	8.166667	1.777778	0.000000	10.388889	6.055556	...	4.166667	6.111111	8.666667	2.166667	5.222222	4.388889	2.611111	2.166667	5.944444	4.222222
3	176	25	4.551020	45.944444	3.666667	8.166667	1.777778	0.000000	10.388889	6.055556	...	4.166667	6.111111	8.666667	2.166667	5.222222	4.388889	2.611111	2.166667	5.944444	4.222222
4	177	25	33.265306	45.147059	9.411765	22.441176	1.676471	0.000000	0.705882	2.735294	...	4.970588	4.470588	3.823529	2.176471	4.235294	3.558824	1.147059	4.470588	2.441176	18.794118