Calculates the feature scores for each cross-validation split.
In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
In [4]:
# Load the leaderboard (LB) and test-set CIDs.
#
# The paths start with the *string* '__file__' (not the module attribute), so
# os.path.abspath() resolves them against the current working directory: the
# placeholder component is cancelled by the first '..', which leaves the data
# directory three levels above the working directory.
def _read_cids(path):
    """Return the integer CIDs listed one per line in the text file at `path`.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of making int() raise ValueError.
    """
    with open(path) as f:
        return [int(line) for line in f if line.strip()]


# load LB CIDs
lb_CIDs = _read_cids(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt"))
# load test CIDs
test_CIDs = _read_cids(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt"))
In [5]:
# Read the feature table from the working directory and preview its first rows.
features_csv = 'features.csv'
features = pd.read_csv(features_csv)
features.head()
Out[5]:
In [9]:
# Read the cross-validation split tables. Both files have no header row, so
# pandas assigns integer column labels.
train_splits_path = os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv")
test_splits_path = os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv")
trainsplits = pd.read_csv(train_splits_path, header=None)
testsplits = pd.read_csv(test_splits_path, header=None)
In [10]:
# Display the (rows, columns) dimensions of the feature table.
features.shape
Out[10]:
In [11]:
# Map each descriptor index (0-20) to the name of its column in the targets
# table; the selection loop looks descriptors up by index.
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
    u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
    u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
    u'GRASS', u'FLOWER', u'CHEMICAL',
]))
In [13]:
# Read the target table, force the name of its first column to 'CID' (all
# other column names are kept as read), then preview the first rows.
targets = pd.read_csv('targets_for_feature_selection.csv')
target_columns = list(targets.columns.values)
target_columns[0] = 'CID'
targets.columns = target_columns
targets.head()
Out[13]:
In [14]:
# Preview the first rows of the targets table (same preview as the previous cell).
targets.head()
Out[14]:
In [ ]:
# Run RandomizedLasso feature selection for every CV split (10) and every
# descriptor (21), writing one CSV of per-feature scores for each pair to
# scores/LB_scores_morgan<k>/scores_<idx>.csv.

# Resume point: if a previous run stopped part-way (the selection takes time),
# set these to the first (split, descriptor) pair that still has to be
# computed. Every pair that sorts before it is skipped; (0, 0) skips nothing.
RESUME_SPLIT = 0
RESUME_DESCRIPTOR = 0

# Remove test data features. This does not depend on the split, so it is done
# once up front rather than on every iteration.
features = features[~features.CID.isin(test_CIDs)]

for k in range(10):
    # Set CV split k as holdout data. NOTE: this rebinds the module-level
    # lb_CIDs, replacing the leaderboard CIDs loaded from file earlier.
    lb_CIDs = testsplits.iloc[k, :].values
    train_targets = targets[~targets['CID'].isin(lb_CIDs)]  # remove holdout rows
    train_features = features[~features.CID.isin(lb_CIDs)]  # remove holdout rows

    # All columns except the first are used as predictors (the first column is
    # presumably the CID - confirm against features.csv). Identical for every
    # descriptor, so it is built once per split.
    X = train_features.iloc[:, 1:]

    out_dir = 'scores/LB_scores_morgan' + str(k)

    # feature selection
    for idx in range(21):
        if (k, idx) < (RESUME_SPLIT, RESUME_DESCRIPTOR):
            continue

        # Make sure the output directory exists *before* the slow fit, so a
        # missing directory cannot discard a finished computation.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        print('split ' + str(k))
        print('selection for descriptor: ' + descriptor[idx])
        sys.stdout.flush()

        # NOTE(review): X and Y are paired by row position, which assumes the
        # targets and features tables list the training CIDs in the same
        # order - confirm.
        Y = train_targets[descriptor[idx]]

        # NOTE(review): RandomizedLasso is not available in recent
        # scikit-learn releases; this cell needs a version that still ships it.
        selector = RandomizedLasso(alpha=0.025, selection_threshold=0.001, verbose=1,
                                   n_resampling=200, random_state=12).fit(X, Y)
        scores = pd.DataFrame(selector.scores_, index=X.columns)
        scores.to_csv(out_dir + '/scores_' + str(idx) + '.csv')
In [ ]: