In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
In [3]:
# Load the leaderboard (LB) and test-set CIDs.
def _read_cids(path):
    """Return the integer IDs stored one per line in the text file at `path`.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of crashing `int()`.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a non-blank line is not a valid integer.
    """
    with open(path) as f:
        # int() tolerates surrounding whitespace/newlines, so only fully
        # blank lines need to be filtered out.
        return [int(line) for line in f if line.strip()]


# NOTE: '__file__' is deliberately a *string literal* here, not the module
# attribute. os.path.abspath() collapses '__file__/..', so each path resolves
# to ../../data/<name> relative to the current working directory.
lb_CIDs = _read_cids(os.path.abspath('__file__' + "/../../../data/CID_leaderboard.txt"))
test_CIDs = _read_cids(os.path.abspath('__file__' + "/../../../data/CID_testset.txt"))
In [4]:
# Read the feature table; the first CSV column becomes the row index.
features = pd.read_csv(filepath_or_buffer='features.csv', index_col=0)
# Preview the first five rows.
features.head(n=5)
Out[4]:
In [5]:
# Display the (rows, columns) dimensions of the feature table as a sanity check.
features.shape
Out[5]:
In [6]:
# Map a descriptor index (0..20) to its name.
# The names are used elsewhere in this notebook as column labels of the
# target table, so they must be kept exactly as written.
descriptor = dict(enumerate([
    u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
    u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
    u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
    u'GRASS', u'FLOWER', u'CHEMICAL',
]))
In [7]:
# Read the target table; the first CSV column becomes the row index.
all_targets = pd.read_csv(filepath_or_buffer='target.csv', index_col=0)
# Preview the first five rows.
all_targets.head(n=5)
Out[7]:
In [8]:
# Display the (rows, columns) dimensions of the target table as a sanity check.
all_targets.shape
Out[8]:
In [ ]:
# Drop every row belonging to a test-set CID from both tables.
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]
features = features[~features.CID.isin(test_CIDs)]
print(targets.shape, features.shape)

# Additionally drop the leaderboard CIDs to obtain the training subset.
train_targets = targets[~targets['#oID'].isin(lb_CIDs)]
train_features = features[~features.CID.isin(lb_CIDs)]
print(train_targets.shape, train_features.shape)

# Feature selection: one RandomizedLasso fit per descriptor, with the
# per-feature scores written to LB_scores/scores_<idx>.csv.

# Create the output directory up front so a missing folder cannot make
# to_csv() fail after a long fit. (isdir + makedirs keeps Python 2 support.)
scores_dir = 'LB_scores'
if not os.path.isdir(scores_dir):
    os.makedirs(scores_dir)

# The design matrix is the same for every descriptor, so build it once.
# .iloc replaces the deprecated/removed .ix; the slice drops the first column
# (presumably the CID identifier - confirm against features.csv).
X = train_features.iloc[:, 1:]

# NOTE(review): X and Y are filtered independently and passed to fit() as-is.
# This presumes features.csv and target.csv hold the same rows in the same
# order - confirm.
for idx in range(len(descriptor)):
    print('selection for descriptor: ' + descriptor[idx])
    sys.stdout.flush()  # make the progress line visible before the long fit
    Y = train_targets[descriptor[idx]]
    selector = RandomizedLasso(
        alpha=0.025,
        selection_threshold=0.001,
        verbose=1,
        n_jobs=2,
        n_resampling=200,
        random_state=12,
    ).fit(X, Y)
    # One score per feature column, labelled with the feature name.
    scores = pd.DataFrame(selector.scores_, index=X.columns)
    scores.to_csv(os.path.join(scores_dir, 'scores_' + str(idx) + '.csv'))
In [ ]: