In [1]:
import numpy as np
import pandas as pd
import os,sys
from sklearn import linear_model
from scipy import stats as stats
In [2]:
# Load the leaderboard (LB) and final test-set CID lists.
# Each file holds one integer PubChem CID per line.

def load_cid_list(path):
    """Read a text file with one CID per line and return them as a list of ints."""
    with open(path) as f:
        # int() tolerates the trailing newline on each line
        return [int(line) for line in f]

lb_CIDs = load_cid_list('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/CID_leaderboard.txt')
test_CIDs = load_cid_list('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/CID_testset.txt')
In [4]:
# Load the Morgan-fingerprint similarity matrix; similarities to each
# leaderboard molecule are later used as per-sample training weights.
morgan = pd.read_csv('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/morgan_sim.csv', index_col=0)
# reorder the columns (CID strings) to match the row order of the index
weights = morgan[morgan.index.astype(str)]
# duplicate every row (presumably one copy per dilution — TODO confirm against
# the target matrix) and restore sorted CID order
weights = pd.concat((weights, weights)).sort_index()
print(weights.shape)
weights.head()
Out[4]:
In [3]:
# Load the molecular feature matrix (Dragon descriptors + Morgan fingerprints),
# one row per CID/dilution, indexed by the first CSV column.
features_path = 'features_dragon_morgan.csv'
features = pd.read_csv(features_path, index_col=0)
features.head()
Out[3]:
In [46]:
]
In [7]:
# Map descriptor index -> perceptual descriptor (target column) name for the
# 21 DREAM olfaction descriptors; index 0 is intensity, which is handled
# specially (different dilution) in the prediction loop below.
descriptor = dict(enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
                             u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                             u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                             u'GRASS', u'FLOWER', u'CHEMICAL']))
In [8]:
# Load the perceptual target matrix; the '#oID' column holds the molecule CID,
# the remaining columns are the 21 descriptor ratings.
targets_path = 'target.csv'
all_targets = pd.read_csv(targets_path, index_col=0)
all_targets.head()
Out[8]:
In [9]:
scores = pd.read_csv('LB_scores/scores_' + str(0) + '.csv',index_col=0)
In [9]:
# Load the precomputed 10-fold CV split tables (no header row; one fold per row).
splits_dir = '/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/'
trainsplits = pd.read_csv(splits_dir + 'cv_splits_train_big.csv', header=None)
testsplits = pd.read_csv(splits_dir + 'cv_splits_test_big.csv', header=None)
In [20]:
# Predict the leaderboard set with different numbers of top-ranked features.
# For each CV fold k: hold out split k as pseudo-leaderboard data, then for
# each feature count train one Ridge model per (descriptor, CID) pair, using
# Morgan similarity to the target CID as sample weights, and write the
# per-fold predictions to disk.
for k in range(10):
    print(k)
    # Set CV split k as the holdout data.
    # NOTE(review): this clobbers the lb_CIDs list loaded from
    # CID_leaderboard.txt in the cell above — intentional here, but the
    # original leaderboard CIDs are lost for later cells.
    lb_CIDs = testsplits.iloc[k, :].values
    #for feature_number in [5,10,20,50,100,200,300,500,1000,1500,2000,3000,4000]:
    #for feature_number in [1,2,3,5,10,33,100,333,1000,3333,10000]:
    for feature_number in [1,2,3,4,5,10,33,100,333,1000,3333,10000]: #new run, with 1,2,3 features
        print(feature_number)
        sys.stdout.flush()
        targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]  # remove test data
        # NOTE(review): this repeatedly overwrites the module-level `features`
        # frame; the filter is idempotent so re-running is safe, but the raw
        # frame is unrecoverable afterwards.
        features = features[~features.CID.isin(test_CIDs)]  # remove test data
        train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
        train_features = features[~features.CID.isin(lb_CIDs)]  # exclude lb features from training
        test_features = features[features.CID.isin(lb_CIDs)]
        # Ridge regressor reused for every descriptor/CID fit below.
        # NOTE: `normalize=` was removed in scikit-learn 1.2; drop it on a
        # modern install (normalize=False was the default anyway).
        regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True,
                                  max_iter=None, tol=0.001, solver='auto')
        result = []
        for idx in range(21):
            # Load the per-descriptor feature ranking computed for this fold.
            scores = pd.read_csv('LB_scores_morgan' + str(k) + '/scores_' + str(idx) + '.csv', index_col=0)
            # Exclude the data-leak rows, then rank features by score (descending).
            scores = scores.loc[[x for x in scores.index if x not in ['Intensity', 'neglog10d']]].sort_values(by='0', ascending=False)
            # Top feature_number feature names; computed once instead of
            # re-sorting for every use as the original did.
            top_features = scores[:feature_number].index
            X_all = train_features[top_features].copy()  # .copy() avoids SettingWithCopyWarning
            X_all['CID'] = train_features.CID  # add the CIDs as a column
            # The targets and the training design matrix depend only on idx,
            # so build them once per descriptor rather than once per CID.
            Y_train = train_targets[['#oID', descriptor[idx]]]
            Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]  # drop molecules without a rating
            X = X_all[X_all.CID.isin(Y_train['#oID'])]
            for CID in lb_CIDs:
                # Morgan similarity of each training molecule to the target CID.
                weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)]
                if idx == 0:  # predicting intensity: use 1/1000 dilutions (neglog10 of 1/1000 is 3)
                    test_data = test_features[test_features.neglog10d == 3]
                else:  # otherwise use high-dilution data (note: they do not differ in this target matrix from the low ones)
                    test_data = test_features[test_features.Intensity == 1]
                test_data = test_data[test_data.CID == CID]
                test_data = test_data[top_features]
                # In case the data frame length is zero, don't try to predict.
                if len(test_data) == 0:
                    print('zero data', CID)
                else:
                    regr.fit(X.drop('CID', axis=1), Y_train[descriptor[idx]], sample_weight=weight.values)
                    Y_test = regr.predict(test_data)
                    # Empirical sigma model: an inverted parabola in the
                    # predicted value, zero at 0 and 100 and peaking at 50.
                    std = -(Y_test**2)/2500.0 + Y_test/25.0
                    result.append([CID, descriptor[idx], Y_test, std])
        result = pd.DataFrame(result)
        result.columns = ['#oID', 'descriptor', 'value', 'sigma']
        result.value = result.value.astype(float)
        result.sigma = result.sigma.astype(float)
        # Clip predictions into [0, 100] and sigmas to be non-negative
        # (.loc avoids the chained-indexing SettingWithCopyWarning).
        result.loc[result.value < 0, 'value'] = 0
        result.loc[result.value > 100, 'value'] = 100
        result.loc[result.sigma < 0, 'sigma'] = 0
        #result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
        result.to_csv('results_morgan_noleak/' + str(k) + '/subchallenge2_' + str(feature_number) + '.txt', sep='\t', index=0)
In [29]:
# Scratch/debug cell: relies on `test_features` and the leftover loop variable
# `CID` from the big loop above — breaks on Restart-and-Run-All.
test_features[test_features.CID == CID]
Out[29]:
In [22]:
# Scratch/debug cell: inspect the 1/1000-dilution (neglog10d == 3) slice of
# test_features; depends on state left over from the prediction loop above.
test_data = test_features[test_features.neglog10d == 3]
#test_data = test_data[test_data.CID == CID]
#test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]
test_data
Out[22]:
In [16]:
# Scratch/debug cell: displays the leftover loop variable `CID` from the run above.
CID
Out[16]:
In [ ]: