In [1]:
import numpy as np
import pandas as pd
import os, sys
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
In [2]:
# load leaderboard and test-set CIDs
with open(os.path.abspath('__file__' + "/../../../../data/CID_leaderboard.txt")) as f:
    lb_CIDs = [int(line) for line in f]
with open(os.path.abspath('__file__' + "/../../../../data/CID_testset.txt")) as f:
    test_CIDs = [int(line) for line in f]
In [3]:
# load the Morgan similarity matrix; its entries are used as sample weights in training
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/morgan_sim.csv"), index_col=0)
weights = morgan[morgan.index.astype(str)]  # keep the columns in the same CID order as the index
print(weights.shape)
weights.head()
Out[3]:
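The morgan_sim.csv matrix above is precomputed and shipped with the challenge data. As a hedged sketch of how such a CID-by-CID similarity matrix could be built, assuming RDKit and a hypothetical smiles_by_cid dict mapping CIDs to SMILES strings (neither is part of this notebook):

# Sketch only: build a CID x CID Tanimoto similarity matrix from Morgan fingerprints.
# smiles_by_cid ({CID: SMILES}) is a hypothetical input, not part of this notebook.
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def morgan_similarity_matrix(smiles_by_cid, radius=2, n_bits=2048):
    cids = sorted(smiles_by_cid)
    fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles_by_cid[c]),
                                                 radius, nBits=n_bits) for c in cids]
    sim = [[DataStructs.TanimotoSimilarity(a, b) for b in fps] for a in fps]
    return pd.DataFrame(sim, index=cids, columns=cids)

Tanimoto similarity of Morgan (circular) fingerprints is the usual choice; the radius and bit-vector size here are assumptions, not values taken from the challenge.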
In [4]:
#load the features
features = pd.read_csv('features.csv')
features.head()
Out[4]:
In [5]:
# map an index to each of the 21 perceptual descriptors
descriptor = dict(enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
    u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
    u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
    u'GRASS', u'FLOWER', u'CHEMICAL']))
In [6]:
# load the targets
all_targets = pd.read_csv('target.csv')
all_targets.head()
Out[6]:
In [7]:
targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]  # remove test data
features = features[~features.CID.isin(test_CIDs)]  # remove test data
In [8]:
# flag CIDs that have an intensity measurement (non-null INTENSITY/STRENGTH);
# matching on CID avoids relying on row alignment between the two frames
has_int_CIDs = all_targets.loc[all_targets['INTENSITY/STRENGTH'].notnull(), '#oID']
features['has_int'] = features.CID.isin(has_int_CIDs).astype(int)
features.head()
Out[8]:
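The prediction loop below reads precomputed per-descriptor feature scores from scores/LB_scores_morgan&lt;k&gt;/scores_&lt;idx&gt;.csv and keeps the top-ranked features. How those files were generated is not shown in this notebook; a minimal sketch of one plausible scoring, ranking features by absolute Pearson correlation with the descriptor over the training CIDs (score_features is a hypothetical helper):

# Sketch only: rank features by |Pearson r| with one descriptor.
# The actual scores_<idx>.csv files were produced elsewhere.
def score_features(train_features, train_targets, desc):
    merged = train_features.merge(train_targets[['#oID', desc]],
                                  left_on='CID', right_on='#oID').dropna(subset=[desc])
    feats = merged.drop(['CID', '#oID', desc], axis=1, errors='ignore')
    scores = feats.corrwith(merged[desc]).abs().sort_values(ascending=False)
    return scores.to_frame(name='0')  # same layout as the files read below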
In [9]:
# load the cross-validation splits
trainsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_train_big.csv"), header=None)
testsplits = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/cv_splits_test_big.csv"), header=None)
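These split files were provided with the challenge data. A minimal sketch, assuming scikit-learn's KFold, of how comparable 10-fold splits over the training CIDs could be generated (this does not reproduce the official splits):

# Sketch only: generate 10 CV splits over the training CIDs.
# The official cv_splits_*_big.csv files came with the challenge data.
from sklearn.model_selection import KFold
train_CIDs = np.array(sorted(set(targets['#oID'])))
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = [(train_CIDs[tr], train_CIDs[te]) for tr, te in kf.split(train_CIDs)]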
In [ ]:
rfc = RandomForestRegressor(n_estimators=10,
                            max_features=1.0,  # == the old 'auto': consider all features at each split
                            oob_score=False, n_jobs=1, random_state=0)
regr = linear_model.Ridge(alpha=1, fit_intercept=True, copy_X=True,
                          max_iter=None, tol=0.001, solver='auto')
# predict the holdout split with different numbers of features
for k in range(10):  # iterate over the CV splits
    print(k)
    # set one CV split aside as holdout data (overwrites the leaderboard CIDs loaded above)
    lb_CIDs = testsplits.iloc[k, :].values
    for feature_number in [1, 2, 3, 4, 5, 10, 33, 100, 333, 1000, 3333, 10000]:  # iterate over the number of features
        print('feature number', feature_number)
        sys.stdout.flush()
        train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude holdout targets from training
        train_features = features[~features.CID.isin(lb_CIDs)]  # exclude holdout features from training
        test_features = features[features.CID.isin(lb_CIDs)]
        result = []
        result_RF = []
        for idx in range(21):  # iterate over the descriptors
            # load the precomputed feature scores for this descriptor
            scores = pd.read_csv('scores/LB_scores_morgan' + str(k) + '/scores_' + str(idx) + '.csv', index_col=0)
            best_features = scores.sort_values(by='0', ascending=False)[:feature_number].index
            X_all = train_features[best_features].copy()  # keep only the best-scoring features
            X_all['CID'] = train_features.CID  # add the CIDs as a column
            for CID in lb_CIDs:  # predict each holdout CID one by one
                Y_train = train_targets[['#oID', descriptor[idx]]]
                Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
                X = X_all[X_all.CID.isin(Y_train['#oID'])]
                # sample weights: Morgan similarity of each training molecule to the current CID
                weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)]
                if idx == 0:  # intensity is predicted from the 1/1000 dilution data (has_int == 1)
                    test_data = test_features[test_features.has_int == 1]
                    test_data = test_data[test_data.CID == CID]
                    test_data = test_data[best_features]
                else:  # otherwise use the high-dilution data (already in the matrix)
                    test_data = test_features[test_features.CID == CID]
                    test_data = test_data[best_features]
                # if the frame is empty, skip prediction (molecules with no 1/1000 dilution)
                if len(test_data) == 0:
                    print('no target at CID', CID)
                else:
                    # linear (ridge) regression, weighted by similarity to the test molecule
                    regr.fit(X.drop('CID', axis=1), Y_train[descriptor[idx]], sample_weight=weight.values)
                    Y_test = regr.predict(test_data)
                    # heuristic sigma: y*(100-y)/2500, zero at 0 and 100, maximal at y = 50
                    std = -(Y_test**2)/2500.0 + Y_test/25.0
                    result.append([CID, descriptor[idx], Y_test, std])
                    # random forest regression
                    if (CID == 807) and (idx == 0):
                        # CID 807 (I2) has only one molecule with similarity above 0;
                        # such weights break the random forest fit, so reset them to uniform
                        print(k, idx, CID, 'is 807, weights set to uniform for random forest intensity prediction')
                        weight.values[:] = 1
                    rfc.fit(X.drop('CID', axis=1), Y_train[descriptor[idx]], sample_weight=weight.values)
                    Y_test = rfc.predict(test_data)
                    std = -(Y_test**2)/2500.0 + Y_test/25.0
                    result_RF.append([CID, descriptor[idx], Y_test, std])
        print('length', len(result))
        result = pd.DataFrame(result, columns=['#oID', 'descriptor', 'value', 'sigma'])
        result.value = result.value.astype(float)
        result.sigma = result.sigma.astype(float)
        result_RF = pd.DataFrame(result_RF, columns=['#oID', 'descriptor', 'value', 'sigma'])
        result_RF.value = result_RF.value.astype(float)
        result_RF.sigma = result_RF.sigma.astype(float)
        # clip predictions to the valid [0, 100] range and sigmas to be non-negative
        result.loc[result.value < 0, 'value'] = 0
        result.loc[result.value > 100, 'value'] = 100
        result.loc[result.sigma < 0, 'sigma'] = 0
        result_RF.loc[result_RF.value < 0, 'value'] = 0
        result_RF.loc[result_RF.value > 100, 'value'] = 100
        result_RF.loc[result_RF.sigma < 0, 'sigma'] = 0
        result.to_csv('results_morgan/' + str(k) + '/subchallenge2_' + str(feature_number) + '.txt', sep='\t', index=False)
        result_RF.to_csv('results_morgan_RF/' + str(k) + '/subchallenge2_' + str(feature_number) + '.txt', sep='\t', index=False)
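The written prediction files can then be scored against the held-out targets. A minimal sketch of one such check, the per-descriptor Pearson correlation between predicted and observed values (evaluate is a hypothetical helper; the official challenge scoring combined correlations across descriptor groups):

# Sketch only: Pearson r per descriptor for one split/feature-number combination.
def evaluate(pred_file, all_targets):
    pred = pd.read_csv(pred_file, sep='\t')
    corrs = {}
    for desc in pred['descriptor'].unique():
        p = pred[pred['descriptor'] == desc]
        merged = p.merge(all_targets[['#oID', desc]], on='#oID').dropna(subset=[desc])
        corrs[desc] = stats.pearsonr(merged['value'], merged[desc])[0]
    return pd.Series(corrs)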
In [ ]: