Averages the test results from the 10 splits
and saves the results in the form of LBsx_2.txt files,
similar to the LBs2.txt file.
In [1]:
import pandas as pd
import numpy as np
In [7]:
# Leaderboard table (tab-separated). Its column names are reused further down
# as the column names of the aggregated table.
lb_set = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/LBs2.txt',
    sep='\t'
)

# Raw ratings table (tab-separated).
# NOTE: the variable is named `testset`, but the file loaded is TrainSet.txt.
testset = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/TrainSet.txt',
    sep='\t'
)
In [ ]:
In [9]:
# Preview the first rows of the leaderboard table to check its columns.
lb_set.head()
Out[9]:
In [25]:
# Cross-validation split tables. The files have no header row, so pandas
# assigns default integer row and column labels.
testsplits = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/cv_splits_test_big.csv',
    header=None
)
trainsplits = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/cv_splits_train_big.csv',
    header=None
)
In [11]:
# Preview the first rows of the raw ratings table.
testset.head()
Out[11]:
In [12]:
# Distinct compound identifiers found in the ratings table; the aggregation
# loop below iterates over these.
OID = testset['Compound Identifier'].unique()
In [18]:
# Build one [compound id, descriptor, mean, std] row for every
# (descriptor, compound) pair that has at least one matching rating.
#
# Row selection rule:
#   * INTENSITY/STRENGTH is aggregated over rows with Dilution == '1/1,000 '
#   * every other descriptor is aggregated over rows with Intensity == 'high '
descriptors = [u'INTENSITY/STRENGTH',
               u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
               u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
               u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS',
               u'FLOWER', u'CHEMICAL']

# Neither filter depends on the loop variables, so apply each one once instead
# of once per (descriptor, compound) pair. The trailing spaces in the literals
# are kept exactly as in the original filters (presumably they match the raw
# file values -- confirm against TrainSet.txt before "cleaning" them).
diluted_rows = testset[testset.Dilution == '1/1,000 ']
high_rows = testset[testset.Intensity == 'high ']

data = []
for descriptor in descriptors:
    # Progress output; the call form works on both Python 2 and Python 3.
    print(descriptor)

    if descriptor == 'INTENSITY/STRENGTH':
        pool = diluted_rows
    else:
        pool = high_rows
    pool_ids = pool['Compound Identifier']

    for oid in OID:
        selection = pool[pool_ids == oid]
        # Compounds without any matching rating are left out entirely rather
        # than emitted as NaN rows.
        if len(selection) > 0:
            mean = selection[descriptor].mean()
            std = selection[descriptor].std()
            data.append([oid, descriptor, mean, std])
In [20]:
# Turn the accumulated rows into a DataFrame that uses the leaderboard column
# names. Passing `columns=` to the constructor (instead of assigning
# `.columns` afterwards) also works when no rows were collected: the result is
# then an empty frame with the right columns rather than a length-mismatch
# error.
data = pd.DataFrame(data, columns=lb_set.columns)
# Notebook display: dimensions and first rows of the aggregated table.
data.shape, data.head()
Out[20]:
In [21]:
# Append the original leaderboard rows to the aggregated rows and renumber the
# index from 0. NOTE: `data` is rebound here, so re-running this cell appends
# the leaderboard rows again.
data = pd.concat([data, lb_set], ignore_index=True)
In [22]:
# Row/column count of the combined table.
data.shape
Out[22]:
In [23]:
# Preview the first rows of the combined table.
data.head()
Out[23]:
In [28]:
# Save the cross-validation split data: one tab-separated file per split,
# named LBs<k>_2, containing the rows of `data` whose '#oID' belongs to the
# k-th test split.
# NOTE(review): the files are written without an extension, while the notebook
# header refers to LBsx_2.txt -- confirm which name downstream code expects.
for k in range(10):
    # Row k of the test-split table lists the compound ids of split k.
    # `.ix` was removed from pandas; `testsplits` is read with header=None and
    # therefore has the default integer index, for which `.loc` selects the
    # same row.
    lb_CIDs = testsplits.loc[k, :].values
    lb_data = data[data['#oID'].isin(lb_CIDs)]
    # Number of rows written for this split (call form works on Python 2 and 3).
    print(len(lb_data))
    lb_data.to_csv('LBs' + str(k) + '_2', sep='\t', index=False)
In [ ]: