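Averages the individual ratings of each compound (mean and standard deviation per descriptor),
then, for each of the 10 test splits, saves the rows for that split's compounds as LBsx_2
(x = split index 0-9, written without a file extension), in the same tab-separated format as the LBs2.txt file.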


In [1]:
import pandas as pd
import numpy as np

In [7]:
# Directory that holds the input files read below; edit it here when the
# data moves instead of touching every read_csv call.
DATA_DIR = '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/'

# Raw ratings table (tab-separated).
# NOTE: the variable is called `testset`, but the file read is TrainSet.txt.
testset = pd.read_csv(DATA_DIR + 'TrainSet.txt', sep='\t')

# Reference table (tab-separated) with columns #oID, descriptor, value, sigma.
lb_set = pd.read_csv(DATA_DIR + 'LBs2.txt', sep='\t')

In [ ]:


In [9]:
# Show the first five rows of the reference table to check its column layout.
lb_set.head(n=5)


Out[9]:
#oID descriptor value sigma
0 243 INTENSITY/STRENGTH 16.6327 25.5610
1 454 INTENSITY/STRENGTH 65.5102 27.7422
2 679 INTENSITY/STRENGTH 18.4286 28.3012
3 1030 INTENSITY/STRENGTH 3.8980 9.7345
4 1060 INTENSITY/STRENGTH 67.8980 26.3195

In [25]:
# Cross-validation split tables. They are read with header=None, so the first
# line of each file is treated as data and the columns get integer labels.
trainsplits = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/cv_splits_train_big.csv',
    header=None,
)
testsplits = pd.read_csv(
    '/media/gabor/C:/Users/gabor/Documents/final_ofaction_for_paper_2/data/cv_splits_test_big.csv',
    header=None,
)

In [11]:
# Show the first five rows of the raw ratings table.
testset.head(n=5)


Out[11]:
Compound Identifier Odor Replicate Intensity Dilution subject # INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 4-Hydroxybenzaldehyde NaN low 1/1,000 1 7 62 0 0 ... 0 0 0 21 0 0 0 0 0 0
1 126 4-Hydroxybenzaldehyde NaN high 1/10 1 37 60 0 72 ... 0 0 0 0 0 0 0 0 0 0
2 126 4-Hydroxybenzaldehyde NaN low 1/1,000 2 55 89 0 33 ... 0 0 0 0 0 0 0 0 0 5
3 126 4-Hydroxybenzaldehyde NaN high 1/10 2 64 71 0 9 ... 0 0 0 0 0 0 0 0 0 7
4 126 4-Hydroxybenzaldehyde NaN low 1/1,000 3 89 68 0 62 ... 0 62 0 0 0 0 0 0 0 0

5 rows × 27 columns


In [12]:
# Distinct compound identifiers, in order of first appearance in the table.
OID = pd.unique(testset['Compound Identifier'])

In [18]:
# Rating columns to summarise, in the order they are written to `data`.
descriptors = [u'INTENSITY/STRENGTH',
       u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
       u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
       u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS',
       u'FLOWER', u'CHEMICAL']

# One [compound id, descriptor, mean, std] row per compound and descriptor.
data = []
for descriptor in descriptors:
    # Progress marker. The single-argument call form prints the same text on
    # Python 2 and Python 3 (the bare `print x` statement is Python 2 only).
    print(descriptor)

    # Which rows are used depends only on the descriptor, so filter once here
    # rather than once per compound.
    # NOTE(review): both literals end with a space; presumably the file stores
    # the values that way -- confirm before "cleaning" them.
    if descriptor == 'INTENSITY/STRENGTH':
        # Intensity is summarised from the rows at the '1/1,000 ' dilution.
        rated = testset[testset.Dilution == '1/1,000 ']
    else:
        # Every other descriptor is summarised from the 'high ' intensity rows.
        rated = testset[testset.Intensity == 'high ']

    for oid in OID:
        selection = rated[rated['Compound Identifier'] == oid]
        # Compounds with no matching rows get no entry for this descriptor.
        if len(selection) > 0:
            mean = selection[descriptor].mean()
            std = selection[descriptor].std()
            data.append([oid, descriptor, mean, std])


INTENSITY/STRENGTH
VALENCE/PLEASANTNESS
BAKERY
SWEET
FRUIT
FISH
GARLIC
SPICES
COLD
SOUR
BURNT
ACID
WARM
MUSKY
SWEATY
AMMONIA/URINOUS
DECAYED
WOOD
GRASS
FLOWER
CHEMICAL

In [20]:
# Turn the collected rows into a frame that uses the same column labels as
# lb_set, so the two tables can be stacked. Passing the labels at construction
# time (instead of assigning `.columns` afterwards) also works when no rows
# were collected.
data = pd.DataFrame(data, columns=lb_set.columns)
# Display the size and the first rows as a quick check.
(data.shape, data.head())


Out[20]:
((7040, 4),    #oID          descriptor      value      sigma
 0   126  INTENSITY/STRENGTH  24.653061  27.807037
 1   177  INTENSITY/STRENGTH  33.265306  32.947417
 2   196  INTENSITY/STRENGTH   6.877551  13.541037
 3   239  INTENSITY/STRENGTH  21.163265  26.534056
 4   240  INTENSITY/STRENGTH  77.489796  17.239879)

In [21]:
# Stack the lb_set rows underneath the computed summaries and renumber the
# index from 0. NOTE: `data` is rebound to the result, so running this cell a
# second time appends lb_set again.
data = pd.concat([data, lb_set], ignore_index=True)

In [22]:
# (rows, columns) of the current `data` frame.
data.shape


Out[22]:
(8489, 4)

In [23]:
# Show the first five rows of the current `data` frame.
data.head(n=5)


Out[23]:
#oID descriptor value sigma
0 126 INTENSITY/STRENGTH 24.653061 27.807037
1 177 INTENSITY/STRENGTH 33.265306 32.947417
2 196 INTENSITY/STRENGTH 6.877551 13.541037
3 239 INTENSITY/STRENGTH 21.163265 26.534056
4 240 INTENSITY/STRENGTH 77.489796 17.239879

In [28]:
# Write one tab-separated file per cross-validation test split.
for k in range(10):
    # Row k of `testsplits` lists the compound ids belonging to split k.
    # `.loc` is the label-based replacement for `.ix`, which was removed in
    # pandas 1.0; on an integer index `.ix` was label-based as well.
    lb_CIDs = testsplits.loc[k, :].values
    # Keep only the rows whose compound id is in this split.
    lb_data = data[data['#oID'].isin(lb_CIDs)]
    # Row count per split, printed as a sanity check (call form works on
    # Python 2 and Python 3).
    print(len(lb_data))
    # NOTE(review): the notebook header calls these files LBsx_2.txt, but the
    # name written here has no extension -- confirm what downstream code expects.
    lb_data.to_csv('LBs' + str(k) + '_2', sep='\t', index=False)


1441
1441
1443
1441
1439
1437
1438
1442
1442
1444

In [ ]: