Prepares the target matrices with averaged values.
We build a separate target file for feature selection and one for training:
for feature selection we average over all available ratings;
for training we select the relevant values (intensity at 1/1,000 dilution, the other descriptors at 'high' intensity).


In [1]:
import pandas as pd
import numpy as np
import os

target data for feature selection

average all data for each compound

In [3]:
# load the training data 
data = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/TrainSet.txt"),sep='\t')
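
The `'__file__' + "/../../../.."` construction is just string concatenation: `os.path.abspath` joins its argument onto the current working directory and normalizes the result, so the literal `'__file__'` segment is cancelled by the first `..` and the remaining `..` components walk up from the notebook's working directory. To see where the path resolves on your machine (a quick check, not part of the pipeline):

print(os.path.abspath('__file__' + "/../../../../data/TrainSet.txt"))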

In [4]:
# drop the columns not needed for the selection targets
data.drop(['Intensity','Odor','Replicate','Dilution'], axis=1, inplace=True)
# rename the two id columns to match the leaderboard files
data.columns = ['#oID', 'individual'] + list(data.columns)[2:]
data.head()


Out[4]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 1 7 62.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 21.0 0.0 0.0 0.0 0.0 0.0 0.0
1 126 1 37 60.0 0.0 72.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 126 2 55 89.0 0.0 33.0 1.0 0.0 0.0 3.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
3 126 2 64 71.0 0.0 9.0 2.0 0.0 0.0 11.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
4 126 3 89 68.0 0.0 62.0 0.0 0.0 0.0 35.0 ... 0.0 62.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 23 columns


In [9]:
# load the leaderboard data and reshape it to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/LBs1.txt"), sep='\t')
# long to wide: one row per (#oID, individual) pair, one column per descriptor
LB_data_high = LB_data_high.pivot_table(index=['#oID','individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0,1], inplace=True)
# the source file has a stray leading space in ' CHEMICAL'
LB_data_high.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()


Out[9]:
descriptor #oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 1 73.0 14.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 49.0 0.0 0.0 0.0 0.0
1 243 2 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 243 3 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 243 4 2.0 89.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 84.0 0.0
4 243 5 61.0 31.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 23 columns
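
`pivot_table` turns the leaderboard file from long form (one row per rating) into the wide one-column-per-descriptor layout of the training data. A minimal sketch with hypothetical values:

toy = pd.DataFrame({'#oID': [243, 243, 243, 243],
                    'individual': [1, 1, 2, 2],
                    'descriptor': ['SWEET', 'FISH', 'SWEET', 'FISH'],
                    'value': [10.0, 0.0, 5.0, 2.0]})
toy.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
# descriptor       FISH  SWEET
# #oID individual
# 243  1            0.0   10.0
#      2            2.0    5.0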


In [11]:
# load the leaderboard low-intensity data and reshape it the same way
LB_data_low = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/leaderboard_set_Low_Intensity.txt"), sep='\t')
LB_data_low = LB_data_low.pivot_table(index=['#oID','individual'], columns='descriptor', values='value')
LB_data_low.reset_index(level=[0,1], inplace=True)
LB_data_low.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)  # fix the stray leading space
LB_data_low = LB_data_low[data.columns]
LB_data_low.head()


Out[11]:
descriptor #oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 1 22.0 78.0 0.0 76.0 73.0 0.0 0.0 76.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 243 2 39.0 35.0 0.0 0.0 0.0 0.0 0.0 5.0 ... 0.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 8.0
2 243 3 13.0 70.0 0.0 52.0 0.0 0.0 0.0 0.0 ... 0.0 46.0 0.0 0.0 0.0 0.0 0.0 0.0 40.0 0.0
3 243 4 12.0 87.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 83.0 0.0
4 243 5 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 23 columns


In [31]:
# put them all together
selection_data = pd.concat((data,LB_data_high,LB_data_low),ignore_index=True)

# replace descriptor data with np.nan if intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
      u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
      u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
      u'GRASS', u'FLOWER', u'CHEMICAL']:
    selection_data.loc[(selection_data['INTENSITY/STRENGTH'] == 0),descriptor] = np.nan
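
A zero intensity rating means the subject did not smell anything, so the descriptor values on that row are masked to NaN rather than kept as real zeros; the `groupby(...).mean()` below skips NaN, so those rows no longer pull the descriptor averages toward zero. A quick sanity check, assuming the column names above:

assert selection_data.loc[selection_data['INTENSITY/STRENGTH'] == 0, 'SWEET'].isnull().all()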

In [32]:
# average across all subjects and replicates for each compound
selection_data = selection_data.groupby('#oID').mean()
selection_data.drop('individual', axis=1, inplace=True)
selection_data.to_csv('targets_for_feature_selection.csv')
selection_data.head()


Out[32]:
INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD SOUR ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
#oID
126 37.102041 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 5.459459 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
176 8.051020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 13.517241 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
177 22.387755 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 8.800000 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000
196 14.530612 44.304348 1.304348 9.804348 0.913043 0.500000 3.239130 7.108696 2.152174 11.152174 ... 5.543478 6.695652 9.043478 7.304348 2.152174 4.217391 1.195652 1.543478 6.695652 7.847826
239 24.683673 51.724138 1.362069 13.500000 4.293103 1.482759 4.534483 6.189655 4.965517 9.241379 ... 3.241379 5.068966 6.534483 0.793103 0.931034 5.413793 3.120690 5.775862 9.396552 10.862069

5 rows × 21 columns

target data for training

filter out the relevant data for each compound

In [12]:
# load the training data again, this time keeping Intensity and Dilution
data = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/TrainSet.txt"), sep='\t')

data.drop(['Odor','Replicate'], axis=1, inplace=True)
data.columns = [u'#oID','Intensity','Dilution', u'individual', u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL']
data.head()


Out[12]:
#oID Intensity Dilution individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 low 1/1,000 1 7 62.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 21.0 0.0 0.0 0.0 0.0 0.0 0.0
1 126 high 1/10 1 37 60.0 0.0 72.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 126 low 1/1,000 2 55 89.0 0.0 33.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
3 126 high 1/10 2 64 71.0 0.0 9.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
4 126 low 1/1,000 3 89 68.0 0.0 62.0 0.0 0.0 ... 0.0 62.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 25 columns


In [13]:
# load the high-intensity leaderboard data (rated at 1/1,000 dilution) and reshape it as before
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/LBs1.txt"), sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID','individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0,1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)
# the trailing spaces are deliberate: they match the label format in TrainSet.txt
LB_data_high['Dilution'] = '1/1,000 '
LB_data_high['Intensity'] = 'high '
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()


Out[13]:
descriptor #oID Intensity Dilution individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 high 1/1,000 1 73.0 14.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 49.0 0.0 0.0 0.0 0.0
1 243 high 1/1,000 2 0.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 243 high 1/1,000 3 0.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 243 high 1/1,000 4 2.0 89.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 84.0 0.0
4 243 high 1/1,000 5 61.0 31.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 25 columns


In [14]:
# put them together
data = pd.concat((data,LB_data_high),ignore_index=True)
# replace descriptor data with np.nan if intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
      u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
      u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
      u'GRASS', u'FLOWER', u'CHEMICAL']:
    data.loc[(data['INTENSITY/STRENGTH'] == 0),descriptor] = np.nan

In [15]:
# average the duplicate ratings
data = data.groupby(['individual','#oID','Dilution','Intensity']).mean()
data.reset_index(level=[2,3], inplace=True)

# intensity is predicted from the 1/1,000-dilution ratings
data_int = data[data.Dilution == '1/1,000 ']

# every other descriptor is predicted from the high-intensity ratings
data = data[data.Intensity == 'high '].copy()  # .copy() so the assignment below acts on a real frame, not a view

In [18]:
# replace the intensity targets with the 1/1,000-dilution values
data['INTENSITY/STRENGTH'] = data_int['INTENSITY/STRENGTH']
data.drop(['Dilution','Intensity'], axis=1, inplace=True)
data.reset_index(level=[0,1], inplace=True)
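
The assignment in the first line works because both frames still carry the (individual, #oID) MultiIndex from the groupby: pandas aligns on index labels rather than row position, so each subject/compound pair picks up its own 1/1,000-dilution intensity, and pairs missing that dilution become NaN. A toy illustration with hypothetical values:

a = pd.DataFrame({'x': [1.0, 2.0]},
                 index=pd.MultiIndex.from_tuples([(1, 126), (1, 176)],
                                                 names=['individual', '#oID']))
b = pd.DataFrame({'x': [10.0, 20.0]},
                 index=pd.MultiIndex.from_tuples([(1, 176), (1, 126)],
                                                 names=['individual', '#oID']))
a['x'] = b['x']   # matched by label, not position: a['x'] is now [20.0, 10.0]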

In [19]:
data.head()


Out[19]:
individual #oID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 1 126 7.0 60.0 0.0 72.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 176 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1 177 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1 196 0.0 79.0 0.0 72.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1 239 21.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 23 columns


In [21]:
# average across subjects for each compound
data = data.groupby('#oID').mean()

In [22]:
data.shape


Out[22]:
(407, 22)

In [23]:
#save it
data.to_csv('target.csv')
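
A quick round-trip check of the saved file (not part of the pipeline):

pd.read_csv('target.csv', index_col='#oID').shape   # expect (407, 22)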
