In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# load the data prepared by Russ, ordered by CID
train = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_train.csv")).sort_values('CID')
LB = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_lb.csv")).sort_values('CID')
test = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_test.csv")).sort_values('CID')
russdata = pd.concat((train, LB, test), ignore_index=True)
russdata.sort_values(['CID', 'Intensity'], inplace=True)
russdata.reset_index(drop=True, inplace=True)
russdata.head()


Out[2]:
CID source SMILES neglog10d Dilution Intensity INTENSITY/STRENGTH VALENCE/PLEASANTNESS bakery sweet ... name467 name468 name469 name470 name471 name472 name473 name474 name475 name476
0 126 train OC1=CC=C(C=O)C=C1 1 1/10 high 49.551020 48.956522 0.630435 24.347826 ... 0.008785 -0.007208 -0.004686 0.002605 -0.001857 0.005821 -0.004882 0.003520 -0.010491 0.000221
1 126 train OC1=CC=C(C=O)C=C1 3 1/1,000 low 24.653061 51.058824 0.411765 16.676471 ... 0.008785 -0.007208 -0.004686 0.002605 -0.001857 0.005821 -0.004882 0.003520 -0.010491 0.000221
2 176 train C(C)(=O)O 5 1/100,000 high 11.551020 48.461539 2.538462 6.692308 ... -0.002969 0.000474 0.001067 -0.004214 0.004671 0.006453 0.003067 -0.003971 -0.005304 0.001662
3 176 train C(C)(=O)O 7 1/10,000,000 low 4.551020 47.250000 0.562500 2.000000 ... -0.002969 0.000474 0.001067 -0.004214 0.004671 0.006453 0.003067 -0.003971 -0.005304 0.001662
4 177 train C(C)=O 3 1/1,000 high 33.265306 45.315790 8.421053 20.078947 ... 0.000275 -0.003037 0.001442 -0.003658 -0.005168 0.006038 -0.002373 0.000455 -0.000303 -0.003977

5 rows × 11731 columns
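Everything downstream assumes exactly one high- and one low-intensity row per compound. A quick sanity check (a sketch, assuming that layout holds for every CID):

# optional: every CID should contribute exactly two rows, one per intensity level
assert (russdata.groupby('CID').size() == 2).all()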

Target data for feature selection

Average all ratings across subjects and dilutions for each compound

In [3]:
# load the training data 
data = pd.read_csv(os.path.abspath('__file__' + "/../../../data/TrainSet.txt"),sep='\t')

In [4]:
data.drop(['Intensity','Odor','Replicate','Dilution'], axis=1, inplace=True)
data.columns = ['#oID', 'individual'] + list(data.columns)[2:]
data.head()


Out[4]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 1 7 62 0 0 0 0 0 0 ... 0 0 0 21 0 0 0 0 0 0
1 126 1 37 60 0 72 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 126 2 55 89 0 33 1 0 0 3 ... 0 0 0 0 0 0 0 0 0 5
3 126 2 64 71 0 9 2 0 0 11 ... 0 0 0 0 0 0 0 0 0 7
4 126 3 89 68 0 62 0 0 0 35 ... 0 62 0 0 0 0 0 0 0 0

5 rows × 23 columns


In [5]:
# load leaderboard data and reshape them to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../data/LBs1.txt"),sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID','individual'],columns='descriptor',values='value')
LB_data_high.reset_index(level=[0,1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)  # the raw file has a stray leading space
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()


Out[5]:
descriptor #oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 1 73 14 0 0 0 0 0 0 ... 0 0 0 0 0 49 0 0 0 0
1 243 2 0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 243 3 0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 243 4 2 89 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 0
4 243 5 61 31 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 23 columns
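The leaderboard file is in long format (one row per #oID/individual/descriptor triple), and pivot_table widens it into one column per descriptor. A toy illustration with made-up values:

long = pd.DataFrame({'#oID': [243, 243], 'individual': [1, 1],
                     'descriptor': ['SWEET', 'SOUR'], 'value': [10, 2]})
wide = long.pivot_table(index=['#oID','individual'], columns='descriptor', values='value')
# wide has one row per (#oID, individual) pair and one column per descriptor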


In [6]:
# load leaderboard low intensity data and reshape them to match the training data
LB_data_low = pd.read_csv(os.path.abspath('__file__' + "/../../../data/leaderboard_set_Low_Intensity.txt"),sep='\t')
LB_data_low = LB_data_low.pivot_table(index=['#oID','individual'],columns='descriptor',values='value')
LB_data_low.reset_index(level=[0,1], inplace=True)
LB_data_low.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)  # same stray leading space as above
LB_data_low = LB_data_low[data.columns]
LB_data_low.head()


Out[6]:
descriptor #oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 1 22 78 0 76 73 0 0 76 ... 0 0 0 0 0 0 0 0 0 0
1 243 2 39 35 0 0 0 0 0 5 ... 0 0 0 0 7 0 0 0 0 8
2 243 3 13 70 0 52 0 0 0 0 ... 0 46 0 0 0 0 0 0 40 0
3 243 4 12 87 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 83 0
4 243 5 0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 23 columns


In [10]:
# put them all together
selection_data = pd.concat((data,LB_data_high,LB_data_low),ignore_index=True)

# replace descriptor data with np.nan if intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
      u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
      u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
      u'GRASS', u'FLOWER', u'CHEMICAL']:
    selection_data.loc[selection_data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan
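The same masking works without the Python loop. A vectorized sketch, assuming every column other than the ID, individual, and intensity columns is a descriptor:

# mask all descriptor columns at once wherever intensity is zero
descriptors = [c for c in selection_data.columns
               if c not in ('#oID', 'individual', 'INTENSITY/STRENGTH')]
mask = selection_data['INTENSITY/STRENGTH'] == 0
selection_data.loc[mask, descriptors] = np.nan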

In [11]:
selection_data = selection_data.groupby('#oID').mean()
selection_data.reset_index(inplace=True)
selection_data.drop('individual', axis=1, inplace=True)
selection_data.head()


Out[11]:
descriptor #oID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 37.102041 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
1 176 8.051020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
2 177 22.387755 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000
3 196 14.530612 44.304348 1.304348 9.804348 0.913043 0.500000 3.239130 7.108696 2.152174 ... 5.543478 6.695652 9.043478 7.304348 2.152174 4.217391 1.195652 1.543478 6.695652 7.847826
4 239 24.683673 51.724138 1.362069 13.500000 4.293103 1.482759 4.534483 6.189655 4.965517 ... 3.241379 5.068966 6.534483 0.793103 0.931034 5.413793 3.120690 5.775862 9.396552 10.862069

5 rows × 22 columns
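groupby().mean() skips NaNs, which is exactly why the zero-intensity ratings were masked with np.nan rather than left at zero: they drop out of the averages instead of dragging them down. A toy check:

toy = pd.DataFrame({'#oID': [1, 1], 'SWEET': [np.nan, 10.0]})
print(toy.groupby('#oID').mean())  # SWEET averages to 10.0; the NaN is ignored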


In [12]:
# duplicate each compound's row so it lines up with the high/low intensity rows in russdata
selection_data = pd.concat((selection_data, selection_data)).sort_values('#oID')
selection_data.reset_index(drop=True, inplace=True)

# load test CIDs to remove them from russdata
with open(os.path.abspath('__file__' + "/../../../data/CID_testset.txt")) as f:
    test_CIDs = [int(line) for line in f]

intensity = russdata[~russdata.CID.isin(test_CIDs)].reset_index(drop=True)

# check that the row orders match - should print 0 if they do
print((intensity.CID != selection_data['#oID']).sum())

intensity = intensity['INTENSITY/STRENGTH']

selection_data['INTENSITY/STRENGTH'] = intensity
selection_data.head()


0
Out[12]:
descriptor #oID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 49.551020 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
1 126 24.653061 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
2 176 11.551020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
3 176 4.551020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
4 177 33.265306 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000

5 rows × 22 columns
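The printed 0 above confirms the row orders agree; a hard assertion (a sketch on the same variables) would fail loudly instead of relying on eyeballing the output:

# stop immediately if the CID order ever drifts out of sync with the targets
assert (russdata.loc[~russdata.CID.isin(test_CIDs), 'CID'].values
        == selection_data['#oID'].values).all()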


In [13]:
selection_data.shape


Out[13]:
(814, 22)

In [14]:
selection_data.to_csv('targets_for_feature_selection.csv')
selection_data.head()


Out[14]:
descriptor #oID INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 49.551020 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
1 126 24.653061 50.081081 0.500000 21.959459 7.405405 0.175676 2.162162 4.554054 4.662162 ... 4.094595 2.486486 7.216216 1.391892 2.554054 4.675676 0.891892 1.662162 8.094595 15.283784
2 176 11.551020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
3 176 4.551020 45.344828 2.275862 5.103448 1.137931 0.000000 6.448276 5.965517 4.793103 ... 3.896552 5.448276 6.448276 3.551724 3.275862 4.275862 2.413793 2.482759 6.724138 7.724138
4 177 33.265306 48.418182 9.363636 19.781818 3.000000 0.763636 1.254545 2.472727 6.709091 ... 3.563636 3.218182 6.218182 1.945455 2.727273 3.872727 0.727273 3.454545 4.090909 14.200000

5 rows × 22 columns

Target data for training

Filter to the relevant (high-intensity) data for each compound

In [30]:
# load the training data
data = pd.read_csv(os.path.abspath('__file__' + "/../../../data/TrainSet.txt"),sep='\t')

data.drop(['Odor','Replicate'], axis=1, inplace=True)
data.columns = [u'#oID','Intensity','Dilution', u'individual', u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL']
data.head()


Out[30]:
#oID Intensity Dilution individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 low 1/1,000 1 7 62 0 0 0 0 ... 0 0 0 21 0 0 0 0 0 0
1 126 high 1/10 1 37 60 0 72 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 126 low 1/1,000 2 55 89 0 33 1 0 ... 0 0 0 0 0 0 0 0 0 5
3 126 high 1/10 2 64 71 0 9 2 0 ... 0 0 0 0 0 0 0 0 0 7
4 126 low 1/1,000 3 89 68 0 62 0 0 ... 0 62 0 0 0 0 0 0 0 0

5 rows × 25 columns


In [31]:
# load leaderboard data and tag it with Dilution/Intensity to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../data/LBs1.txt"),sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID','individual'],columns='descriptor',values='value')
LB_data_high.reset_index(level=[0,1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL':'CHEMICAL'}, inplace=True)
LB_data_high['Dilution'] = '1/1,000 '  # trailing space matches the TrainSet.txt label format
LB_data_high['Intensity'] = 'high '    # ditto; the filter below depends on it
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()


Out[31]:
descriptor #oID Intensity Dilution individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 243 high 1/1,000 1 73 14 0 0 0 0 ... 0 0 0 0 0 49 0 0 0 0
1 243 high 1/1,000 2 0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 243 high 1/1,000 3 0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 243 high 1/1,000 4 2 89 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 0
4 243 high 1/1,000 5 61 31 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 25 columns
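The trailing spaces in '1/1,000 ' and 'high ' appear to mirror the label formatting in TrainSet.txt, which the high-intensity filter below relies on. A quick check (a sketch) makes that assumption explicit:

# verify the training labels really carry the trailing space this notebook assumes
assert 'high ' in set(data['Intensity'])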


In [32]:
# put them together
data = pd.concat((data,LB_data_high),ignore_index=True)
# replace descriptor data with np.nan if intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
      u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
      u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
      u'GRASS', u'FLOWER', u'CHEMICAL']:
    data.loc[data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan

In [33]:
# average the duplicates 
data = data.groupby(['individual','#oID','Dilution','Intensity']).mean() 
data.reset_index(level=[2,3], inplace=True) 

# keep only the high-intensity rows (note the trailing space in the label)
data = data[data.Intensity == 'high ']

In [34]:
# drop Dilution and Intensity columns
data.drop(['Dilution','Intensity'], axis=1, inplace=True)
data.reset_index(level=[0,1], inplace=True)

In [35]:
data = data.groupby('#oID').mean()
data.shape


Out[35]:
(407, 22)

In [36]:
data.head()


Out[36]:
descriptor individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES COLD ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
#oID
126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 4.116279 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 7.666667 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 7.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118
196 25 22.183673 51.000000 2.000000 15.000000 1.500000 0.000000 2.178571 7.035714 3.107143 ... 3.678571 7.642857 7.750000 6.964286 3.178571 5.071429 1.357143 2.535714 11.000000 7.392857
239 25 28.204082 51.411765 0.735294 8.705882 2.558824 1.088235 2.558824 2.617647 7.823529 ... 4.235294 4.264706 4.705882 0.588235 0.205882 3.617647 5.323529 7.470588 6.294118 11.411765

5 rows × 22 columns


In [37]:
# duplicate the matrix so each compound lines up with the high/low intensity rows in russdata
data.reset_index(inplace=True)
data = pd.concat((data, data)).sort_values('#oID')
data.reset_index(drop=True, inplace=True)

# remove test targets from russdata intensity (NaNs)
intensity = russdata[~russdata.CID.isin(test_CIDs)].reset_index(drop=True)

# check that the row orders match - should print 0 if they do
print((intensity.CID != data['#oID']).sum())

intensity = intensity['INTENSITY/STRENGTH']

data['INTENSITY/STRENGTH'] = intensity
data.head()


0
Out[37]:
descriptor #oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
2 176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
3 176 25 4.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
4 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118

5 rows × 23 columns


In [39]:
data.shape


Out[39]:
(814, 23)

In [40]:
# save the training targets
data.to_csv('target.csv')
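An optional round-trip check (a sketch): reload the file and confirm the shape survived the write.

reloaded = pd.read_csv('target.csv', index_col=0)
assert reloaded.shape == data.shape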
