Prepares the target matrix with averaged values.
We build two separate target files: one for feature selection and one for training.
For feature selection we average over all available ratings;
for training we keep only the relevant rows (the 1/1,000 dilution for the intensity target, 'high' intensity for all other descriptors).
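A minimal sketch of that row-selection rule on invented values (the real filtering happens further down; this toy cell imports pandas itself so it can run standalone):
In [ ]:
import pandas as pd  # toy cell, runnable on its own

# illustrative only: mimic the layout of the real ratings table
toy = pd.DataFrame({'#oID': [1, 1], 'Dilution': ['1/1,000 ', '1/10 '],
                    'Intensity': ['low ', 'high '],
                    'INTENSITY/STRENGTH': [20.0, 80.0]})
toy_int = toy[toy.Dilution == '1/1,000 ']   # rows feeding the intensity target
toy_rest = toy[toy.Intensity == 'high ']    # rows feeding all other descriptors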
In [1]:
import pandas as pd
import numpy as np
import os
In [3]:
# load the training data; the '__file__' string trick resolves the path relative to the working directory
data = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/TrainSet.txt"), sep='\t')
In [4]:
data.drop(['Intensity', 'Odor', 'Replicate', 'Dilution'], axis=1, inplace=True)
data.columns = ['#oID', 'individual'] + list(data.columns)[2:]
data.head()
Out[4]:
In [9]:
# load leaderboard data and reshape it to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/LBs1.txt"), sep='\t')
# pivot from long format (one row per rating) to wide format (one column per descriptor)
LB_data_high = LB_data_high.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0, 1], inplace=True)
# the raw file names this descriptor ' CHEMICAL', with a leading space
LB_data_high.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()
Out[9]:
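The pivot above turns the leaderboard file's long format (one row per rating) into the wide per-descriptor layout of the training set. A toy illustration of the same reshape, with invented values:
In [ ]:
# illustrative only: long-format ratings -> one column per descriptor
toy = pd.DataFrame({'#oID': [10, 10], 'individual': [1, 1],
                    'descriptor': ['SWEET', 'SOUR'], 'value': [40.0, 5.0]})
toy.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
# -> columns SOUR and SWEET, one row for (#oID=10, individual=1)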
In [11]:
# load the low-intensity leaderboard data and reshape it the same way
LB_data_low = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/leaderboard_set_Low_Intensity.txt"), sep='\t')
LB_data_low = LB_data_low.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_low.reset_index(level=[0, 1], inplace=True)
LB_data_low.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)
LB_data_low = LB_data_low[data.columns]
LB_data_low.head()
Out[11]:
In [31]:
# put them all together
selection_data = pd.concat((data, LB_data_high, LB_data_low), ignore_index=True)
# where perceived intensity is zero the subject smelled nothing, so the
# remaining descriptor ratings carry no information; mask them with np.nan
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
u'GRASS', u'FLOWER', u'CHEMICAL']:
    selection_data.loc[selection_data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan
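A quick sanity check, not part of the original notebook, that the masking behaved as intended:
In [ ]:
# illustrative: zero-intensity rows should now hold NaN in every masked descriptor
zero_rows = selection_data[selection_data['INTENSITY/STRENGTH'] == 0]
print(zero_rows[[u'SWEET', u'CHEMICAL']].notnull().sum())  # expect all zeros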
In [32]:
# average over all individuals and conditions per compound
selection_data = selection_data.groupby('#oID').mean()
selection_data.drop('individual', axis=1, inplace=True)
selection_data.to_csv('targets_for_feature_selection.csv')
selection_data.head()
Out[32]:
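As an optional check, the saved file can be read back to confirm it holds one averaged row per compound:
In [ ]:
# illustrative: reload and confirm the index holds unique compound IDs
check = pd.read_csv('targets_for_feature_selection.csv', index_col='#oID')
assert check.index.is_unique
check.shape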
In [12]:
# reload the training data, this time keeping the Dilution and Intensity columns
data = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/TrainSet.txt"), sep='\t')
data.drop(['Odor', 'Replicate'], axis=1, inplace=True)
data.columns = [u'#oID','Intensity','Dilution', u'individual', u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL']
data.head()
Out[12]:
In [13]:
# load the leaderboard data and reshape it to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../../data/LBs1.txt"), sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0, 1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)
# trailing spaces deliberately match the labels in the raw training file
LB_data_high['Dilution'] = '1/1,000 '
LB_data_high['Intensity'] = 'high '
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()
Out[13]:
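The trailing spaces in '1/1,000 ' and 'high ' are deliberate: they mirror the labels in the raw training file, so the string filters further down match both sources. An illustrative way to inspect the exact label strings:
In [ ]:
# illustrative: show the distinct dilution/intensity labels, whitespace included
print([repr(v) for v in data.Dilution.unique()])
print([repr(v) for v in data.Intensity.unique()])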
In [14]:
# put them together
data = pd.concat((data,LB_data_high),ignore_index=True)
# replace descriptor data with np.nan if intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
u'GRASS', u'FLOWER', u'CHEMICAL']:
    data.loc[data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan
In [15]:
# average the duplicate ratings per (individual, compound, dilution, intensity)
data = data.groupby(['individual', '#oID', 'Dilution', 'Intensity']).mean()
data.reset_index(level=[2, 3], inplace=True)
# rows used for the intensity target (1/1,000 dilution)
data_int = data[data.Dilution == '1/1,000 ']
# rows used for every other descriptor ('high' intensity)
data = data[data.Intensity == 'high ']
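The index-aligned assignment in the next cell relies on both slices sharing the (individual, #oID) MultiIndex. An optional, illustrative check that the intensity slice covers the pairs in data:
In [ ]:
# illustrative: pairs in data that have no 1/1,000-dilution counterpart
print(len(data.index.difference(data_int.index)))  # 0 means full coverage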
In [18]:
# overwrite INTENSITY/STRENGTH with the 1/1,000-dilution values;
# the assignment aligns the two frames on their shared (individual, #oID) index
data['INTENSITY/STRENGTH'] = data_int['INTENSITY/STRENGTH']
data.drop(['Dilution', 'Intensity'], axis=1, inplace=True)
data.reset_index(level=[0, 1], inplace=True)
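If any pair lacked a 1/1,000 rating, the aligned assignment leaves a NaN behind; a quick, illustrative count makes that visible:
In [ ]:
# illustrative: intensity values left missing by the aligned assignment
print(data['INTENSITY/STRENGTH'].isnull().sum())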
In [19]:
data.head()
Out[19]:
In [21]:
# average over individuals: one target row per compound
data = data.groupby('#oID').mean()
In [22]:
data.shape
Out[22]:
In [23]:
# save the training targets
data.to_csv('target.csv')
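A minimal usage sketch for downstream training code (file name as written above):
In [ ]:
# illustrative: reload the saved targets indexed by compound ID
targets = pd.read_csv('target.csv', index_col='#oID')
targets.shape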