In [1]:
import pandas as pd
import numpy as np
import os
In [2]:
# load the data prepared by Russ and sort it by CID
train = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_train.csv")).sort_values('CID')
LB = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_lb.csv")).sort_values('CID')
test = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_test.csv")).sort_values('CID')
russdata = pd.concat((train, LB, test), ignore_index=True)
russdata.sort_values(['CID', 'Intensity'], inplace=True)
russdata.index = range(len(russdata))
russdata.head()
Out[2]:
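If the files loaded as expected, russdata should now hold two rows per molecule (one per dilution/intensity level), sorted by CID; a quick check, assuming that structure holds:

counts = russdata.groupby('CID').size()
print(counts.value_counts())  # expect a single entry mapping 2 -> number of CIDs, if the assumption holds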
In [3]:
# load the training data
data = pd.read_csv(os.path.abspath('__file__' + "/../../../data/TrainSet.txt"),sep='\t')
In [4]:
# drop metadata columns and rename the first two columns to match the leaderboard files
data.drop(['Intensity', 'Odor', 'Replicate', 'Dilution'], axis=1, inplace=True)
data.columns = ['#oID', 'individual'] + list(data.columns)[2:]
data.head()
Out[4]:
In [5]:
# load the leaderboard data and reshape it to match the training data
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../data/LBs1.txt"), sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0, 1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)  # the source file has a stray leading space
LB_data_high = LB_data_high[data.columns]  # same column order as the training data
LB_data_high.head()
Out[5]:
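The pivot above turns the long leaderboard file (one row per rating) into the wide, one-column-per-descriptor layout of the training set; a toy illustration with made-up values:

long = pd.DataFrame({'#oID': [126, 126], 'individual': [1, 1],
                     'descriptor': ['SWEET', 'FISH'], 'value': [40.0, 0.0]})
wide = long.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
# wide now has one row per (#oID, individual) and one column per descriptor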
In [6]:
# load the leaderboard low-intensity data and reshape it the same way
LB_data_low = pd.read_csv(os.path.abspath('__file__' + "/../../../data/leaderboard_set_Low_Intensity.txt"), sep='\t')
LB_data_low = LB_data_low.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_low.reset_index(level=[0, 1], inplace=True)
LB_data_low.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)
LB_data_low = LB_data_low[data.columns]
LB_data_low.head()
Out[6]:
In [10]:
# put them all together
selection_data = pd.concat((data, LB_data_high, LB_data_low), ignore_index=True)
# replace descriptor ratings with np.nan where the perceived intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
                   u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
                   u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                   u'GRASS', u'FLOWER', u'CHEMICAL']:
    # use .loc to avoid chained assignment, which may silently write to a copy
    selection_data.loc[selection_data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan
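The loop can also be collapsed into a single .loc assignment over all descriptor columns at once; a minimal equivalent sketch:

descriptors = [c for c in selection_data.columns
               if c not in ('#oID', 'individual', 'INTENSITY/STRENGTH')]
selection_data.loc[selection_data['INTENSITY/STRENGTH'] == 0, descriptors] = np.nan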
In [11]:
# average across individuals for each molecule
selection_data = selection_data.groupby('#oID').mean()
selection_data.reset_index(inplace=True)
selection_data.drop('individual', axis=1, inplace=True)
selection_data.head()
Out[11]:
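Note that the means above skip NaN by default, so the zero-intensity ratings masked earlier are excluded from the per-molecule averages rather than dragging them toward zero. For example:

pd.Series([np.nan, 40.0]).mean()  # 40.0, not 20.0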
In [12]:
# duplicate each molecule's row so the table lines up with russdata, which has two rows per CID
selection_data = pd.concat((selection_data, selection_data)).sort_values('#oID')
selection_data.index = range(len(selection_data))
# load the test CIDs so they can be removed from russdata
with open(os.path.abspath('__file__' + "/../../../data/CID_testset.txt")) as f:
    test_CIDs = [int(x) for x in f.readlines()]
intensity = russdata[~russdata.CID.isin(test_CIDs)].copy()
intensity.index = range(len(intensity))
print((intensity.CID != selection_data['#oID']).sum())  # 0 means the row orders match
intensity = intensity['INTENSITY/STRENGTH']
selection_data['INTENSITY/STRENGTH'] = intensity
selection_data.head()
Out[12]:
In [13]:
selection_data.shape
Out[13]:
In [14]:
selection_data.to_csv('targets_for_feature_selection.csv')
selection_data.head()
Out[14]:
In [30]:
# reload the training data, this time keeping the Intensity and Dilution columns
data = pd.read_csv(os.path.abspath('__file__' + "/../../../data/TrainSet.txt"), sep='\t')
data.drop(['Odor', 'Replicate'], axis=1, inplace=True)
data.columns = [u'#oID', 'Intensity', 'Dilution', u'individual', u'INTENSITY/STRENGTH',
                u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
                u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
                u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                u'GRASS', u'FLOWER', u'CHEMICAL']
data.head()
Out[30]:
In [31]:
# load the leaderboard (high-intensity) data and reshape it as before
LB_data_high = pd.read_csv(os.path.abspath('__file__' + "/../../../data/LBs1.txt"), sep='\t')
LB_data_high = LB_data_high.pivot_table(index=['#oID', 'individual'], columns='descriptor', values='value')
LB_data_high.reset_index(level=[0, 1], inplace=True)
LB_data_high.rename(columns={' CHEMICAL': 'CHEMICAL'}, inplace=True)
# the trailing spaces match the label formatting in TrainSet.txt
LB_data_high['Dilution'] = '1/1,000 '
LB_data_high['Intensity'] = 'high '
LB_data_high = LB_data_high[data.columns]
LB_data_high.head()
Out[31]:
In [32]:
# put them together
data = pd.concat((data, LB_data_high), ignore_index=True)
# replace descriptor ratings with np.nan where the perceived intensity is zero
for descriptor in [u'VALENCE/PLEASANTNESS', u'BAKERY', u'SWEET', u'FRUIT', u'FISH',
                   u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT', u'ACID', u'WARM',
                   u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                   u'GRASS', u'FLOWER', u'CHEMICAL']:
    data.loc[data['INTENSITY/STRENGTH'] == 0, descriptor] = np.nan
In [33]:
# average the replicate ratings
data = data.groupby(['individual', '#oID', 'Dilution', 'Intensity']).mean()
data.reset_index(level=[2, 3], inplace=True)
# keep only the high-intensity rows (note the trailing space in the label)
data = data[data.Intensity == 'high ']
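Since the filter depends only on the Intensity label, filtering before grouping is equivalent and avoids averaging low-intensity rows only to discard them; a sketch of the alternative order, starting from the concatenated frame before the groupby above:

high = data[data.Intensity == 'high ']  # assumes data is still the long, concatenated frame
high = high.groupby(['individual', '#oID', 'Dilution', 'Intensity']).mean()
high.reset_index(level=[2, 3], inplace=True)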
In [34]:
# drop Dilution and Intensity columns
data.drop(['Dilution', 'Intensity'], axis=1, inplace=True)
data.reset_index(level=[0,1], inplace=True)
In [35]:
# average across individuals for each molecule
data = data.groupby('#oID').mean()
data.shape
Out[35]:
In [36]:
data.head()
Out[36]:
In [37]:
# duplicate the matrix and pull in the INTENSITY/STRENGTH values from russdata
data.reset_index(inplace=True)
data = pd.concat((data, data)).sort_values('#oID')
data.index = range(len(data))
# remove the test targets from the russdata intensities (they are NaNs)
intensity = russdata[~russdata.CID.isin(test_CIDs)].copy()
intensity.index = range(len(intensity))
print((intensity.CID != data['#oID']).sum())  # check the row orders match - should print 0
intensity = intensity['INTENSITY/STRENGTH']
data['INTENSITY/STRENGTH'] = intensity
data.head()
Out[37]:
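If the printed zero-check feels easy to miss, a hard assertion makes the alignment requirement explicit; a sketch, re-deriving the CID column it compares against:

cids = russdata.loc[~russdata.CID.isin(test_CIDs), 'CID'].reset_index(drop=True)
assert (cids == data['#oID']).all(), 'russdata and target rows are misaligned'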
In [39]:
data.shape
Out[39]:
In [40]:
# save the final target matrix
data.to_csv('target.csv')
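Both CSVs are written with pandas' default integer index as the first column; if whatever reads them later does not expect that column (an assumption about the downstream code), pass index=False:

data.to_csv('target.csv', index=False)  # only if the downstream reader does not expect the index column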
In [ ]: