In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os

In [2]:
# Load the train / leaderboard / test splits, stack them into one frame, and
# order rows by compound ID (CID) then Intensity so each CID's 'high'/'low'
# dilution rows sit next to each other.
# NOTE(review): '__file__' here is a string literal, not the module attribute
# (notebooks have no __file__) — abspath resolves it relative to the CWD.
# FIX: DataFrame.sort() was removed in pandas 0.20+ — use sort_values();
# also use real booleans instead of integer truthy flags.
train = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_train.csv")).sort_values('CID')
LB = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_lb.csv")).sort_values('CID')
test = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_test.csv")).sort_values('CID')
russdata = pd.concat((train, LB, test), ignore_index=True)
russdata.sort_values(['CID', 'Intensity'], inplace=True)
russdata.index = range(len(russdata))
russdata.head()


Out[2]:
CID source SMILES neglog10d Dilution Intensity INTENSITY/STRENGTH VALENCE/PLEASANTNESS bakery sweet ... name467 name468 name469 name470 name471 name472 name473 name474 name475 name476
0 126 train OC1=CC=C(C=O)C=C1 1 1/10 high 49.551020 48.956522 0.630435 24.347826 ... 0.008785 -0.007208 -0.004686 0.002605 -0.001857 0.005821 -0.004882 0.003520 -0.010491 0.000221
1 126 train OC1=CC=C(C=O)C=C1 3 1/1,000 low 24.653061 51.058824 0.411765 16.676471 ... 0.008785 -0.007208 -0.004686 0.002605 -0.001857 0.005821 -0.004882 0.003520 -0.010491 0.000221
2 176 train C(C)(=O)O 5 1/100,000 high 11.551020 48.461539 2.538462 6.692308 ... -0.002969 0.000474 0.001067 -0.004214 0.004671 0.006453 0.003067 -0.003971 -0.005304 0.001662
3 176 train C(C)(=O)O 7 1/10,000,000 low 4.551020 47.250000 0.562500 2.000000 ... -0.002969 0.000474 0.001067 -0.004214 0.004671 0.006453 0.003067 -0.003971 -0.005304 0.001662
4 177 train C(C)=O 3 1/1,000 high 33.265306 45.315790 8.421053 20.078947 ... 0.000275 -0.003037 0.001442 -0.003658 -0.005168 0.006038 -0.002373 0.000455 -0.000303 -0.003977

5 rows × 11731 columns


In [3]:
# load the descriptors and fill nan values with 0
descriptors = pd.read_csv(os.path.abspath('__file__' + "/../../../data/molecular_descriptors_data.txt"),sep='\t')
descriptors.set_index('CID', inplace=True)
descriptors.sort(inplace=True)
descriptors.fillna(value=0,inplace=True)
min_max_scaler = preprocessing.MinMaxScaler()
descriptors.ix[:,:]= min_max_scaler.fit_transform(descriptors)

# add squared values to the feature vector
descriptors_squares = descriptors**2
descriptors_squares.columns = [name + '_2' for name in descriptors.columns]
descriptors = pd.concat((descriptors,descriptors_squares),axis=1)

descriptors.head()


Out[3]:
complexity from pubmed MW AMW Sv Se Sp Si Mv Me Mp ... Psychotic-80_2 Psychotic-50_2 Hypertens-80_2 Hypertens-50_2 Hypnotic-80_2 Hypnotic-50_2 Neoplastic-80_2 Neoplastic-50_2 Infective-80_2 Infective-50_2
CID
126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 0.104364 ... 0 0 0 0 0 0 0 0 0 0
176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 0.056546 ... 0 0 0 0 0 0 0 0 0 0
177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 0.061860 ... 0 0 0 0 0 0 0 0 0 0
180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 0.064137 ... 0 0 0 0 0 0 0 0 0 0
196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 0.066793 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 9738 columns


In [4]:
# load morgan similarity features
morgan = pd.read_csv(os.path.abspath('__file__' + "/../../../data/morgan_sim.csv"), index_col=0)
# convert the column names (CIDs) to strings
morgan.columns = [str(name) for name in morgan.columns]

# add squared values to the feature vector
morgan_squares = morgan **2
# rename features to CID + '_2'
morgan_squares.columns = [name + '_2' for name in morgan.columns]
# concat
morgan = pd.concat((morgan,morgan_squares),axis=1) 

features = pd.concat((descriptors, morgan),axis=1)
features.shape


Out[4]:
(476, 14612)

In [5]:
# Move the CID index back into a regular column; the first column after
# reset_index holds the old index values, so force its name to 'CID'
# (guards against the index name being dropped by the concats above).
# FIX: use the boolean True rather than the integer truthy flag 1.
features.reset_index(inplace=True)
features.columns = ['CID'] + list(features.columns)[1:]
features.head()


Out[5]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
1 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
2 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932
3 180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 ... 0.000124 0.000205 0.003729 0.000930 0.000641 0.000094 0.000607 0.001961 0.000229 0.001850
4 196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 ... 0.001029 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702

5 rows × 14613 columns


In [6]:
# Duplicate every feature row: russdata carries two rows per CID (one per
# Intensity level), so the feature matrix must match it row-for-row.
# FIX: DataFrame.sort() was removed in pandas 0.20+ — use sort_values();
# use real booleans instead of integer truthy flags.
features = pd.concat((features, features), ignore_index=True)
features.sort_values('CID', inplace=True)  # align row order with russdata's CID order
features.index = range(len(features))
print(features.shape)
features.head()


(952, 14613)
Out[6]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932

5 rows × 14613 columns


In [7]:
# Attach the dilution (neglog10d) and Intensity columns from russdata;
# this relies on both frames sharing the same 0..n-1 positional row order.
features = pd.concat((features, russdata[['neglog10d', 'Intensity']]), axis=1)
# binarize Intensity: 1 for 'high', 0 for anything else
features['Intensity'] = (features['Intensity'] == 'high').astype(int)
features.index = range(len(features))
features.head()


Out[7]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 neglog10d Intensity
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1 1
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 3 0
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 5 1
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 7 0
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 3 1

5 rows × 14615 columns


In [8]:
# Persist the assembled feature matrix for downstream modeling notebooks,
# and display the final (rows, columns) shape as a sanity check.
features.to_csv('features.csv')
features.shape


Out[8]:
(952, 14615)

In [ ]: