Creates the feature matrix by combining the molecular descriptor data with the Morgan similarity features.
The squared value of every feature is added as an extra column as well.


In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os

In [2]:
# Load the tab-separated molecular descriptors, index the rows by CID (sorted),
# and replace missing values with 0.
# NOTE: '__file__' is a quoted string here, not the module variable (which is
# undefined inside a notebook). abspath() collapses that dummy component with
# the first '..', so the path resolves to ../../../data relative to the
# current working directory.
descriptors_raw = (
    pd.read_csv(
        os.path.abspath('__file__' + "/../../../../data/molecular_descriptors_data.txt"),
        sep='\t',
    )
    .set_index('CID')
    .sort_index()
    .fillna(value=0)
)

# Min-max scale every descriptor column. The scaled ndarray is wrapped in a
# new DataFrame that reuses the original index and column labels; the former
# `descriptors.ix[:, :] = ...` assignment relied on the `.ix` indexer, which
# was removed in pandas 1.0.
min_max_scaler = preprocessing.MinMaxScaler()
descriptors = pd.DataFrame(
    min_max_scaler.fit_transform(descriptors_raw),
    index=descriptors_raw.index,
    columns=descriptors_raw.columns,
)

# Add the squared value of each scaled descriptor as an extra feature,
# named '<descriptor>_2'.
descriptors_squares = descriptors ** 2
descriptors_squares.columns = [name + '_2' for name in descriptors.columns]
descriptors = pd.concat((descriptors, descriptors_squares), axis=1)

descriptors.head()


Out[2]:
complexity from pubmed MW AMW Sv Se Sp Si Mv Me Mp ... Psychotic-80_2 Psychotic-50_2 Hypertens-80_2 Hypertens-50_2 Hypnotic-80_2 Hypnotic-50_2 Neoplastic-80_2 Neoplastic-50_2 Infective-80_2 Infective-50_2
CID
126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 0.104364 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 0.056546 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 0.061860 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 0.064137 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 0.066793 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 9738 columns


In [3]:
# Read the Morgan similarity matrix; the first CSV column becomes the row index.
morgan = pd.read_csv(
    os.path.abspath('__file__' + "/../../../../data/morgan_sim.csv"),
    index_col=0,
)

# Column labels are CIDs: force them to strings so a suffix can be appended.
morgan.columns = list(map(str, morgan.columns))

# Squared similarities become additional features, labelled '<CID>_2'.
morgan_squares = morgan.pow(2)
morgan_squares.columns = ['{}_2'.format(cid) for cid in morgan.columns]

# Place the squared block to the right of the original similarities.
morgan = pd.concat([morgan, morgan_squares], axis=1)

# Final feature matrix: descriptor features followed by Morgan features,
# joined column-wise on the row index.
features = pd.concat([descriptors, morgan], axis=1)
features.shape


Out[3]:
(476, 14612)

In [4]:
# Preview the first five rows of the combined feature matrix.
features.head(n=5)


Out[4]:
complexity from pubmed MW AMW Sv Se Sp Si Mv Me Mp ... 91305518_2 91411526_2 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2
CID
126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 0.104364 ... 0.000013 0.000331 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080
176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 0.056546 ... 0.000124 0.000205 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162
177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 0.061860 ... 0.000014 0.000092 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932
180 0.051167 0.104208 0.011542 0.121231 0.131248 0.127898 0.139362 0.099485 0.269504 0.064137 ... 0.000124 0.000205 0.003729 0.000930 0.000641 0.000094 0.000607 0.001961 0.000229 0.001850
196 0.221790 0.333247 0.023779 0.306622 0.308572 0.294339 0.305729 0.138079 0.539007 0.066793 ... 0.001029 0.000737 0.013662 0.009383 0.001954 0.000820 0.003130 0.005600 0.002189 0.010702

5 rows × 14612 columns


In [13]:
# Persist the feature matrix to 'features.csv' in the current working
# directory; the row index is written out as the first column.
features.to_csv('features.csv', index=True)

In [ ]: