Creates the feature matrix from the molecular descriptor data plus the Morgan similarity features.
The squared value of every feature is added as an extra column as well.



In [1]:

    
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os



In [2]:

    
# Load the molecular descriptors (tab-separated) and index them by compound id (CID).
# The data directory is located relative to the kernel's current working directory.
# The previous spelling -- the string literal '__file__' followed by four '..'
# components -- resolved to exactly this location, because abspath() collapses the
# fake '__file__' component against the first '..'.
descriptors_path = os.path.abspath(
    os.path.join(os.pardir, os.pardir, os.pardir, 'data', 'molecular_descriptors_data.txt'))
descriptors = (
    pd.read_csv(descriptors_path, sep='\t')
    .set_index('CID')
    .sort_index()
    .fillna(value=0)  # missing descriptor values are treated as 0
)

# Rescale every descriptor column to the [0, 1] range.
# The scaled array is wrapped in a new DataFrame instead of being assigned through
# the `.ix` indexer, which was removed in pandas 1.0.
min_max_scaler = preprocessing.MinMaxScaler()
descriptors = pd.DataFrame(
    min_max_scaler.fit_transform(descriptors),
    index=descriptors.index,
    columns=descriptors.columns,
)

# Add squared values to the feature vector; squared columns carry a '_2' suffix.
descriptors_squares = (descriptors ** 2).add_suffix('_2')
descriptors = pd.concat((descriptors, descriptors_squares), axis=1)

descriptors.head()









    Out[2]:






  
    
      
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      Mp
      ...
      Psychotic-80_2
      Psychotic-50_2
      Hypertens-80_2
      Hypertens-50_2
      Hypnotic-80_2
      Hypnotic-50_2
      Neoplastic-80_2
      Neoplastic-50_2
      Infective-80_2
      Infective-50_2
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      0.181128
      0.270753
      0.030587
      0.262264
      0.219126
      0.253846
      0.214989
      0.216981
      0.425532
      0.104364
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      176
      0.060311
      0.109331
      0.025411
      0.096943
      0.105579
      0.090940
      0.107335
      0.125214
      0.659574
      0.056546
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      177
      0.020039
      0.067721
      0.015501
      0.075556
      0.083688
      0.078074
      0.089782
      0.106346
      0.382979
      0.061860
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      180
      0.051167
      0.104208
      0.011542
      0.121231
      0.131248
      0.127898
      0.139362
      0.099485
      0.269504
      0.064137
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      196
      0.221790
      0.333247
      0.023779
      0.306622
      0.308572
      0.294339
      0.305729
      0.138079
      0.539007
      0.066793
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 9738 columns



In [3]:

    
# Load the Morgan similarity features; the first CSV column becomes the row index.
# The path is resolved relative to the kernel's current working directory and points
# to the same file as the previous "'__file__' + four '..'" spelling did.
morgan_path = os.path.abspath(
    os.path.join(os.pardir, os.pardir, os.pardir, 'data', 'morgan_sim.csv'))
morgan = pd.read_csv(morgan_path, index_col=0)
# Convert the column names (CIDs) to strings.
morgan.columns = [str(name) for name in morgan.columns]

# Add squared values to the feature vector; squared columns are named CID + '_2'.
morgan_squares = (morgan ** 2).add_suffix('_2')
morgan = pd.concat((morgan, morgan_squares), axis=1)

# Join descriptor and Morgan features side by side (aligned on the row index).
# NOTE(review): concat(axis=1) performs an outer join, so index labels present in
# only one of the two frames would yield rows containing NaN -- presumably both
# frames are indexed by the same CIDs; confirm against the data files.
features = pd.concat((descriptors, morgan), axis=1)
features.shape









    Out[3]:





(476, 14612)



In [4]:

    
# Preview the first rows of the combined matrix: the descriptor columns come first,
# followed by the Morgan similarity columns (the order they were concatenated in).
features.head()









    Out[4]:






  
    
      
      complexity from pubmed
      MW
      AMW
      Sv
      Se
      Sp
      Si
      Mv
      Me
      Mp
      ...
      91305518_2
      91411526_2
      91541756_2
      91552833_2
      91563027_2
      91595028_2
      91614181_2
      91617014_2
      91617930_2
      91618238_2
    
    
      CID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      126
      0.181128
      0.270753
      0.030587
      0.262264
      0.219126
      0.253846
      0.214989
      0.216981
      0.425532
      0.104364
      ...
      0.000013
      0.000331
      0.014024
      0.000296
      0.021098
      0.000186
      0.003159
      0.002299
      0.000138
      0.011080
    
    
      176
      0.060311
      0.109331
      0.025411
      0.096943
      0.105579
      0.090940
      0.107335
      0.125214
      0.659574
      0.056546
      ...
      0.000124
      0.000205
      0.008391
      0.000930
      0.001442
      0.000094
      0.000607
      0.001362
      0.000229
      0.004162
    
    
      177
      0.020039
      0.067721
      0.015501
      0.075556
      0.083688
      0.078074
      0.089782
      0.106346
      0.382979
      0.061860
      ...
      0.000014
      0.000092
      0.000961
      0.000339
      0.000657
      0.000008
      0.000098
      0.000221
      0.000037
      0.001932
    
    
      180
      0.051167
      0.104208
      0.011542
      0.121231
      0.131248
      0.127898
      0.139362
      0.099485
      0.269504
      0.064137
      ...
      0.000124
      0.000205
      0.003729
      0.000930
      0.000641
      0.000094
      0.000607
      0.001961
      0.000229
      0.001850
    
    
      196
      0.221790
      0.333247
      0.023779
      0.306622
      0.308572
      0.294339
      0.305729
      0.138079
      0.539007
      0.066793
      ...
      0.001029
      0.000737
      0.013662
      0.009383
      0.001954
      0.000820
      0.003130
      0.005600
      0.002189
      0.010702
    
  

5 rows × 14612 columns



In [13]:

    
# Persist the combined feature matrix as 'features.csv'. The path is relative, so
# the file is written to the kernel's current working directory.
features.to_csv('features.csv')



In [ ]:

	complexity from pubmed	MW	AMW	Sv	Se	Sp	Si	Mv	Me	Mp	...	Psychotic-80_2	Psychotic-50_2	Hypertens-80_2	Hypertens-50_2	Hypnotic-80_2	Hypnotic-50_2	Neoplastic-80_2	Neoplastic-50_2	Infective-80_2	Infective-50_2
CID
126	0.181128	0.270753	0.030587	0.262264	0.219126	0.253846	0.214989	0.216981	0.425532	0.104364	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
176	0.060311	0.109331	0.025411	0.096943	0.105579	0.090940	0.107335	0.125214	0.659574	0.056546	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
177	0.020039	0.067721	0.015501	0.075556	0.083688	0.078074	0.089782	0.106346	0.382979	0.061860	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
180	0.051167	0.104208	0.011542	0.121231	0.131248	0.127898	0.139362	0.099485	0.269504	0.064137	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
196	0.221790	0.333247	0.023779	0.306622	0.308572	0.294339	0.305729	0.138079	0.539007	0.066793	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0