Creates the feature matrix by combining the molecular descriptor data with the Morgan similarity features.
The squared value of every feature is added as an extra column as well.
In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
In [2]:
# Load the molecular descriptors, index them by CID (sorted), and fill NaN values with 0.
# The quoted '__file__' is a literal placeholder path component (not the module
# variable): abspath() collapses it together with the first '..', so the data/
# directory is looked up three levels above the current working directory.
descriptors = (
    pd.read_csv(os.path.abspath('__file__' + "/../../../../data/molecular_descriptors_data.txt"), sep='\t')
    .set_index('CID')
    .sort_index()
    .fillna(value=0)
)
# Rescale every descriptor column with a MinMaxScaler (default settings).
# The scaled array is wrapped in a new DataFrame that reuses the original row and
# column labels; this replaces the former `.ix[:, :] = ...` assignment, because the
# `.ix` indexer was removed in pandas 1.0.
min_max_scaler = preprocessing.MinMaxScaler()
descriptors = pd.DataFrame(
    min_max_scaler.fit_transform(descriptors),
    index=descriptors.index,
    columns=descriptors.columns,
)
# add squared values to the feature vector; the squared columns get a '_2' suffix
descriptors_squares = descriptors ** 2
descriptors_squares.columns = [name + '_2' for name in descriptors.columns]
descriptors = pd.concat((descriptors, descriptors_squares), axis=1)
descriptors.head()
Out[2]:
In [3]:
# Read the Morgan similarity features; the first CSV column becomes the row index.
morgan_path = os.path.abspath('__file__' + "/../../../../data/morgan_sim.csv")
morgan = pd.read_csv(morgan_path, index_col=0)
# Column labels are CIDs; make sure each one is a string.
morgan.columns = list(map(str, morgan.columns))
# Squared copy of every similarity feature, labelled CID + '_2'.
morgan_squares = morgan.pow(2).add_suffix('_2')
# Place the squared columns next to the originals.
morgan = pd.concat([morgan, morgan_squares], axis=1)
# Join descriptor features and Morgan features column-wise (aligned on the index).
features = pd.concat([descriptors, morgan], axis=1)
features.shape
Out[3]:
In [4]:
# preview the first rows of the combined feature matrix
features.head()
Out[4]:
In [13]:
# write the combined feature matrix to 'features.csv' (path is relative to the current working directory)
features.to_csv('features.csv')
In [ ]: