In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RandomizedLasso
import sys
import os
In [2]:
# Load the three data splits, each ordered by CID.
# NOTE: '__file__' is a literal string here, not the module attribute, so after
# os.path.abspath normalisation each path resolves to
# <current working directory>/../../data/derived/<file name>.
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement for ordering rows by column values.
train = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_train.csv")).sort_values('CID')
LB = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_lb.csv")).sort_values('CID')
test = pd.read_csv(os.path.abspath('__file__' + "/../../../data/derived/meansx_test.csv")).sort_values('CID')

# Stack the splits into one table, order rows by (CID, Intensity) and renumber
# them 0..n-1. Real booleans are used for the flags: current pandas validates
# `inplace`-style arguments and rejects integers such as 1.
russdata = (
    pd.concat((train, LB, test), ignore_index=True)
    .sort_values(['CID', 'Intensity'])
    .reset_index(drop=True)
)
russdata.head()
Out[2]:
In [3]:
# load the descriptors, index them by CID, order the rows by CID and fill nan
# values with 0
descriptors = (
    pd.read_csv(os.path.abspath('__file__' + "/../../../data/molecular_descriptors_data.txt"), sep='\t')
    .set_index('CID')
    # DataFrame.sort() without columns used to sort by index; it was removed
    # from pandas, so sort_index() is the equivalent call.
    .sort_index()
    .fillna(value=0)
)

# min-max scale every descriptor column. The scaled array is wrapped in a new
# DataFrame carrying the original index and column labels; this replaces the
# assignment through the removed `.ix` indexer and avoids writing float
# values into columns that may have been parsed as integers.
min_max_scaler = preprocessing.MinMaxScaler()
descriptors = pd.DataFrame(
    min_max_scaler.fit_transform(descriptors),
    index=descriptors.index,
    columns=descriptors.columns,
)

# add squared values to the feature vector (squared columns get a '_2' suffix)
descriptors_squares = descriptors ** 2
descriptors_squares.columns = [name + '_2' for name in descriptors.columns]
descriptors = pd.concat((descriptors, descriptors_squares), axis=1)
descriptors.head()
Out[3]:
In [4]:
# Read the morgan similarity table; the first CSV column becomes the row index.
morgan_path = os.path.abspath('__file__' + "/../../../data/morgan_sim.csv")
morgan = pd.read_csv(morgan_path, index_col=0)
# The column labels are CIDs: make sure they are strings so a suffix can be
# appended to them below.
morgan.columns = list(map(str, morgan.columns))
# Element-wise squares of the similarities, labelled '<CID>_2'.
morgan_squares = morgan.pow(2)
morgan_squares.columns = [label + '_2' for label in morgan.columns]
# Place raw and squared similarities side by side, then join them to the
# descriptor features column-wise (rows are matched on the index).
morgan = pd.concat((morgan, morgan_squares), axis=1)
features = pd.concat((descriptors, morgan), axis=1)
features.shape
Out[4]:
In [5]:
# Turn the row index into an ordinary first column. The result is rebound
# instead of using `inplace=1`: current pandas requires a real bool for
# `inplace` and raises on an integer.
features = features.reset_index()
# reset_index names the new column after the index (or 'index' when the index
# is unnamed), so the first column label is set to 'CID' explicitly.
features.columns = ['CID'] + list(features.columns)[1:]
features.head()
Out[5]:
In [6]:
# Duplicate every feature row so that each CID appears twice, keep the two
# copies adjacent by ordering on CID, and renumber the rows 0..n-1.
# (presumably one copy per intensity level of russdata -- TODO confirm)
# NOTE: this cell rebinds `features` to a doubled copy of itself, so running
# it more than once doubles the table again.
# DataFrame.sort() was removed from pandas (sort_values() replaces it) and the
# flags must be real booleans rather than 1.
features = (
    pd.concat((features, features), ignore_index=True)
    .sort_values('CID')
    .reset_index(drop=True)
)
print(features.shape)
features.head()
Out[6]:
In [7]:
# Append the 'neglog10d' and 'Intensity' columns of russdata to the features.
# NOTE(review): pd.concat(axis=1) pairs rows by index label, so this relies on
# features and russdata having the same number of rows in the same CID order
# -- confirm.
targets = russdata[['neglog10d', 'Intensity']]
features = pd.concat((features, targets), axis=1)
# Encode Intensity as 1 where it equals 'high' and 0 otherwise.
features['Intensity'] = features['Intensity'].eq('high').astype(int)
# Renumber the rows 0..n-1.
features = features.reset_index(drop=True)
features.head()
Out[7]:
In [8]:
# Write the assembled feature table to the current working directory. With the
# default to_csv settings the row index is written as the first column.
output_csv = 'features.csv'
features.to_csv(output_csv)
# Show the (rows, columns) of the table that was saved.
features.shape
Out[8]:
In [ ]: