In [2]:
import rdkit
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from sklearn import linear_model
from heapq import nlargest
from rdkit import Chem
In [5]:
# Input CSV; the path is relative, so it resolves against the kernel's working directory.
fileName = "train.csv"
# Load the whole file into a DataFrame. Later cells read its `smiles` column.
train = pd.read_csv(fileName)
In [6]:
# Display the number of rows loaded from the CSV.
len(train)
Out[6]:
In [3]:
# NOTE(review): `start` is not assigned in any cell visible in this notebook,
# so on a fresh kernel this cell raises NameError. It must be defined (together
# with `end`, used by the later cells) before this point.
#
# Keep only the rows whose label is >= start. `train` comes straight from
# pd.read_csv with its default 0..N-1 index, so for a non-negative `start` this
# selects exactly the rows the positional slice `train[start:]` selected.
# Unlike the positional slice, re-running the cell does not drop a further
# `start` rows each time, which would break the label lookups
# `train.smiles[start]` / `train.smiles[i]` in the following cells.
train = train.loc[start:]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(train.smiles))
In [4]:
from rdkit.Chem import AllChem

# Fingerprint the first molecule of the chunk (row label `start`) to seed
# `features`; the loop in the following cell appends rows start+1 .. end-1.
first_smiles = train.smiles[start]
m = Chem.MolFromSmiles(first_smiles)
if m is None:
    # MolFromSmiles returns None for a string it cannot parse. Fail here with
    # the offending row instead of an opaque argument error in the calls below.
    raise ValueError("could not parse SMILES at row %s: %r" % (start, first_smiles))

# Feature-based Morgan fingerprints at radius 1, 2 and 3.
x1 = AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=512, useFeatures=True)
x2 = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=512, useFeatures=True)
x3 = AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024, useFeatures=True)
# Hashed atom-pair and topological-torsion fingerprints.
x4 = AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=256)
x5 = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=256)

# Join the five bit vectors into one row of 512 + 512 + 1024 + 256 + 256 = 2560
# bits, matching the 2560 columns the reshape cell expects.
new_row = x1 + x2 + x3 + x4 + x5
features = new_row
In [5]:
# Fingerprint rows start+1 .. end-1 and append their bits to `features`, which
# already holds the row for `start` from the previous cell.
#
# Each 2560-bit row is converted to a small uint8 array and all rows are joined
# once at the end. The previous `features = features + new_row` re-copied the
# entire accumulated bit vector on every iteration, so the cost grew
# quadratically with the number of molecules. `features` is still a flat
# sequence of 0/1 values of length rows * 2560, so the later len(),
# np.array() and reshape cells work unchanged.
# NOTE(review): like the original, this cell is not idempotent. Re-running it
# appends the same rows to `features` a second time.
chunks = [np.array(features, dtype=np.uint8)]
for i in range(start + 1, end):
    if i % 10000 == 0:
        # Progress marker; single-argument print(...) works on Python 2 and 3.
        print(i)
    smiles_i = train.smiles[i]
    m = Chem.MolFromSmiles(smiles_i)
    if m is None:
        # MolFromSmiles returns None for a string it cannot parse. Report the
        # offending row instead of failing inside the fingerprint calls.
        raise ValueError("could not parse SMILES at row %s: %r" % (i, smiles_i))
    # Same five fingerprints, in the same order, as the seed row.
    x1 = AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=512, useFeatures=True)
    x2 = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=512, useFeatures=True)
    x3 = AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024, useFeatures=True)
    x4 = AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=256)
    x5 = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=256)
    new_row = x1 + x2 + x3 + x4 + x5
    chunks.append(np.array(new_row, dtype=np.uint8))
features = np.concatenate(chunks)
In [6]:
# Display the length of the accumulated fingerprint sequence. A later cell
# reshapes it to (end - start, 2560), so this should equal (end - start) * 2560.
len(features)
Out[6]:
In [7]:
# Materialise the accumulated fingerprint bits as a NumPy array; a later cell
# reshapes it into rows. Presumably one element per bit -- TODO confirm.
new_feats = np.array(features)
In [8]:
# One row per processed molecule; 2560 columns = 512 + 512 + 1024 + 256 + 256,
# the nBits of the five fingerprints joined per row. The explicit row count
# makes the reshape fail if the number of accumulated bits does not match.
new_feats = new_feats.reshape((end - start, 2560))
In [9]:
# Sanity check: display the array's shape after the reshape.
new_feats.shape
Out[9]:
In [10]:
# Wrap the feature matrix in a DataFrame (default integer row and column
# labels) so it can be written out with to_csv.
a = pd.DataFrame(new_feats)
In [11]:
# Write the features as CSV to "train<start>" in the working directory.
# NOTE(review): the name has no ".csv" extension, and with to_csv defaults the
# row index is written as the first column. Confirm downstream readers expect both.
a.to_csv("train" + str(start))
In [ ]: