In [3]:
from heapq import nlargest

import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.stats.stats import pearsonr
from sklearn import linear_model

In [4]:
# Input table and the upper bound on how many of its rows are read.
fileName = "test.csv"
rowsOperating = 100000

# Only the first `rowsOperating` data rows are loaded.
train = pd.read_csv(
    fileName,
    nrows=rowsOperating,
)

In [5]:
# Fingerprint the first molecule on its own: the resulting bit vector seeds
# `features`, which the next cell extends with the remaining rows.
# (`AllChem` comes from the notebook's top import cell.)
first_smiles = train.smiles[0]
m = Chem.MolFromSmiles(first_smiles)
if m is None:
    # MolFromSmiles reports an unparsable string by returning None; fail here
    # with the offending value instead of inside the fingerprint call.
    raise ValueError("row 0: could not parse SMILES %r" % (first_smiles,))

# Feature-based Morgan fingerprints at radius 1, 2 and 3, followed by hashed
# atom-pair and topological-torsion fingerprints.
x1 = AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=512, useFeatures=True)
x2 = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=512, useFeatures=True)
x3 = AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024, useFeatures=True)
x4 = AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=256)
x5 = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=256)

# `+` joins the bit vectors end to end: 512 + 512 + 1024 + 256 + 256 = 2560
# bits per molecule, which is the row width the later reshape relies on.
new_row = x1 + x2 + x3 + x4 + x5
features = new_row

In [6]:
# Fingerprint rows 1..N-1 and append them, in order, to the bit vector that
# already holds row 0 in `features`.
#
# The per-row vectors are collected in a list and joined afterwards. Adding
# each row directly onto the ever-growing vector would re-copy everything
# accumulated so far on every iteration (quadratic in the number of rows).
pending = [features]
for i in range(1, len(train.smiles)):
    if i % 10000 == 0:
        print(i)  # progress marker; the call form works on Python 2 and 3
    m = Chem.MolFromSmiles(train.smiles[i])
    if m is None:
        # MolFromSmiles returns None for an unparsable string. Skipping the row
        # would silently misalign the feature matrix, so stop and name the row.
        raise ValueError(
            "row %d: could not parse SMILES %r" % (i, train.smiles[i])
        )
    x1 = AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=512, useFeatures=True)
    x2 = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=512, useFeatures=True)
    x3 = AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024, useFeatures=True)
    x4 = AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=256)
    x5 = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=256)
    # 512 + 512 + 1024 + 256 + 256 = 2560 bits per molecule.
    new_row = x1 + x2 + x3 + x4 + x5
    pending.append(new_row)

# Join neighbouring vectors pairwise until one remains. Each pass halves the
# list while keeping the original row order, so the result is the same bit
# sequence as left-to-right concatenation with far less copying.
while len(pending) > 1:
    merged = [pending[k] + pending[k + 1] for k in range(0, len(pending) - 1, 2)]
    if len(pending) % 2:
        # An odd element out has no partner this round; carry it over as-is.
        merged.append(pending[-1])
    pending = merged
features = pending[0]


10000
20000
30000
40000
50000
60000
70000
80000
90000

In [7]:
# Total number of bits accumulated in `features`. The recorded output,
# 256000000, equals 100000 rows x 2560 bits per row.
len(features)


Out[7]:
256000000

In [8]:
# Turn the accumulated fingerprint bits into a NumPy array. It is still flat
# at this point; the next cell folds it into one row per molecule.
new_feats = np.array(features)

In [9]:
# Fold the flat bit array into one row per molecule, 2560 bits wide
# (512 + 512 + 1024 + 256 + 256 from the five fingerprints).
# The row count comes from the rows actually loaded rather than the requested
# cap `rowsOperating`: if the input file is shorter than the cap, fewer rows
# are read and a reshape to `rowsOperating` rows would fail. A genuine size
# mismatch still raises here.
new_feats = new_feats.reshape(len(train), 2560)

In [10]:
# Sanity check on the matrix dimensions; the recorded output is (100000, 2560).
new_feats.shape


Out[10]:
(100000, 2560)

In [11]:
# Wrap the feature matrix in a DataFrame so it can be written out with pandas.
a = pd.DataFrame(new_feats)

In [12]:
# Persist the feature matrix as CSV to the file "testing_0" (no extension).
# NOTE(review): pandas defaults apply, so the row index and the integer column
# labels are written too -- confirm that whatever reads this file expects them.
a.to_csv("testing_0")

In [ ]: