In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator

In [6]:
#Data Cleaning
data = pd.read_excel("inputdata.xlsx")
# ELE_COD stores "<value>±<error>" as text.  Split each entry on the sign and
# convert both halves to float so EC_value / EC_error become numeric columns
# instead of strings (float() tolerates surrounding whitespace).  A malformed
# entry raises here, at cleaning time, rather than later during feature building.
data['EC_value'], data['EC_error'] = zip(
    *data['ELE_COD'].map(lambda x: [float(part) for part in x.split('±')])
)
print(data.shape)
# Sanity check for the 2523-row sheet used here: slicing from position 2523
# onwards should print an empty frame.
print(data[2523:2525])


(2523, 9)
Empty DataFrame
Columns: [NUM, A, B, MOLFRC_A, T, P, ELE_COD, EC_value, EC_error]
Index: []

In [7]:
#Generate Selective Descriptors
def gen_selec_des(data):
    """Compute the "selective" RDKit descriptors for every A/B pair and save them.

    Descriptor names are read from the file ``Deslist_selective`` (one name per
    line) in the current working directory.  For each row of ``data`` whose
    ``A`` and ``B`` SMILES both parse, one feature row is produced::

        [descriptors of A | descriptors of B | T | P | MOLFRC_A]

    Rows with an unparsable SMILES are reported on stdout and left out, so the
    saved arrays contain only the valid rows, in their original order, and the
    features, targets and errors of a given row always line up.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``A``, ``B``, ``T``, ``P``, ``MOLFRC_A``,
        ``EC_value`` and ``EC_error``.  ``EC_value`` / ``EC_error`` may be
        numbers or numeric strings.

    Returns
    -------
    None
        Results are written with ``np.savetxt`` to ``Selective_descriptors_X``,
        ``Selective_descriptors_Y`` and ``Selective_descriptors_error``.
    """
    # `with` closes the descriptor list file even if reading fails.
    with open('Deslist_selective', 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    print("Generating selective descriptors:")
    print(Deslist)
    calc = Calculator(Deslist)

    n = data.shape[0]
    D = len(Deslist)
    d = 2 * D + 3
    X = np.zeros((n, d))
    Y = np.zeros((n, 1))
    Z = np.zeros((n, 1))

    # Iterate the columns in lockstep: this is positional, so it does not
    # depend on the frame having a default 0..n-1 index.
    rows = zip(data['A'], data['B'], data['T'], data['P'], data['MOLFRC_A'],
               data['EC_value'], data['EC_error'])
    c = 0  # number of valid rows written so far
    for i, (smi_a, smi_b, temp, pres, frac, ec_val, ec_err) in enumerate(rows):
        A = Chem.MolFromSmiles(smi_a)
        B = Chem.MolFromSmiles(smi_b)
        if A is None:
            print("%d A is invalid"%(i))
            continue
        if B is None:
            print("%d B is invalid"%(i))
            continue
        X[c, :D] = calc.CalcDescriptors(A)
        X[c, D:2*D] = calc.CalcDescriptors(B)
        # Conditions are written at the packed position `c`, next to the
        # descriptors of the same source row, so skipped rows cannot shift them.
        X[c, -3] = temp
        X[c, -2] = pres
        X[c, -1] = frac
        Y[c] = float(ec_val)
        Z[c] = float(ec_err)
        c += 1

    # Keep only the rows that were filled; skipped rows would otherwise show
    # up as all-zero samples at the end of each file.
    np.savetxt('Selective_descriptors_X', X[:c])
    np.savetxt('Selective_descriptors_Y', Y[:c])
    np.savetxt('Selective_descriptors_error', Z[:c])

In [8]:
def gen_2D_des(data):
    """Compute the 2D RDKit descriptors for every A/B pair and save them.

    Descriptor names are read from the file ``Deslist_2D`` (one name per line)
    in the current working directory.  For each row of ``data`` whose ``A``
    and ``B`` SMILES both parse, one feature row is produced::

        [descriptors of A | descriptors of B | T | P | MOLFRC_A]

    Rows with an unparsable SMILES are reported on stdout and left out, so the
    saved arrays contain only the valid rows, in their original order, and the
    features, targets and errors of a given row always line up.

    NOTE(review): the recorded run lists entries such as 'steiger' and
    'Topliss fragments' in this descriptor file; they look like section
    headings rather than descriptor names -- confirm the calculator treats
    them as intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``A``, ``B``, ``T``, ``P``, ``MOLFRC_A``,
        ``EC_value`` and ``EC_error``.  ``EC_value`` / ``EC_error`` may be
        numbers or numeric strings.

    Returns
    -------
    None
        Results are written with ``np.savetxt`` to ``2D_descriptors_X``,
        ``2D_descriptors_Y`` and ``2D_descriptors_error``.
    """
    # `with` closes the descriptor list file even if reading fails.
    with open('Deslist_2D', 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    print('Generating 2D descriptors')
    print(Deslist)
    calc = Calculator(Deslist)

    n = data.shape[0]
    D = len(Deslist)
    d = 2 * D + 3
    X = np.zeros((n, d))
    Y = np.zeros((n, 1))
    Z = np.zeros((n, 1))

    # Iterate the columns in lockstep: this is positional, so it does not
    # depend on the frame having a default 0..n-1 index.
    rows = zip(data['A'], data['B'], data['T'], data['P'], data['MOLFRC_A'],
               data['EC_value'], data['EC_error'])
    c = 0  # number of valid rows written so far
    for i, (smi_a, smi_b, temp, pres, frac, ec_val, ec_err) in enumerate(rows):
        A = Chem.MolFromSmiles(smi_a)
        B = Chem.MolFromSmiles(smi_b)
        if A is None:
            print("%d A is invalid"%(i))
            continue
        if B is None:
            print("%d B is invalid"%(i))
            continue
        X[c, :D] = calc.CalcDescriptors(A)
        X[c, D:2*D] = calc.CalcDescriptors(B)
        # Conditions are written at the packed position `c`, next to the
        # descriptors of the same source row, so skipped rows cannot shift them.
        X[c, -3] = temp
        X[c, -2] = pres
        X[c, -1] = frac
        Y[c] = float(ec_val)
        # The error column is stored too, so the saved error file holds the
        # measured uncertainties rather than zeros.
        Z[c] = float(ec_err)
        c += 1

    # Keep only the rows that were filled; skipped rows would otherwise show
    # up as all-zero samples at the end of each file.
    np.savetxt('2D_descriptors_X', X[:c])
    np.savetxt('2D_descriptors_Y', Y[:c])
    np.savetxt('2D_descriptors_error', Z[:c])

In [9]:
# Build both descriptor tables from the cleaned frame; each call writes its
# X / Y / error text files to the current working directory.
# NOTE(review): the traceback recorded under this cell refers to a third line
# (`gen_`) that is no longer present here -- re-run to refresh the output.
gen_selec_des(data)
gen_2D_des(data)


Generating selective descriptors:
['NumHeteroatoms', 'MolWt', 'ExactMolWt', 'NOCount', 'NumHDonors', 'RingCount', 'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings']
Generating 2D descriptors
['steiger', 'Marsili Partial Charges', 'BalabanJ', 'BertzCT', 'Ipc', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3', 'Chi0', 'Chi1', 'Chi0n', 'Chi1n', 'Chi2n', 'Chi3n', 'Chi4n', 'Chi0v', 'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'MolLogP', 'MolMR', 'MolWt', 'HeavyAtomCount', 'HeavyAtomMolWt', 'NHOHCount', 'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumValenceElectrons', 'RingCount', 'TPSA', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SMR_VSA10', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'EState_VSA1', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'EState_VSA10', 'EState_VSA11', 'VSA_EState1', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'VSA_EState10', 'Topliss fragments']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-4cfc541e0004> in <module>()
      1 gen_selec_des(data)
      2 gen_2D_des(data)
----> 3 gen_

NameError: name 'gen_' is not defined

In [35]:
# Scratch check of RDKit Morgan fingerprints on a single molecule (pyridine).
m1 = Chem.MolFromSmiles('c1ccccn1')
# Morgan fingerprint with radius 2; fp1 is not used again in the visible cells.
fp1 = AllChem.GetMorganFingerprint(m1,2)
# Same radius, returned as a 1024-bit vector.
v1 = AllChem.GetMorganFingerprintAsBitVect(m1,2,nBits=1024)
# Show the class of the bit-vector object.
type(v1)


Out[35]:
rdkit.DataStructs.cDataStructs.ExplicitBitVect

In [ ]: