In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator
In [6]:
# Data cleaning: load the spreadsheet and break each ELE_COD string apart at
# the '±' sign into a value column and an error column.
data = pd.read_excel("inputdata.xlsx")
split_pairs = data['ELE_COD'].map(lambda text: text.split('±'))
# zip(*...) transposes the per-row [value, error] pairs into two column tuples;
# it raises if the pairs do not unpack into exactly two columns.
ec_values, ec_errors = zip(*split_pairs)
data['EC_value'] = ec_values
data['EC_error'] = ec_errors
print(data.shape)
# Not the last expression of the cell, so this preview is not displayed.
data.head()
# Show the rows at positions 2523 and 2524.
print(data[2523:2525])
In [7]:
#Generate Selective Descriptors
def gen_selec_des(data):
    """Build and save the "selective" descriptor feature matrix for ``data``.

    Descriptor names are read from the file ``Deslist_selective`` (one name
    per line) and evaluated with the RDKit descriptor calculator for both
    components of every row. Each saved feature row is laid out as::

        [descriptors of A | descriptors of B | T | P | MOLFRC_A]

    Rows whose ``A`` or ``B`` SMILES cannot be parsed are reported on stdout
    and left out of the result. The saved arrays contain only the rows that
    were kept, and the T/P/MOLFRC_A values of a kept row always come from the
    same input row as its descriptors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``A`` and ``B`` (SMILES strings), ``T``,
        ``P``, ``MOLFRC_A``, ``EC_value`` and ``EC_error``.

    Returns
    -------
    None
        Results are written with ``np.savetxt`` to the files
        ``Selective_descriptors_X``, ``Selective_descriptors_Y`` and
        ``Selective_descriptors_error`` in the working directory.
    """
    # `with` guarantees the descriptor list file is closed again.
    with open('Deslist_selective', 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    print("Generating selective descriptors:")
    print(Deslist)
    calc = Calculator(Deslist)

    n = data.shape[0]
    D = len(Deslist)
    X = np.zeros((n, 2 * D + 3))
    Y = np.zeros((n, 1))
    Z = np.zeros((n, 1))

    # Pull the columns out once and index them by position, so the loop does
    # not depend on the frame carrying a default 0..n-1 index.
    smiles_a = data['A'].values
    smiles_b = data['B'].values
    conditions = data[['T', 'P', 'MOLFRC_A']].values.astype(float)
    ec_value = data['EC_value'].values
    ec_error = data['EC_error'].values

    c = 0  # number of rows kept so far == next free output row
    for i in range(n):
        A = Chem.MolFromSmiles(smiles_a[i])
        B = Chem.MolFromSmiles(smiles_b[i])
        if A is None:
            print("%d A is invalid"%(i))
            continue
        if B is None:
            print("%d B is invalid"%(i))
            continue
        X[c, :D] = calc.CalcDescriptors(A)
        X[c, D:2 * D] = calc.CalcDescriptors(B)
        # Written at output row c (not input row i) so the conditions stay
        # aligned with the descriptors once an invalid row has been skipped.
        X[c, 2 * D:] = conditions[i]
        Y[c] = float(ec_value[i])
        Z[c] = float(ec_error[i])
        c += 1

    # Drop the unused rows left behind by skipped molecules; otherwise they
    # would be saved as all-zero samples.
    np.savetxt('Selective_descriptors_X', X[:c])
    np.savetxt('Selective_descriptors_Y', Y[:c])
    np.savetxt('Selective_descriptors_error', Z[:c])
In [8]:
def gen_2D_des(data):
    """Build and save the 2D descriptor feature matrix for ``data``.

    Descriptor names are read from the file ``Deslist_2D`` (one name per
    line) and evaluated with the RDKit descriptor calculator for both
    components of every row. Each saved feature row is laid out as::

        [descriptors of A | descriptors of B | T | P | MOLFRC_A]

    Rows whose ``A`` or ``B`` SMILES cannot be parsed are reported on stdout
    and left out of the result. The saved arrays contain only the rows that
    were kept, and the T/P/MOLFRC_A values of a kept row always come from the
    same input row as its descriptors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``A`` and ``B`` (SMILES strings), ``T``,
        ``P``, ``MOLFRC_A``, ``EC_value`` and ``EC_error``.

    Returns
    -------
    None
        Results are written with ``np.savetxt`` to the files
        ``2D_descriptors_X``, ``2D_descriptors_Y`` and
        ``2D_descriptors_error`` in the working directory.
    """
    # `with` guarantees the descriptor list file is closed again.
    with open('Deslist_2D', 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    print('Generating 2D descriptors')
    print(Deslist)
    calc = Calculator(Deslist)

    n = data.shape[0]
    D = len(Deslist)
    X = np.zeros((n, 2 * D + 3))
    Y = np.zeros((n, 1))
    Z = np.zeros((n, 1))

    # Pull the columns out once and index them by position, so the loop does
    # not depend on the frame carrying a default 0..n-1 index.
    smiles_a = data['A'].values
    smiles_b = data['B'].values
    conditions = data[['T', 'P', 'MOLFRC_A']].values.astype(float)
    ec_value = data['EC_value'].values
    ec_error = data['EC_error'].values

    c = 0  # number of rows kept so far == next free output row
    for i in range(n):
        A = Chem.MolFromSmiles(smiles_a[i])
        B = Chem.MolFromSmiles(smiles_b[i])
        if A is None:
            print("%d A is invalid"%(i))
            continue
        if B is None:
            print("%d B is invalid"%(i))
            continue
        X[c, :D] = calc.CalcDescriptors(A)
        X[c, D:2 * D] = calc.CalcDescriptors(B)
        # Written at output row c (not input row i) so the conditions stay
        # aligned with the descriptors once an invalid row has been skipped.
        X[c, 2 * D:] = conditions[i]
        Y[c] = float(ec_value[i])
        # The error column was previously never filled, so the saved error
        # file contained only zeros.
        Z[c] = float(ec_error[i])
        c += 1

    # Drop the unused rows left behind by skipped molecules; otherwise they
    # would be saved as all-zero samples.
    np.savetxt('2D_descriptors_X', X[:c])
    np.savetxt('2D_descriptors_Y', Y[:c])
    np.savetxt('2D_descriptors_error', Z[:c])
In [9]:
# Run both descriptor generators, in order, on the cleaned table; each one
# writes its own output files via np.savetxt.
for generate in (gen_selec_des, gen_2D_des):
    generate(data)
In [35]:
# Scratch cell: try out RDKit Morgan fingerprints on a single molecule.
# 'c1ccccn1' is the aromatic SMILES for pyridine.
m1 = Chem.MolFromSmiles('c1ccccn1')
# Second positional argument (2) is the Morgan radius.
# NOTE(review): fp1 is not used again in the visible cells.
fp1 = AllChem.GetMorganFingerprint(m1,2)
# Same radius, requested as a fixed-length vector of 1024 bits.
v1 = AllChem.GetMorganFingerprintAsBitVect(m1,2,nBits=1024)
# Last expression of the cell, so the type of v1 becomes the cell output.
type(v1)
Out[35]:
In [ ]: