This is Sarah's copy. Unless your name is Sarah, do not edit this file.
In [2]:
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
In [3]:
# Data cleaning: load the spreadsheet and break 'ELE_COD' apart at the '±' sign.
data = pd.read_excel("inputdata.xlsx")
# Every 'ELE_COD' entry is split on '±'; the pieces are kept as strings
# (no numeric conversion happens here).
ele_cod_parts = data['ELE_COD'].map(lambda entry: entry.split('±'))
# Transpose the per-row pieces into two column-wise tuples.
ec_values, ec_errors = zip(*ele_cod_parts)
data['EC_value'] = ec_values
data['EC_error'] = ec_errors
data.head()
Out[3]:
In [4]:
# Molecular-descriptor setup: choose the descriptors and derive matrix sizes.
list_of_descriptors = [
    'NumHeteroatoms',
    'MolWt',
    'ExactMolWt',
    'NOCount',
    'NumHDonors',
    'RingCount',
    'NumAromaticRings',
    'NumSaturatedRings',
    'NumAliphaticRings',
]
calc = Calculator(list_of_descriptors)

# n: number of rows in the input table.
n = len(data)
# D: number of descriptors computed per molecule.
D = len(list_of_descriptors)
# d: total feature count -- D descriptors for each of the two molecules,
# plus three extra columns (T, P and MOLFRC_A are written into them later).
d = 2 * D + 3
print(n, d)
In [5]:
# Build the feature matrix X with one row per record in `data`.
# Column layout:
#   [0, D)    descriptors of molecule A
#   [D, 2*D)  descriptors of molecule B
#   -3, -2, -1  the 'T', 'P' and 'MOLFRC_A' columns of `data`
X = np.zeros((n, d))
X[:, -3] = data['T']
X[:, -2] = data['P']
X[:, -1] = data['MOLFRC_A']

# Iterate the SMILES columns positionally (rather than data['A'][i], which is
# a label lookup) so the loop still works if the DataFrame index is not 0..n-1.
for i, (smiles_a, smiles_b) in enumerate(zip(data['A'], data['B'])):
    A = Chem.MolFromSmiles(smiles_a)
    B = Chem.MolFromSmiles(smiles_b)
    # MolFromSmiles returns None when a SMILES string cannot be parsed; fail
    # here with the offending row instead of deep inside the calculator.
    if A is None or B is None:
        raise ValueError(
            "Could not parse SMILES in row {}: A={!r}, B={!r}".format(
                i, smiles_a, smiles_b
            )
        )
    X[i, :D] = calc.CalcDescriptors(A)
    X[i, D:2 * D] = calc.CalcDescriptors(B)
In [14]:
# Prepend the 'NUM' column to the feature matrix, label the columns, and make
# a train/test split.
num_column = data['NUM'].values.reshape(-1, 1)
print(num_column, X.shape)

# A NumPy array has no .append method, and the column list below has one more
# name ('NUM') than X has columns, so the NUM column must be joined on first.
# np.hstack joins column-wise (assumes 'NUM' is numeric -- TODO confirm).
new_x = np.hstack((num_column, X))
print(num_column, new_x.shape)

new_data = pd.DataFrame(
    new_x,
    columns=[
        'NUM',
        'NumHeteroatoms_A', 'MolWt_A', 'ExactMolWt_A', 'NOCount_A',
        'NumHDonors_A', 'RingCount_A', 'NumAromaticRings_A',
        'NumSaturatedRings_A', 'NumAliphaticRings_A',
        'NumHeteroatoms_B', 'MolWt_B', 'ExactMolWt_B', 'NOCount_B',
        'NumHDonors_B', 'RingCount_B', 'NumAromaticRings_B',
        'NumSaturatedRings_B', 'NumAliphaticRings_B',
        'T', 'P', 'MOLFRC_A',
    ],
)

# Target: the part of 'ELE_COD' before the '±' sign (still stored as strings).
y = data['EC_value']
# The split is made on X (without the NUM column), as in the original cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
In [20]:
# Neural network: fit an MLP to the EC values and draw a parity plot.
feature_columns = [
    'NUM',
    'NumHeteroatoms_A', 'MolWt_A', 'ExactMolWt_A', 'NOCount_A',
    'NumHDonors_A', 'RingCount_A', 'NumAromaticRings_A',
    'NumSaturatedRings_A', 'NumAliphaticRings_A',
    'NumHeteroatoms_B', 'MolWt_B', 'ExactMolWt_B', 'NOCount_B',
    'NumHDonors_B', 'RingCount_B', 'NumAromaticRings_B',
    'NumSaturatedRings_B', 'NumAliphaticRings_B',
    'T', 'P', 'MOLFRC_A',
]
features = new_data[feature_columns]

# y holds the text before '±' in 'ELE_COD', i.e. strings. Convert to floats so
# the target is treated as a continuous quantity rather than as class labels
# (assumes every entry parses as a number -- TODO confirm against the data).
y_true = y.astype(float)

# A regressor is used because the target is continuous; with a classifier
# every distinct EC value would become its own class.
mlp = MLPRegressor(hidden_layer_sizes=(1,), activation='relu', solver='adam', alpha=0.0001, max_iter=200, random_state=None)
# NOTE(review): the model is fit and evaluated on the full dataset, as in the
# original cell; X_train/X_test from the split are not used here.
mlp.fit(features, y_true)
y_pred = mlp.predict(features)

plt.figure(figsize=(4, 4))
plt.scatter(y_true, y_pred)
# Parity line (predicted == actual). The original plt.plot call passed no
# data, so nothing was drawn.
low = min(y_true.min(), y_pred.min())
high = max(y_true.max(), y_pred.max())
plt.plot([low, high], [low, high], lw=4, color='red')
In [ ]: