This is Sarah's copy. Unless your name is Sarah, do not edit this file.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator

In [3]:
#Data Cleaning
data = pd.read_excel("inputdata.xlsx")
data['EC_value'], data['EC_error'] = zip(*data['ELE_COD'].map(lambda x: x.split('±')))
data.head()


Out[3]:
NUM A B MOLFRC_A T P ELE_COD EC_value EC_error
0 1 [O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C O 0.004 298.15 101 1.166 ± 0.058 1.166 0.058
1 2 [O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C O 0.004 299.15 101 1.203 ± 0.06 1.203 0.06
2 3 [O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C O 0.004 300.15 101 1.242 ± 0.062 1.242 0.062
3 4 [O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C O 0.004 301.15 101 1.271 ± 0.064 1.271 0.064
4 5 [O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C O 0.004 302.15 101 1.289 ± 0.064 1.289 0.064

In [4]:
#Setting up for molecular descriptors
n = data.shape[0]
list_of_descriptors = ['NumHeteroatoms','MolWt','ExactMolWt','NOCount','NumHDonors','RingCount','NumAromaticRings','NumSaturatedRings','NumAliphaticRings']
calc = Calculator(list_of_descriptors)
D = len(list_of_descriptors)
d = len(list_of_descriptors)*2 + 3
print(n,d)


2523 21

In [5]:
#setting up the x and y matrices
X = np.zeros((n,d))
X[:,-3] = data['T']
X[:,-2] = data['P']
X[:,-1] = data['MOLFRC_A']
for i in range(n):
    A = Chem.MolFromSmiles(data['A'][i])
    B = Chem.MolFromSmiles(data['B'][i])
    X[i][:D]    = calc.CalcDescriptors(A)
    X[i][D:2*D] = calc.CalcDescriptors(B)

In [14]:
print(data['NUM'].values.reshape(-1,1),X.shape)  
new_x = pd.DataFrame(X)
new_x = X.append(len(X))
#new_x = np.concatenate((X, data['NUM'].values.reshape(-1,1)))
print(data['NUM'].values.reshape(-1,1),X.shape)    


new_data = pd.DataFrame(X,columns=['NUM','NumHeteroatoms_A','MolWt_A','ExactMolWt_A','NOCount_A','NumHDonors_A','RingCount_A','NumAromaticRings_A','NumSaturatedRings_A','NumAliphaticRings_A','NumHeteroatoms_B','MolWt_B','ExactMolWt_B','NOCount_B','NumHDonors_B','RingCount_B','NumAromaticRings_B','NumSaturatedRings_B','NumAliphaticRings_B','T','P','MOLFRC_A'])

y = data['EC_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


[[   1]
 [   2]
 [   3]
 ..., 
 [2635]
 [2636]
 [2637]] (2523, 21)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-76e72dcdbb35> in <module>()
      1 print(data['NUM'].values.reshape(-1,1),X.shape)
      2 new_x = pd.DataFrame(X)
----> 3 new_x = X.append(len(X))
      4 #new_x = np.concatenate((X, data['NUM'].values.reshape(-1,1)))
      5 print(data['NUM'].values.reshape(-1,1),X.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [20]:
#Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(1,), activation='relu', solver='adam', alpha=0.0001, max_iter=200, random_state=None)
mlp.fit(new_data[['NUM','NumHeteroatoms_A','MolWt_A','ExactMolWt_A','NOCount_A','NumHDonors_A','RingCount_A','NumAromaticRings_A','NumSaturatedRings_A','NumAliphaticRings_A','NumHeteroatoms_B','MolWt_B','ExactMolWt_B','NOCount_B','NumHDonors_B','RingCount_B','NumAromaticRings_B','NumSaturatedRings_B','NumAliphaticRings_B','T','P','MOLFRC_A']],y)
plt.figure(figsize=(4,4))
plt.scatter(y,mlp.predict(new_data[['NUM','NumHeteroatoms_A','MolWt_A','ExactMolWt_A','NOCount_A','NumHDonors_A','RingCount_A','NumAromaticRings_A','NumSaturatedRings_A','NumAliphaticRings_A','NumHeteroatoms_B','MolWt_B','ExactMolWt_B','NOCount_B','NumHDonors_B','RingCount_B','NumAromaticRings_B','NumSaturatedRings_B','NumAliphaticRings_B','T','P','MOLFRC_A']]))
plt.plot(lw=4,color='red')


/Users/SarahsAdventure/miniconda3/lib/python3.5/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.
  % (), ConvergenceWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-d0017407186c> in <module>()
      3 mlp.fit(new_data[['NUM','NumHeteroatoms_A','MolWt_A','ExactMolWt_A','NOCount_A','NumHDonors_A','RingCount_A','NumAromaticRings_A','NumSaturatedRings_A','NumAliphaticRings_A','NumHeteroatoms_B','MolWt_B','ExactMolWt_B','NOCount_B','NumHDonors_B','RingCount_B','NumAromaticRings_B','NumSaturatedRings_B','NumAliphaticRings_B','T','P','MOLFRC_A']],y)
      4 plt.figure(figsize=(4,4))
----> 5 plt.scatter(y,mlp.predict(new_data[['NUM','NumHeteroatoms_A','MolWt_A','ExactMolWt_A','NOCount_A','NumHDonors_A','RingCount_A','NumAromaticRings_A','NumSaturatedRings_A','NumAliphaticRings_A','NumHeteroatoms_B','MolWt_B','ExactMolWt_B','NOCount_B','NumHDonors_B','RingCount_B','NumAromaticRings_B','NumSaturatedRings_B','NumAliphaticRings_B','T','P','MOLFRC_A']]))
      6 plt.plot(lw=4,color='red')

/Users/SarahsAdventure/miniconda3/lib/python3.5/site-packages/matplotlib/pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, hold, data, **kwargs)
   3256                          vmin=vmin, vmax=vmax, alpha=alpha,
   3257                          linewidths=linewidths, verts=verts,
-> 3258                          edgecolors=edgecolors, data=data, **kwargs)
   3259     finally:
   3260         ax.hold(washold)

/Users/SarahsAdventure/miniconda3/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Users/SarahsAdventure/miniconda3/lib/python3.5/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
   3836 
   3837         # c will be unchanged unless it is the same length as x:
-> 3838         x, y, s, c = cbook.delete_masked_points(x, y, s, c)
   3839 
   3840         scales = s   # Renamed for readability below.

/Users/SarahsAdventure/miniconda3/lib/python3.5/site-packages/matplotlib/cbook.py in delete_masked_points(*args)
   1846         return ()
   1847     if (is_string_like(args[0]) or not iterable(args[0])):
-> 1848         raise ValueError("First argument must be a sequence")
   1849     nrecs = len(args[0])
   1850     margs = []

ValueError: First argument must be a sequence

In [ ]: