In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# load data

import pymc
db = pymc.database.hdf5.load('300_adaptive_3gbmodel.h5')
is_param = lambda s:'_radius' in s or '_scalingFactor' in s

# inputs
params = []
for s in db._traces:
    if is_param(s):
        params.append(db._traces[s]())
params = np.array(params).T

# outputs
targets = db.dg_gbsa()
print(params.shape,targets.shape)

# experimental data
dg_exp = db.dg_exp

# deviance = -2 * log_likelihood, so recover the log-likelihood
log_likelihood = -0.5 * db.deviance()


((770, 24), (770, 235))

In [4]:
log_likelihood.shape


Out[4]:
(770,)

In [5]:
plt.plot(log_likelihood[10:])
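# (the slice drops the first 10 samples -- presumably burn-in -- to keep the y-axis readable)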


Out[5]:
[<matplotlib.lines.Line2D at 0x10f4d79d0>]

In [6]:
# how much linear structure is there in the inputs?

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(params)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('PC #')
plt.ylabel('Cumulative explained variance')
plt.figure()
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('PC #')
plt.ylabel('Explained variance')
print('How correlated are our samples? Can we use a small subset of these molecules for testing?')


How correlated are our samples? Can we use a small subset of these molecules for testing?

In [7]:
# how much linear structure is there among our outputs?

pca = PCA()
pca.fit(targets)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('PC #')
plt.ylabel('Cumulative explained variance')
plt.figure()
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('PC #')
plt.ylabel('Explained variance')
print('How correlated are our outputs?')


How correlated are our outputs?

In [130]:
# let's maybe try to select a subset of the 235 compounds that best summarizes the behavior of the rest
# perhaps we can do this by inspecting the leading PCs?
plt.plot(pca.components_[0])


Out[130]:
[<matplotlib.lines.Line2D at 0x117546710>]

In [124]:
# hmm, let's instead use sparse PCA so that we can better interpret the components
from sklearn.decomposition import SparsePCA
spca = SparsePCA(alpha=0.9)
spca.fit(targets)


Out[124]:
SparsePCA(U_init=None, V_init=None, alpha=0.9, max_iter=1000, method='lars',
     n_components=None, n_jobs=1, random_state=None, ridge_alpha=0.01,
     tol=1e-08, verbose=False)

In [136]:
plt.plot(spca.components_[0]);
plt.xlabel('Molecule #')
plt.figure()
plt.plot(spca.components_[1]);
plt.xlabel('Molecule #')


Out[136]:
<matplotlib.text.Text at 0x117c86650>

In [29]:
# if we fit a PLS model between the inputs and outputs, how well do we do?
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression()
pls.fit(params,targets)
pls.score(params,targets)


Out[29]:
0.48881015562424535
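
In [ ]:
# nb: PLSRegression() defaults to n_components=2, which likely explains the
# low score above -- a quick scan over component counts (sketch, not run here):
for n_comp in [2, 5, 10, 20]:
    pls = PLSRegression(n_components=n_comp)
    pls.fit(params, targets)
    print(n_comp, pls.score(params, targets))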

In [8]:
# what if we fit an unregularized linear model between our inputs and outputs?
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(params,targets)
lr.score(params,targets)


Out[8]:
0.8550597958818138

In [9]:
# renaming
X = params
y = targets

In [10]:
# let's do this again except with K-fold CV
K = 10

from sklearn.cross_validation import KFold
kfold = KFold(len(params),K,shuffle=True,random_state=0)

train_scores = []
test_scores = []
mse_test = []

for train_index, test_index in kfold:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    train_scores.append(lr.score(X_train,y_train))
    test_scores.append(lr.score(X_test,y_test))
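    # nb: this "MSE" sums squared error over all 235 outputs, then averages over test samples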
    mse_test.append(np.sum((lr.predict(X_test)-y_test)**2) / len(y_test))

In [11]:
print('Train scores: {0:.3f} +/- {1:.3f}'.format(np.mean(train_scores),np.std(train_scores)))
print('Test scores: {0:.3f} +/- {1:.3f}'.format(np.mean(test_scores),np.std(test_scores)))
print('Test MSE: {0:.3f} +/- {1:.3f}'.format(np.mean(mse_test),np.std(mse_test)))


Train scores: 0.856 +/- 0.002
Test scores: 0.836 +/- 0.025
Test MSE: 97.162 +/- 7.768
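
In [ ]:
# for scale: 97.162 is summed over 235 outputs per test sample, i.e. roughly
# 0.41 MSE (RMSE ~0.64, in delta_G units) per individual prediction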

In [10]:
# hmm, that works surprisingly well -- maybe an even easier thing to do here
# would be to do Bayesian linear regression and apply a ...

In [104]:
# now let's see how much better we can do if we instead use a GP to interpolate
from sklearn.gaussian_process import GaussianProcess
gp = GaussianProcess()
gp.fit(X,y)

# ugggg what does this error even mean


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-104-67ea58460ecb> in <module>()
      1 from sklearn.gaussian_process import GaussianProcess
      2 gp = GaussianProcess()
----> 3 gp.fit(X,y)
      4 
      5 # ugggg what does this error mean

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/sklearn/gaussian_process/gaussian_process.pyc in fit(self, X, y)
    299         if (np.min(np.sum(D, axis=1)) == 0.
    300                 and self.corr != correlation.pure_nugget):
--> 301             raise Exception("Multiple input features cannot have the same"
    302                             " target value.")
    303 

Exception: Multiple input features cannot have the same target value.
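
In [ ]:
# it means two rows of X are identical: the MCMC trace repeats the previous
# sample whenever a proposal is rejected, and the legacy GaussianProcess
# refuses duplicate inputs. a sketch that drops duplicates before fitting
# (assumes numpy >= 1.13 for the axis argument to np.unique):
_, keep = np.unique(X, axis=0, return_index=True)
gp = GaussianProcess()
gp.fit(X[keep], y[keep])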

In [12]:
# using the Sheffield library
import GPy

In [8]:
# omg look at all these cool models to try
coolmodels = [GPy.models.GPKroneckerGaussianRegression,
              GPy.models.GPHeteroscedasticRegression,
              GPy.models.SparseGPRegression,
              GPy.models.SparseGPRegressionUncertainInput]

In [9]:
# but first things first -- a vanilla GP regression (note: this reuses the
# X_train/y_train left over from the last CV fold above)
gp = GPy.models.GPRegression(X_train,y_train)
gp.optimize()

In [10]:
gp.predict(X_test)[0].shape,y_test.shape


Out[10]:
((77, 235), (77, 235))

In [11]:
mse_test = []

for train_index, test_index in kfold:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gp = GPy.models.GPRegression(X_train,y_train)
    gp.optimize()
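    # same summed-over-outputs "MSE" as in the linear-regression CV above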
    mse_test.append(np.sum((gp.predict(X_test)[0]-y_test)**2) / len(y_test))

print('Test MSE: {0:.3f} +/- {1:.3f}'.format(np.mean(mse_test),np.std(mse_test)))


Test MSE: 29.545 +/- 2.850

In [ ]:
# okay, so this works about 3x better on unseen data (test MSE ~29.5 vs ~97.2) than linear regression does... wooorth iiit?

In [ ]:
# let's pick a better output metric, maybe just one difficult-to-predict target?

In [13]:
import GPyOpt

In [14]:
# for the sake of visualization, let's first look at the projection
# of X that's maximally informative about Molecule 1

y_ = y[:,0]

lr = LinearRegression()
lr.fit(X,y_)

x_ = X.dot(lr.coef_).reshape((len(y),1))
y_ = y[:,0].reshape((len(y),1))

In [15]:
plt.rc('font', family='serif')

In [96]:
plt.scatter(x_,y_,linewidths=0,alpha=0.5,label='Samples')

plt.hlines(dg_exp[0],min(x_),max(x_),linestyles='--',
           label='Experimental')
plt.xlabel('Linreg_1(X)')
plt.ylabel(r'$\Delta G$ for Molecule 1')
plt.legend(loc='best')


Out[96]:
<matplotlib.legend.Legend at 0x11a09c7d0>

In [16]:
gp = GPy.models.GPRegression(x_,y_)
gp.optimize()

pred_fun = lambda x:gp.predict(x)[0]

In [17]:
gp.plot(data_symbol='k.')
i = 0
plt.title(r'Molecule {0}, Experimental $\Delta G$={1:.3f}'.format(i+1,dg_exp[i]))
plt.xlabel(r'Parameter set $\Theta$')
plt.ylabel(r'Simulated $\Delta G_{GBSA}$')
#plt.hlines(dg_exp[0],min(x_),max(x_))
#plt.ylim(dg_exp[0]-1.0,1)


Out[17]:
<matplotlib.text.Text at 0x1111f3690>

In [25]:
i=87
y_ = y[:,i]

lr = LinearRegression()
lr.fit(X,y_)

x_ = X.dot(lr.coef_).reshape((len(y_),1))
y_ = y_.reshape((len(y_),1))

plt.figure()
gp = GPy.models.GPRegression(x_,y_)
gp.optimize()
gp.plot(data_symbol='k.')
plt.hlines(dg_exp[i],min(x_),max(x_),linestyles='--')
plt.title(r'Molecule {0}, Experimental $\Delta G$={1:.3f}'.format(i+1,dg_exp[i]))
plt.xlabel(r'Parameter set $\Theta$')
plt.ylabel(r'Simulated $\Delta G_{GBSA}$')


Out[25]:
<matplotlib.text.Text at 0x1177a2890>
<matplotlib.figure.Figure at 0x106856a90>

In [71]:
# let's generate these plots for each of the molecules in our dataset

In [125]:
for i in range(y.shape[1]):
    y_ = y[:,i]

    lr = LinearRegression()
    lr.fit(X,y_)

    x_ = X.dot(lr.coef_).reshape((len(y_),1))
    y_ = y_.reshape((len(y_),1))

    plt.figure()
    gp = GPy.models.GPRegression(x_,y_)
    gp.optimize()
    gp.plot(data_symbol='k.')
    plt.hlines(dg_exp[i],min(x_),max(x_),linestyles='--')
    plt.title(r'Molecule {0}, Experimental $\Delta G$={1:.3f}'.format(i+1,dg_exp[i]))
    plt.xlabel(r'Parameter set $\Theta$')
    plt.ylabel(r'Simulated $\Delta G_{GBSA}$')
    plt.savefig('gp_plots/molecule_{0}.jpg'.format(i),dpi=300)

    plt.close()


[output truncated: 235 figures generated, one per molecule, saved to gp_plots/molecule_{i}.jpg]

What is the function we want to optimize?

Practically, we want to find a single parameter set to use in simulations: the maximum a posteriori (MAP) estimate $\text{argmax}_\theta \, p(\theta | \mathcal{D})$.

For uncertainty quantification, we also want to characterize the full posterior distribution $p(\theta | \mathcal{D})$.
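
In [ ]:
# a minimal sketch, assuming flat priors so the MAP coincides with the
# highest-likelihood sample already in the trace:
theta_map = params[np.argmax(log_likelihood)]
print(theta_map)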


In [23]:
# for many of the above examples, the experimentally observed delta_G is outside the range
# of simulated delta_Gs

# how many? and is the relation between parameters and simulated delta_G any
# harder to fit for those molecules?

# flag molecules whose sampled range brackets the experimental value
converged = (y.max(0)>=dg_exp)*(y.min(0)<=dg_exp)

converged_scores = []
unconverged_scores = []

for i in range(len(converged)):
    y_ = y[:,i]

    lr = LinearRegression()
    lr.fit(X,y_)
    
    score = lr.score(X,y_)
    
    if converged[i]:
        converged_scores.append(score)
    else:
        unconverged_scores.append(score)

print('Linreg R^2 scores for "converged" molecules: {0:.3f} +/- {1:.3f}'.format(np.mean(converged_scores),np.std(converged_scores)))
print('Linreg R^2 scores for "unconverged" molecules: {0:.3f} +/- {1:.3f}'.format(np.mean(unconverged_scores),np.std(unconverged_scores)))


Linreg R^2 scores for "converged" molecules: 0.773 +/- 0.214
Linreg R^2 scores for "unconverged" molecules: 0.842 +/- 0.138

In [26]:
gp = GPy.models.GPRegression(x_,y_)
gp.optimize()

pred_fun = lambda x:gp.predict(x)[0]

bounds = (min(x_)-1.0,max(x_)+1.0)

# let's maybe render where the GP would suggest to sample next
myBopt = GPyOpt.methods.BayesianOptimization(f=pred_fun,            # function to optimize       
                                             bounds=bounds,        # box-constrains of the problem
                                             acquisition='EI',     # Selects the Expected improvement
                                             acquisition_par = 0)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-26-f00a171eb6be> in <module>()
     10                                              bounds=bounds,        # box-constrains of the problem
     11                                              acquisition='EI',     # Selects the Expected improvement
---> 12                                              acquisition_par = 0)

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/GPyOpt-0.0.1-py2.7.egg/GPyOpt/methods/bayesian_optimization.pyc in __init__(self, f, bounds, kernel, X, Y, model_data_init, model_optimize_interval, acquisition, acquisition_par, model_optimize_restarts, sparseGP, num_inducing, normalize, true_gradients, exact_feval, verbosity)
     63             self.model_data_init = model_data_init
     64         if X==None or Y == None:
---> 65             self.X = samples_multidimensional_uniform(self.bounds, self.model_data_init)
     66             self.Y = f(self.X)
     67         else:

/Users/joshuafass/anaconda/envs/py27/lib/python2.7/site-packages/GPyOpt-0.0.1-py2.7.egg/GPyOpt/util/general.pyc in samples_multidimensional_uniform(bounds, num_data)
     29     dim = len(bounds)
     30     Z_rand = np.zeros(shape=(num_data,dim))
---> 31     for k in range(0,dim): Z_rand[:,k] = np.random.uniform(low=bounds[k][0],high=bounds[k][1],size=num_data)
     32     return Z_rand
     33 

IndexError: index 1 is out of bounds for axis 0 with size 1
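
In [ ]:
# the IndexError: GPyOpt wants `bounds` as a list of per-dimension (low, high)
# tuples. min(x_) is a length-1 array, so the flat tuple above gets parsed as
# two one-element "dimensions" and bounds[k][1] indexes past the end.
# sketch of the fix for our 1-d problem:
bounds = [(float(min(x_)) - 1.0, float(max(x_)) + 1.0)]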

In [ ]: