Osprey Report


Imports and convenience functions


In [ ]:
get_ipython().magic('pylab inline')
matplotlib.rcParams['figure.dpi'] = 600

import seaborn as sns
import corner

import json
from io import StringIO

import numpy as np
from pandas import DataFrame, Index
from scipy.stats import gaussian_kde

from osprey.config import Config
from msmbuilder.msm import MarkovStateModel

import matplotlib.pyplot as pp
import msmexplorer as msme

from msmreport.utils import convert_keys_to_string

rs = np.random.RandomState(42)

Retrieve osprey settings


In [ ]:
config = Config('{yaml}')
ss = config.search_space()

Retrieve optimal model parameters


In [ ]:
db = config.trials()
cmd = 'select parameters from trials_v3 where mean_test_score = (select MAX(mean_test_score) from trials_v3);'
query = db.execute(cmd)
result = query.fetchall()
params = convert_keys_to_string(json.load(StringIO(result[0][0])))
DataFrame(np.array(list(params.values()))[argsort(list(params.keys()))], Index(sort(list(params.keys())))).T

Load data and model pipeline


In [ ]:
data, labels = config.dataset()

pipeline = config.estimator()
pipeline.set_params(**params)

Fit model


In [ ]:
pipeline.fit(data)

tICA results


In [ ]:
scaler = pipeline.steps[0][1]
tica = pipeline.steps[1][1]
tica_data = tica.transform(scaler.transform(data))
_ = msme.plot_histogram(np.concatenate(tica_data, axis=0), color='oxblood', quantiles=(0.5,),
                        labels=['$tIC%s$' % i for i in range(tica.n_components)],
                        show_titles=True)

Clustering results


In [ ]:
_ = msme.plot_voronoi(pipeline.steps[2][1], xlabel='tIC0', ylabel='tIC1', cluster_centers=False)

MSM results

Plot top five timescales


In [ ]:
lt_range = ss.variables['tica__lag_time']
lag_times = np.linspace(lt_range.min, lt_range.max, 20).astype(int)

models = []
timescales = []
for lag_time in lag_times:
    model = MarkovStateModel(verbose=False, n_timescales=5,
                                     lag_time=lag_time, ergodic_cutoff=1)
    models.append(model.fit(pipeline.steps[2][1].labels_))
    timescales.append(model.timescales_)
timescales = np.array(timescales)

figsize(12, 6)
for timescale in timescales.T:
    _ = plot(lag_times, timescale)

xlabel('Lag Time\n($ns$)', size=14, labelpad=10)
ylabel('Relaxation Time ($ns$)', size=14, labelpad=10)
yscale('log')

Plot "best" timescale


In [ ]:
tidx = argmax(np.product((np.diff(timescales, axis=0) / timescales[:-1]) < 0.01, axis=1))
lag_time = lag_times[tidx]
msm = models[tidx]

_ = msme.plot_timescales(msm, ylabel=r'Relaxation Time ($ns$)')

Plot residuals


In [ ]:
_ = msme.plot_pop_resids(msm, color='tarragon')

Plot free energy landscape


In [ ]:
assigns = msm.fit_transform(pipeline.steps[2][1].labels_)
pi_0 = msm.populations_[np.concatenate(assigns, axis=0)]


# Free Energy Surface
obs = (0, 1)
ax = msme.plot_free_energy(np.concatenate(tica_data, axis=0),
                           obs=obs, n_samples=10000,
                           pi=pi_0, gridsize=100, vmax=5.,
                           n_levels=8, cut=5, xlabel='tIC0',
                           ylabel='tIC1', random_state=rs)

# MSM Network
pp.scatter(*pipeline.steps[2][1].cluster_centers_[:, obs].T,
           s = pipeline.steps[2][1].counts_ / 100.)