Introduction to Weighted Ensemble Data Analysis in wepy

By: Samuel Lotz

Before we move on to the interesting parts, we first set up some boilerplate for running the tutorial.


In [1]:
import os
import os.path as osp
import random as rand
from pathlib import Path
import shutil as sh

import warnings

from PIL import Image

warnings.filterwarnings("ignore")

rand.seed(33)

## Inputs and Outputs
input_dir = Path("input")
outputs_dir = Path("_output")

media_dir = input_dir / 'media'

sh.rmtree(outputs_dir, ignore_errors=True)
os.makedirs(outputs_dir, exist_ok=True)


def rescale(image, factor):
    """Helper function to show the images at the right scale"""
    new_shape = [int(dim * factor) for dim in image.size]
    new_image = image.resize(new_shape, Image.LANCZOS)
    
    return new_image

# load all the images
CSN_workflow_longway_im = Image.open(media_dir / "CSN_workflow_longway.png")
walker_history_im = Image.open(media_dir / "walker_history_schematic.png")
sim_manager_dataflow_im = Image.open(media_dir / "sim_manager_dataflow.png")

trad_md_windows_im = Image.open(media_dir / "trad_MD_windows.png")
we_md_windows_im = Image.open(media_dir / "we_MD_windows.png")
continuation_runs_im = Image.open(media_dir / "continuation_runs.png")
contig_tree_im = Image.open(media_dir / "contig_tree.png")
CSN_workflow_shortway_im = Image.open(media_dir / "CSN_workflow_shortway.png")
linking_files_im = Image.open(media_dir / "linking_files.png")

Running the Simulations

Here we run some simulations so that we can showcase combining them for analysis. Since this tutorial isn't about running simulations we won't discuss this part much.

This first block sets up all the parameters and components for the simulations. We will reuse these across multiple simulations, changing only the reporters.


In [2]:
import sys
from copy import copy, deepcopy
import os
import os.path as osp
import pickle

import numpy as np
import scipy.spatial.distance as scidist

import simtk.openmm.app as omma
import simtk.openmm as omm
import simtk.unit as unit

from openmm_systems.test_systems import LennardJonesPair
import mdtraj as mdj
from wepy.util.mdtraj import mdtraj_to_json_topology

from wepy.sim_manager import Manager

from wepy.resampling.distances.distance import Distance
from wepy.resampling.resamplers.wexplore import WExploreResampler
from wepy.walker import Walker
from wepy.runners.openmm import OpenMMRunner, OpenMMState
from wepy.runners.openmm import UNIT_NAMES, GET_STATE_KWARG_DEFAULTS
from wepy.work_mapper.mapper import Mapper
from wepy.boundary_conditions.unbinding import UnbindingBC
from wepy.reporter.hdf5 import WepyHDF5Reporter

from wepy.hdf5 import WepyHDF5



## PARAMETERS

# we use the Reference platform because this is just a test
PLATFORM = 'Reference'

# Langevin Integrator
TEMPERATURE = 300.0*unit.kelvin
FRICTION_COEFFICIENT = 1/unit.picosecond
# step size of time integrations
STEP_SIZE = 0.002*unit.picoseconds

# Resampler parameters

# the maximum weight allowed for a walker
PMAX = 0.5
# the minimum weight allowed for a walker
PMIN = 1e-12

# the maximum number of regions allowed under each parent region
MAX_N_REGIONS = (10, 10, 10, 10)

# the maximum size of regions, new regions will be created if a walker
# is beyond this distance from each voronoi image unless there is an
# already maximal number of regions
MAX_REGION_SIZES = (1, 0.5, .35, .25) # nanometers

# boundary condition parameters

# maximum distance between any atom of the ligand and any other atom
# of the protein; if the shortest such atom-atom distance is larger
# than this the ligand will be considered unbound and restarted in
# the initial state
CUTOFF_DISTANCE = 1.0 # nm

# reporting parameters

# these are the properties of the states (i.e. from OpenMM) which will
# be saved into the HDF5
SAVE_FIELDS = ('positions', 'box_vectors', 'velocities')

# make a dictionary of units for adding to the HDF5
units = dict(UNIT_NAMES)



## System

# make the test system
test_sys = LennardJonesPair()

## Molecular Topology

mdtraj_topology = mdj.Topology.from_openmm(test_sys.topology)

json_str_top = mdtraj_to_json_topology(mdtraj_topology)

## Runner


# make the integrator
integrator = omm.LangevinIntegrator(TEMPERATURE, 
                                    FRICTION_COEFFICIENT,
                                    STEP_SIZE)

# make a context and set the positions
context = omm.Context(test_sys.system, 
                      copy(integrator))
context.setPositions(test_sys.positions)

# get the data from this context so we have a state to start the
# simulation with
get_state_kwargs = dict(GET_STATE_KWARG_DEFAULTS)
init_sim_state = context.getState(**get_state_kwargs)
init_state = OpenMMState(init_sim_state)

# initialize the runner
runner = OpenMMRunner(test_sys.system, 
                      test_sys.topology, 
                      integrator, 
                      platform=PLATFORM)

## Distance Metric
# we define a simple distance metric for this system, assuming the
# positions are in a 'positions' field
class PairDistance(Distance):

    def __init__(self, metric=scidist.euclidean):
        self.metric = metric

    def image(self, state):
        return state['positions']

    def image_distance(self, image_a, image_b):
        dist_a = self.metric(image_a[0], image_a[1])
        dist_b = self.metric(image_b[0], image_b[1])

        return np.abs(dist_a - dist_b)


# make a distance object which can be used to compute the distance
# between two walkers, for our scorer class
distance = PairDistance()

## Resampler
resampler = WExploreResampler(distance=distance,
                               init_state=init_state,
                               max_region_sizes=MAX_REGION_SIZES,
                               max_n_regions=MAX_N_REGIONS,
                               pmin=PMIN, pmax=PMAX)

## Boundary Conditions


# initialize the unbinding boundary conditions
ubc = UnbindingBC(cutoff_distance=CUTOFF_DISTANCE,
                  initial_state=init_state,
                  topology=json_str_top,
                  ligand_idxs=np.array(test_sys.ligand_indices),
                  receptor_idxs=np.array(test_sys.receptor_indices))


## Work Mapper

# a simple work mapper
mapper = Mapper()

## initial walkers
n_walkers = 4
init_weight = 1.0 / n_walkers
init_walkers = [Walker(OpenMMState(init_sim_state), init_weight) for i in range(n_walkers)]

## run parameters
n_cycles = 100
n_steps = 1000
# steps for each cycle
steps = [n_steps for i in range(n_cycles)]

First Simulation

Nothing fancy here, we just make a reporter that will generate the WepyHDF5 file. We need to deepcopy all the components because we will be reusing the initial state for the other simulations, and passing them by reference here would mutate them. The Manager doesn't copy them automatically for you since that can be expensive in some situations.


In [3]:
run1_hdf5_reporter = WepyHDF5Reporter(
    file_path=str(outputs_dir / "results_run1.wepy.h5"),
    mode='w',
    save_fields=SAVE_FIELDS,
    resampler=resampler,
    boundary_conditions=ubc,
    topology=json_str_top,
    units=units)

sim_manager_1 = Manager(deepcopy(init_walkers),
                        runner=deepcopy(runner),
                        resampler=deepcopy(resampler),
                        boundary_conditions=deepcopy(ubc),
                        work_mapper=deepcopy(mapper),
                        reporters=[run1_hdf5_reporter]
                    )
(run1_walkers, 
 (run1_runner, run1_bc, run1_resampler)) = \
             sim_manager_1.run_simulation(n_cycles, steps)

Second Simulation

Same as the first one but we just want a separate dataset so we can demonstrate how to aggregate independently generated datasets later.


In [4]:
run2_hdf5_reporter = WepyHDF5Reporter(
    file_path=str(outputs_dir / "results_run2.wepy.h5"),
    mode='w',
    save_fields=SAVE_FIELDS,
    resampler=resampler,
    boundary_conditions=ubc,
    topology=json_str_top,
    units=units)

# run a second, independent simulation from the same initial conditions
sim_manager_2 = Manager(deepcopy(init_walkers),
                        runner=deepcopy(runner),
                        resampler=deepcopy(resampler),
                        boundary_conditions=deepcopy(ubc),
                        work_mapper=deepcopy(mapper),
                        reporters=[run2_hdf5_reporter]
                    )
(run2_walkers, 
 (run2_runner, run2_bc, run2_resampler)) = \
             sim_manager_2.run_simulation(n_cycles, steps)

Third Simulation

For this simulation we will continue the second simulation from its outputs. While this isn't something you would likely do in practice all in one script, it is something that must be dealt with when running very long simulations on machines that restrict how long jobs can run. To support this more robustly there is the orchestration module, which we encourage you to check out.


In [5]:
run3_hdf5_reporter = WepyHDF5Reporter(
    file_path=str(outputs_dir / "results_run3.wepy.h5"),
    mode='w',
    save_fields=SAVE_FIELDS,
    resampler=run2_resampler,
    boundary_conditions=run2_bc,
    topology=json_str_top,
    units=units)

# continue the second simulation from its components
sim_manager_3 = Manager(deepcopy(init_walkers),
                        runner=deepcopy(run2_runner),
                        resampler=deepcopy(run2_resampler),
                        boundary_conditions=deepcopy(run2_bc),
                        work_mapper=deepcopy(mapper),
                        reporters=[run3_hdf5_reporter]
                    )
(run3_walkers, 
 (run3_runner, run3_bc, run3_resampler)) = \
             sim_manager_3.run_simulation(n_cycles, steps)

Analysis Workflow

Now that we have generated all of our simulation data we can start analyzing it.

We will start with a workflow something like this, where on one branch (green) we want to do some dimensionality reduction and cluster the states we collected into "macrostates".

On the other branch (blue, red, purple) we will do the necessary bookkeeping to get the walker family tree taking into account both the cloning & merging as well as any walkers that have warped through boundary conditions.

In the end we can combine these states with the walker family tree to calculate the transition probabilities between those macrostates. The end result is similar to a Markov State Model (MSM), which we call a Conformation State Network (CSN). The distinction merely reflects some rather rigorous requirements that a true Markov State Model must satisfy, which we aren't directly attempting to verify here.


In [6]:
rescale(CSN_workflow_longway_im, 1.0)


Out[6]:

Calculating Observables

In order to do the macrostate clustering we need to calculate some interesting observables over the state data. You could use absolute positions of the particles or the energies or something that is calculated on-the-fly by the simulations but we want to showcase how to do this yourself. For a real molecular system this might be the solvent accessible surface area or the radius of gyration.

Because our example is very simple (just two particles) there isn't much we can do, so we'll just calculate the Euclidean distance between the two particles. Not coincidentally, this is also the distance metric we used during the simulation. We are using this distance for a similar purpose (clustering), except that while the resampler (WExplore) did this "on-the-fly", we now have all of the simulation data at once and so will likely get better boundaries between clusters than the resampler did.

To do this we have to define a function, pair_dist_obs, of a particular form that we can then pass to the WepyHDF5.compute_observable function, which maps it over the data. Since this is a small simulation with small molecules we can do it in memory in a single process. Real simulation data can be much larger; for that you are free to implement your own computational strategies using the lower-level API, or check whether the methods in the wepy.analysis.distributed module, which uses the distributed computing framework dask, are suitable.


In [7]:
def pair_dist_obs(fields_d, *args, **kwargs):
    atomA_coords = fields_d['positions'][:,0]
    atomB_coords = fields_d['positions'][:,1]
    
    dists = np.array([np.array([scidist.euclidean(atomA_coords[i],
                                        atomB_coords[i])])
                      for i in range(atomA_coords.shape[0])
                     ])
    return dists


wepy1 = WepyHDF5(outputs_dir / 'results_run1.wepy.h5', mode='r+')
with wepy1:

    # compute the observable with the function 
    # and automatically saving it as an extra trajectory field
    obs = wepy1.compute_observable(pair_dist_obs, 
                                   ['positions'],
                                   (),
                                   save_to_hdf5='pair_dist',
                                   return_results=True
                                  )

When we run compute_observable it both returns the values and saves them into the HDF5 file. Depending on your workflow you may favor one over the other. In the end having the observables saved in the same HDF5 file makes indexing them easier and allows for some fancy queries when you rearrange the data into networks and trees.

We can see what the shape of this data is and that it matches the number of walkers (4) and cycles (100) we ran the simulation with.

We can also see that the computed feature is a 1-D feature vector. We retain the rank of a vector rather than reducing to a scalar because most machine learning pipelines expect a vector, even if in this case it is only of length one.


In [8]:
print("number of walkers:", len(obs))
print("number of cycles:", obs[0].shape[0])
print("feature vector shape:", obs[0].shape[1:])


number of walkers: 4
number of cycles: 100
feature vector shape: (1,)

Clustering Simulation Samples

For a larger dataset, clustering can be significantly more involved: it will likely need to be done with specialized clustering algorithms (like the KCenters algorithm implemented in MSMBuilder), and you will probably want to partition your data into training and test sets to validate your model.

For our purposes here we will just use a simple, fast clustering method from sklearn, the MiniBatchKMeans algorithm, so you can appreciate the mechanism and so we have some macrostate labels to use in the later analyses.

First we have to get our data as a uniform array of feature vectors, which is as simple as concatenating the features for each walker trajectory together:


In [9]:
with wepy1:
    features = np.concatenate([wepy1.get_traj_field(0, i, 'observables/pair_dist')
                               for i in range(wepy1.num_run_trajs(0))])
    
print(features.shape)


(400, 1)
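
As mentioned above, for a real model you would want some validation. Here is a minimal sketch (not part of the original workflow) of holding out a test set and scoring the clustering with a silhouette score; the split fraction, cluster count, and scoring metric are illustrative choices, not recommendations.


In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

# hold out a quarter of the feature vectors for validation
train_feats, test_feats = train_test_split(features, test_size=0.25, random_state=1)

# fit the clustering on the training set only
val_clf = MiniBatchKMeans(n_clusters=4, batch_size=10, random_state=1)
val_clf.fit(train_feats)

# assign the held-out samples to clusters and score the separation;
# silhouette values near 1 indicate well-separated clusters
test_labels = val_clf.predict(test_feats)
print("test silhouette score:", silhouette_score(test_feats, test_labels))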

Then we can choose our clustering hyperparameters and train the model:


In [10]:
from sklearn.cluster import MiniBatchKMeans

clf = MiniBatchKMeans(n_clusters=4,
                      batch_size=10,
                      random_state=1)
clf.fit(features)
print(clf.labels_.shape)
print(clf.labels_[0:10])


(400,)
[3 1 3 1 1 2 0 2 0 3]

Since we have a trained model and no separate test set, we can just take the cluster labels already assigned to the feature vectors and attach them to the simulation dataset as their own observable. To do this we destructure the feature set back into trajectory shapes and use the add_observable method on our WepyHDF5 dataset:


In [11]:
with wepy1:
    
    # destructure the features
    obs_trajs = []
    start_idx = 0
    for traj_idx in range(wepy1.num_run_trajs(0)):
        num_traj_frames = wepy1.num_traj_frames(0, traj_idx)
        
        obs_trajs.append(clf.labels_[start_idx : start_idx + num_traj_frames])

        start_idx += num_traj_frames
        
    print("observables shape:", len(obs_trajs), len(obs_trajs[0]))        
    
    # add it as an observable
    wepy1.add_observable('minibatch-kmeans_pair-dist_4_10_1',
                        [obs_trajs])


observables shape: 4 100

That's it for clustering!

For now we will leave it as is and do more things with the macrostate networks once we calculate the transition probabilities.

Just remember that even though we are dealing with data in nice big chunks that we even call "trajs" or "trajectories", they are not true trajectories! The name "trajectory" is simply a convenient name, familiar to the community, for a collection of molecular dynamics frames, i.e. a slice of data containing positions, box vectors, and potentially velocities and energies.

A single trajectory may be linear in portions where cloning and merging did not occur, but elsewhere cloned walkers will be swapped in and out along the "trajectory".

However, if we ran a simulation in which no cloning and merging occurs (say with the NoResampler) then the trajectories would indeed be valid continuous trajectories. Strictly speaking, nothing in wepy enforces this, and in reality resamplers are free to mangle the contiguity of each trajectory. That is, never rely on the well-orderedness of trajectories in WepyHDF5 runs!

The following sections show the correct approach and how to extract contiguous trajectories from walker trees.

Computing Macrostate Transition Counts & Probabilities

Now that we have computed the observables and clustered them we can compute the transitions between those macrostates. The first step is a purely mechanical process and is just book-keeping about where and when walkers were cloned and merged. You can think of it as just making the family lineage. During the simulation the individual clones and merges are recorded as distinct events, like individual facts. Now we have to go back through this and assemble it into a tree-like data structure.

In the second revision of this workflow we will show a higher-level way to do this which also makes use of multiple branching simulations. For now we do it the semi-manual way: transforming the "resampling records" into a "resampling panel" (the run_resampling_panel method), converting that into a "parent panel" (the parent_panel function), and then condensing it to a "net parent table" (the net_parent_table function). Finally we add a mask to this table to account for any walkers that warped through the boundary conditions (the parent_table_discontinuities function). This is necessary for getting transition probabilities right, since movement through a boundary acts like a probability sink and doesn't represent a physical transition.

The resampler.DECISION and UnbindingBC classes are simply the ones we used in the simulations above. These classes contain methods that allow for proper accounting to be done. The resampler.DECISION is just MultiCloneMergeDecision.


In [32]:
from wepy.analysis.parents import (
    parent_panel, 
    net_parent_table, 
    parent_table_discontinuities,
)

with wepy1:
    # make a parent matrix from the hdf5 resampling records for run 0
    resampling_panel = wepy1.run_resampling_panel(0)
    
    # get the warping records
    warping_records = wepy1.warping_records([0])
    
par_panel = parent_panel(
    resampler.DECISION, 
    resampling_panel)

parent_table = net_parent_table(par_panel)

parent_table_discs = parent_table_discontinuities(
    UnbindingBC,
    parent_table, 
    warping_records
)

Side Notes on Lineages and Continuous Trajectories

Now that we have the entire table (which represents the walker tree) we can find contiguous trajectories from it. One way to do this is via the ancestors function, which works on the parent table. We just need to choose a walker at a certain point in time and it will retrieve the entire history of that walker up until that point.

In this example I picked one of the walkers at the end of the simulation and showed the first few entries from the resulting "trace". Traces in wepy are essentially collections of complex indices on the runs, trajectories, and frames (and also contigs which will be discussed later) in the data. Because these traces are untyped it can sometimes be confusing as to what the indices are. Be sure to pay close attention to the documentation which should tell you. There are also some disambiguations in the glossary as to the different types of traces.

Also pay attention to the fact that we are using the parent_table, not parent_table_discs. A discontinuous warp breaks a lineage from the ancestors function.


In [33]:
from wepy.analysis.parents import ancestors

lineage_trace = ancestors(parent_table,
                          100,
                          3
                         )

print(lineage_trace[0:3])


[(3, 0), (3, 1), (3, 2)]

In this trace the tuples are complex indices whose values are (traj_idx, cycle_idx), which uniquely identifies a walker at a single point in time relative to the parent table. Because the parent table was created straight from run 0, these indices are valid for that run.

Using this information we can extract the fields of these walkers as a collection of arrays. For this kind of trace we use get_run_trace_fields since we are only concerned with a single run. Here we are getting the weights of the walkers and the pair distance we calculated earlier.


In [34]:
with wepy1:
    lineage_fields = wepy1.get_run_trace_fields(0, 
                                                lineage_trace[:-1],
                                               ['weights',
                                               'observables/pair_dist'])
print("weights:")
print(lineage_fields['weights'][0:3])
print("LJ-pair distance:")
print(lineage_fields['observables/pair_dist'][0:3])


weights:
[[0.25]
 [0.25]
 [0.25]]
LJ-pair distance:
[[0.36995744]
 [0.39919709]
 [0.37443133]]

Notice that we are excluding the last entry in the trace. Is this an error? It is not, but sometimes the indexing can be confusing. When trying to debug these kinds of issues it can be very useful to have the following two diagrams handy as a reference for the order of events taking place in the simulation.

The first is a schematic that focuses on the generation and inheritance of walker states at an abstract level. In this diagram the colors indicate unique states and the boxes they sit in represent the walker index, which is the trajectory index in a WepyHDF5 run.

This figure is particularly helpful in understanding things like the parent table, especially when combined with boundary conditions. The dot and slash on the boundary condition lines indicate that boundary warping can be continuous or discontinuous. As you can see, the slash is a discontinuous event because the purple state is replaced with the original white state, indicating a source-sink non-equilibrium cycle. The dotted line in the resampling portion indicates that walker 1 was merged into walker 2 (AKA "squashed"), donating its weight to it but ending the lineage of its state.


In [35]:
rescale(walker_history_im, 1.0)


Out[35]:

In this diagram we show the actual flow of data in the wepy simulation manager and components.


In [36]:
rescale(sim_manager_dataflow_im, 0.65)


Out[36]:

To explain the wrinkle in the trace indexing above, consider that the parent table was generated from all of the resampling records available. Resampling occurs after dynamics is finished and, as the name implies, generates no novel samples. So rather than recording frames twice in the HDF5 (before and after resampling) we simply record the resampling actions that were taken.

When we ran the simulation above, the walkers returned from run_simulation were the resampled walkers, but those stored in the HDF5 are the ones directly produced by the runner. This way we never lose any sampled data, even if a walker is subsequently warped or merged away.
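
To make the off-by-one concrete, here is a small check (a sketch, using only methods shown earlier) comparing the length of the lineage trace with the number of frames actually stored for a trajectory in run 0; the extra trace entry corresponds to the resampled walker at the final cycle, which has no stored frame of its own.


In [ ]:
# the trace should have one more entry than there are stored frames,
# which is why we sliced it with [:-1] above
with wepy1:
    n_stored_frames = wepy1.num_traj_frames(0, 3)

print("entries in the lineage trace:", len(lineage_trace))
print("frames stored for trajectory 3 of run 0:", n_stored_frames)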

Because wepy is focused on molecular dynamics simulations we also provide a method for generating mdtraj trajectories from traces. This is hugely useful for quickly converting to a more common format that can be read by molecular structure viewers like VMD, PyMOL, or Chimera. It relies on the topology of the molecule also being stored in the HDF5 file (currently as a JSON format string). But know that you never need to rely on this method to get the data needed for this kind of conversion, i.e. positions and box vectors.


In [37]:
with wepy1:
    mdj_traj = wepy1.run_trace_to_mdtraj(0, 
                                         lineage_trace[:-1])
print(mdj_traj)
# save one of the frames as a PDB as a reference topology
mdj_traj[0].save_pdb(str(outputs_dir / 'lj-pair.pdb'))
# save the lineage as a DCD trajectory
mdj_traj.save_dcd(str(outputs_dir / 'lj-pair_walker_lineage.dcd'))


<mdtraj.Trajectory with 100 frames, 2 atoms, 2 residues, and unitcells>
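
As noted above, the mdtraj conversion is only a convenience; a minimal sketch of pulling the same underlying data (positions and box vectors) directly with get_run_trace_fields:


In [ ]:
# the raw data behind the mdtraj conversion is always available directly
with wepy1:
    frame_fields = wepy1.get_run_trace_fields(0,
                                              lineage_trace[:-1],
                                              ['positions', 'box_vectors'])

print("positions shape:", frame_fields['positions'].shape)
print("box vectors shape:", frame_fields['box_vectors'].shape)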

Also note that these trajectories do not include the actual initial structures that started the simulations. These too can be accessed if you need them via the initial_walker_fields and initial_walkers_to_mdtraj methods.

Back to calculating Transition Probabilities

Now that we have a better idea of the continuity of walker trajectories in our simulation data we can turn our attention back to generating transition probabilities between our labelled macrostates.

The simplest way to do this is to go through the continuous trajectories, take each pair of states A -> B, look at the labels, and tally the macrostate transition counts.

Naively it seems our job is done. However, things get complicated when we want to consider lag times over the trajectories; that is, recording not just the A -> B transition but the A -> C transition given A -> B -> C. This is desirable for a number of reasons when computing probabilities with stochastic methods like Markov State Models, where we want memoryless transitions.

Doing this over trees of trajectories is not as trivial. First consider what a lag time of 3 would look like for linear MD.
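
As a plain-Python illustration (not the wepy implementation), the windows for a single linear trajectory are just consecutive runs of frame indices, and each window contributes one lagged transition between its first and last frame:


In [ ]:
# sliding windows of length 3 over a linear 8-frame trajectory
n_frames = 8
window_length = 3
windows = [tuple(range(i, i + window_length))
           for i in range(n_frames - window_length + 1)]
print(windows)

# each window contributes one lagged transition: first frame -> last frame
transitions = [(w[0], w[-1]) for w in windows]
print(transitions)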


In [38]:
rescale(trad_md_windows_im, 0.6)


Out[38]:

Now consider a branchpoint in a tree of trajectories. We must be very careful not to double count or undercount transitions.


In [39]:
rescale(we_md_windows_im, 0.6)


Out[39]:

The algorithm to achieve this has been implemented in the sliding_window function which generates traces corresponding to the red segments in the above figures.


In [42]:
from wepy.analysis.parents import sliding_window
from wepy.analysis.transitions import run_transition_probability_matrix

# use the parent matrix to generate the sliding windows
windows = list(sliding_window(np.array(parent_table_discs), 10))

These window traces can then be combined with cluster labels (as an observable) to produce a transition probability matrix directly. See the wepy.analysis.transitions module for other similar functions if you want to control how you are aggregating forward and backward transitions.


In [44]:
# make the transition matrix from the windows
with wepy1:
    transprob_mat = run_transition_probability_matrix(
        wepy1, 
        0,
        "observables/minibatch-kmeans_pair-dist_4_10_1",
        windows)

print(transprob_mat)


[[0.26923077 0.27586207 0.28571429 0.22727273]
 [0.34615385 0.27586207 0.35714286 0.36363636]
 [0.23076923 0.13793103 0.21428571 0.13636364]
 [0.15384615 0.31034483 0.14285714 0.27272727]]
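
As a quick sanity check: judging from the output above the matrix is column-stochastic, i.e. entry (i, j) is the probability of moving to macrostate i from macrostate j, so each column should sum to one.


In [ ]:
# each column should sum to (approximately) one
print(transprob_mat.sum(axis=0))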

Contig Trees

The final step before we can construct a network of macrostates is to combine everything into a ContigTree abstract data type. In early prototypes we supported constructing networks from both a ContigTree and a parent-table/transition-matrix, but this was cumbersome and confusing, so we consolidated the interface to use only the ContigTree, as it is more general, more powerful, and easier to use.

If you have fastidiously followed along up to this point you may be disappointed to learn that most of these steps are not strictly needed when using the ContigTree. However, these data structures are still used in certain contexts and are useful for understanding all the layers of structure.

So let's update our original workflow to see where the ContigTree fits in.


In [45]:
rescale(CSN_workflow_shortway_im, 1.0)


Out[45]:

As you can see, most of those complex intermediary formats are folded into the ContigTree representation. The ContigTree is a combination of a network data structure (networkx.DiGraph) and the WepyHDF5 data.

In this tree each node is a single cycle of a simulation. Thus it can express not only ensembles of walkers cloning and merging but also whole simulations of ensembles that branch. It's like there is a tree inside of a tree. Technically they are forests as well, since there are often multiple roots. This tree-inside-a-tree is more complexity than we need at the moment, so we will defer discussing it until we analyze multiple simulations.

What we want is to get back to the part where we had a transition probability matrix and combine that with our macrostate labels to make a CSN.

So we simply construct the ContigTree with our dataset and the classes that describe the cloning & merging and boundary conditions like we did before.


In [63]:
from wepy.analysis.contig_tree import ContigTree
contigtree = ContigTree(wepy1, 
                         decision_class=resampler.DECISION,
                         boundary_condition_class=UnbindingBC,
                        )

In [64]:
resampler.DECISION.ENUM.SQUASH


Out[64]:
<CloneMergeDecisionEnum.SQUASH: 3>

What have we done? Well, the ContigTree has an extensive API, so let's focus on a few interesting parts before we get to the transition probabilities.

One of the main features is a richer set of functions for generating commonly useful continuous trajectories.

Here we can get the indices (as unordered traces) for:

  • all walkers that were warped
  • all of the final walkers in the simulation
  • all of the squashed walkers

In [70]:
warp_events = contigtree.warp_trace()
final_walkers = contigtree.final_trace()
squashed_walkers = contigtree.resampling_trace(resampler.DECISION.ENUM.SQUASH)

And then we can take these events and generate all of the full lineages for them in one line.


In [76]:
squashed_lineages = contigtree.lineages(squashed_walkers)
print(squashed_lineages)


[]

Since we only have a single run our contig tree is really just a single contig which we can get with this little bit of magic:


In [58]:
with contigtree:
    contig = contigtree.make_contig(contigtree.spanning_contig_traces()[0])
    
print(contig)


<wepy.analysis.contig_tree.Contig object at 0x7f77633def50>

This Contig object is a subclass of the ContigTree but isn't a tree. This restriction opens a few more doors to us that we will use, but for now it makes the abstraction simpler.


In [ ]:
contigtree.sliding_windows(3)[-5:-1]

Multi-Run Data Analysis

So far we have assumed nice contiguous data. This is not so with MD data in general which may be split up into different runs.

If you've checked out at the thought of meta-trees, don't worry: you don't really have to think about it. The benefit, however, is that if the need ever arises and you have multiple simulations run from a single intermediary checkpoint, you won't have to throw out one continuation or the other.


In [47]:
rescale(continuation_runs_im, 0.8)


Out[47]:

We want to be able to get transitions between the original run and all of its continuations without double counting.

Contig Tree

The contig tree is a tree (technically a forest) of continuous simulations. For us the branchings happen when a run is continued multiple times.


In [46]:
rescale(contig_tree_im, 0.85)


Out[46]:

The Spanning Contigs are the contigs drawn before: (0,3) (0,4) (0,5) (1,) (2,)

Storage Implementation: Continuations

The only thing that must exist is a specification of the continuations that exist in a WepyHDF5 file.


In [ ]:
print("Runs in this file:", wepy1.run_idxs)
print("Continuations using the API method:\n", wepy1.continuations)
print("Where it is actually stored in the HDF5:\n", wepy1.h5['_settings/continuations'][:])

Run 1 continues run 0, and run 2 continues run 0.

This only works within a single file (but we can do interfile linking).
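
As a minimal sketch of interpreting that specification, assuming each row is a (continuing run, continued run) pair as described above (for a file with no continuations, such as our single-run results_run1 file, this prints nothing):


In [ ]:
# interpret each continuation record; rows are assumed to be
# (continuing_run_idx, continued_run_idx) pairs
with wepy1:
    for continuing_run, continued_run in wepy1.continuations:
        print("run {} continues run {}".format(continuing_run, continued_run))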

API for interacting with continuations: Contigs

A contig is a list of runs that form a contiguous dataset.

For the continuation (1,0) the contig is (0,1).

Contigs can be any number of runs.


In [ ]:
print("Contig {} has {} frames".format([0], wepy1.contig_n_cycles([0])))
print("Contig {} has {} frames".format([1], wepy1.contig_n_cycles([1])))
print("Contig {} has {} frames".format([2], wepy1.contig_n_cycles([2])))
print("Contig {} has {} frames".format([0,1], wepy1.contig_n_cycles([0,1])))
print("Contig {} has {} frames".format([0,2], wepy1.contig_n_cycles([0,2])))
#wepy1.resampling_records_dataframe([0,1])

A spanning contig is a contig which goes from a root run (a run that does not continue another run) to a leaf run (a run which is not continued). These can be enumerated:


In [ ]:
spanning_contigs = wepy1.spanning_contigs()
print("The spanning contigs:", spanning_contigs)

The Contig Tree Class

Since we have given the ContigTree everything it needs to make the parent table from the previous example, it can automate all of that, using the appropriate sliding-window algorithms across multiple runs!
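
For example, here is a minimal sketch reusing the contigtree we built earlier; with a multi-run file the very same calls would generate windows that span the continuations.


In [ ]:
# the ContigTree handles the parent-table bookkeeping internally, so
# sliding windows over all (possibly branching) runs are a single call
multi_windows = contigtree.sliding_windows(10)
print("number of windows:", len(multi_windows))
print("first window:", multi_windows[0])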

The Macro-State Network Class

In order to make a Conformation State Network (CSN) we need the transitions (the end-points of the sliding windows) and the labels of the micro-states (the frames of the simulations), which can come from clustering or anything else.

Because a ContigTree can, in its most degenerate form, be a single run, it is the most appropriate input for a general macro-state network.


In [ ]:
from wepy.analysis.network import MacroStateNetwork

random_state_net = MacroStateNetwork(contigtree, transition_lag_time=3,
                                     assg_field_key="observables/rand_assg_idx")

Because this object directly wraps the WepyHDF5 file, we have access to all the data including weights, positions, box sizes etc. and so we can perform all kinds of fancy operations on a macrostate basis.

Weights/Free Energy of the Macrostates

Since we are principally interested in the free energy of macrostates the weights of the macrostates are a canonical example.


In [ ]:
# compute the weights of the macrostates and set them as node attributes
random_state_net.set_nodes_field('Weight', random_state_net.macrostate_weights())

# get the weight of a node
print(random_state_net.graph.node[39]['Weight'])
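
With the weights in hand, relative free energies follow directly as F_i = -kT ln(w_i). Below is a minimal sketch in units of kT, assuming macrostate_weights() returns a mapping from node id to total weight (as its use with set_nodes_field above suggests).


In [ ]:
import numpy as np

# free energies in units of kT, shifted so the most probable macrostate is zero
node_weights = random_state_net.macrostate_weights()
free_energies = {node: -np.log(weight) for node, weight in node_weights.items()}
lowest = min(free_energies.values())
free_energies = {node: fe - lowest for node, fe in free_energies.items()}

# show a few of them
print(list(free_energies.items())[:5])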

Transition Probability Matrix

To make an actual network we would need the transition probabilities as well, which were calculated with the lag time given when we created the network.


In [ ]:
print(random_state_net.probmat)

Updated User Workflow

Synergism with CSNAnalysis

This is the asymmetric probability matrix. From here on you can calculate interesting things with it (paths, committors, etc.) using the CSNAnalysis package.

The purpose of this class is to calculate transition probabilities and create a direct interface to microstates in the HDF5 from a macrostate perspective.


In [ ]:
from csnanalysis.csn import CSN
from csnanalysis.matrix import *

csn = CSN(random_state_net.countsmat, symmetrize=True)

Committor Probabilities

Calculate the distance between the two particles for a representative (first) frame of each macrostate:


In [ ]:
from scipy.spatial.distance import euclidean

dists = []
for node_id, field_d in random_state_net.iter_nodes_fields(['positions']).items():
    
    # just use the positions of the first frame in the cluster
    pos_A, pos_B = field_d['positions'][0][0], field_d['positions'][0][1]
    
    dist = euclidean(pos_A, pos_B)
    dists.append(dist)

Determine the source and sink basins and compute the committor probabilities:


In [ ]:
# the sink basins are those close to the unbinding cutoff
sink_basin = [int(i) for i in np.argwhere(np.array(dists) > 2.5)]
# the source basins are where they are close together
source_basin = [int(i) for i in np.argwhere(np.array(dists) < 0.37)]

print("Number of sink states:", len(sink_basin))
print("Number of source states:", len(source_basin))

committor_probabilities = csn.calc_committors([source_basin, sink_basin])
committor_probabilities[39]

Microstates in the Macrostate

The macrostate keeps track of its microstates by a 'trace', which in wepy parlance is just a list of indices into the main wepy HDF5 data structure, each identifying a specific microstate, i.e. a list of (run index, trajectory index, cycle index) tuples.


In [ ]:
node8_trace = random_state_net.node_assignments(8)
print(node8_trace)

Visualizing Microstates

This makes it very easy to visualize the microstates by exporting them, via mdtraj, to trajectory formats better supported by molecular viewers.


In [ ]:
# get an mdtraj trajectory object from the microstates in a node
node8_traj = random_state_net.state_to_mdtraj(8)
node8_traj.superpose(node8_traj)
print("{} frames in macrostate {}".format(node8_traj.n_frames, 8))
import nglview as nv
view = nv.show_mdtraj(node8_traj)
view

Linking Runs Together

When doing many runs over different jobs and with continuations we found that it would be useful to be able to link them all into a single file and thus not have to worry about different files when doing analysis.

Luckily, the HDF5 standard provides a way to do this, and it is now incorporated into wepy.hdf5.

The main methods we can use to accomplish this are:

  • clone : copy the header information in a file without run data
  • add_continuation : manually specify a continuation within a file
  • link_run : link a single run from another file, optionally creating a continuation
  • link_file_runs : link all runs from another file, preserving their internal continuations

Example Datasets


In [ ]:
rescale(linking_files_im, 1.0)

In [ ]:
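# wepy2_path and wepy3_path are assumed to point at additional,
# pre-generated example files that are not created in this tutorial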
wepy2 = WepyHDF5(wepy2_path, mode='r')
wepy3 = WepyHDF5(wepy3_path, mode='r')
with wepy2:
    print("File 2 runs:", wepy2.run_idxs)
with wepy3:
    print("File 3 runs:", wepy3.run_idxs)

Code:


In [ ]:
# now we are going to link them all under one linker file
linker_h5_path = osp.join(outputs_dir, "all_runs.wepy.h5")
with wepy1:
    all_wepy = wepy1.clone(linker_h5_path, mode='w')

with all_wepy:

    # link the whole file for wepy1 to preserve the continuations
    wepy1_run_idxs = all_wepy.link_file_runs(wepy1_path)

    # do the same for wepy2 just to test it on only one run in a file and
    # because it is easier
    wepy2_run_idxs = all_wepy.link_file_runs(wepy2_path)

    # now we need to link the run from the continuation file for wepy2 and
    # add in the continuation records
    all_wepy.link_run(wepy3_path, 0, continue_run=wepy2_run_idxs[0])
    
    print(all_wepy.spanning_contigs())

In [ ]:
all_wepy.open()
wepy1.open()
wepy2.open()
wepy3.open()

Future Topics

  • Automatic exit point ancestry trees and trajectories
  • Easier restarts and continuations

Requests and Challenges?

  • stripping waters
  • clustering featurization examples
  • linking files with different amounts of solvent

TODO

  • wepy paper