In [1]:
from glob import glob
import numpy as np
from tqdm import tqdm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# featurized trajectories
# NOTE(review): hardcoded absolute path -- consider a configurable DATA_DIR
path = "/Users/joshuafass/abl_backbone_torsions/*.npy"
fnames = glob(path)
In [3]:
def parse_run_and_clone_from_fname(fname):
    """Parse (run, clone) integer indices from a featurized-trajectory filename.

    Expects basenames of the form 'run{R}-clone{C}_*.npy',
    e.g. '/path/run4-clone17_features.npy' -> (4, 17).
    """
    actual_fname = fname.split("/")[-1]  # basename (POSIX-style paths)
    fname_after_run = actual_fname[len("run"):]  # strip the 'run' prefix
    # maxsplit=1: stay robust if the remainder of the name contains extra '-' characters
    run_string, clone_string_ = fname_after_run.split("-", 1)
    clone_string = clone_string_.split("_")[0][len("clone"):]
    return int(run_string), int(clone_string)
parse_run_and_clone_from_fname(fnames[0])
Out[3]:
In [4]:
# load the data and labels
Xs = []
ys = []
for fname in tqdm(fnames):
    run, clone = parse_run_and_clone_from_fname(fname)
    Xs.append(np.load(fname)[::5])  # for "training", taking every 5th frame
    ys.append(np.ones(len(Xs[-1])) * run)  # label every kept frame with its RUN index
X = np.vstack(Xs)
y = np.hstack(ys)
X.shape, y.shape, len(set(y))  # sanity check: (n_frames, n_features), n_frames, n_distinct_RUNs
Out[4]:
In [5]:
# get the number of frames per RUN
# np.zeros requires an integer size; max(y) is a float (y holds float labels),
# which raises TypeError on modern numpy -- cast explicitly
frames_per_RUN = np.zeros(int(max(y)) + 1)
for fname in tqdm(fnames):
    run, clone = parse_run_and_clone_from_fname(fname)
    n_frames = len(np.load(fname))  # count ALL frames here (no subsampling)
    frames_per_RUN[run] += n_frames
In [6]:
lda = LinearDiscriminantAnalysis()
In [7]:
lda.fit(X, y)
Out[7]:
In [8]:
lda.score(X,y)
Out[8]:
In [9]:
y_ = lda.predict(X)
In [10]:
# see how well the learned linear classifier can separate trajectories by initial condition
c_m = confusion_matrix(y, y_)
# log1p instead of log: empty confusion-matrix cells are 0, and log(0) = -inf
# triggers RuntimeWarnings and renders badly in imshow
plt.imshow(np.log1p(c_m), interpolation="none", cmap="Blues")
Out[10]:
In [11]:
# classification accuracy
# trace = correctly classified frames; 1.0 * forces float division (py2-safe)
1.0 * np.trace(c_m) / np.sum(c_m)
Out[11]:
In [12]:
# fraction of nonzero matrix elements
# i.e. what fraction of (true RUN, predicted RUN) pairs ever co-occur
(1.0 * np.sum(c_m!=0)) / np.prod(c_m.shape)
Out[12]:
In [13]:
def subtract_diag(matrix):
    """Return a copy of `matrix` with its diagonal zeroed out (off-diagonal part only)."""
    diagonal_only = np.diag(np.diag(matrix))
    return matrix - diagonal_only
In [14]:
# I guess, we can ask which RUNs are connected to at least one other RUN?
In [15]:
subtract_diag(c_m).sum(0)
Out[15]:
In [16]:
# how many RUNs are "disconnected" from all others?
# (zero off-diagonal column sum == no frames from other RUNs ever predicted as this RUN)
sum(subtract_diag(c_m).sum(0) == 0), sum(subtract_diag(c_m).sum(0) != 0)
Out[16]:
In [17]:
def n_counts_in_set(run_set):
    """Total number of frames across a set of RUN indices (reads global frames_per_RUN)."""
    return sum(frames_per_RUN[run] for run in run_set)
In [18]:
# next, we can identify the largest connected component?
import networkx as nx
graph = nx.DiGraph(c_m > 10)
# frame counts of each strongly connected component, largest first.
# np.array is essential here: multiplying a Python *list* by 100 would
# replicate it 100x instead of scaling the values
component_counts = np.array(sorted(n_counts_in_set(g) for g in nx.strongly_connected_components(graph))[::-1])
100 * component_counts / np.sum(frames_per_RUN)
Out[18]:
In [19]:
# As we increase the threshold, what happens?
def get_percent_counts_in_largest_component(c_m, threshold=1, frames_per_class=None):
    """% of all frames that lie in the largest strongly connected component of
    the confusion graph with edges where c_m >= threshold.

    frames_per_class : per-class frame counts; defaults to the global frames_per_RUN.
    """
    if frames_per_class is None:
        frames_per_class = frames_per_RUN
    frames_per_class = np.asarray(frames_per_class)
    graph = nx.DiGraph(c_m >= threshold)
    # frame count of each strongly connected component, using the supplied weights
    component_counts = [frames_per_class[list(g)].sum() for g in nx.strongly_connected_components(graph)]
    return 100.0 * max(component_counts) / frames_per_class.sum()

def plot_cc_size_as_fxn_of_threshold(c_m, frames_per_class=frames_per_RUN, max_threshold=500):
    """Plot largest-strongly-connected-component mass (%) vs. edge threshold.

    Bug fix: frames_per_class is now actually passed through to
    get_percent_counts_in_largest_component -- previously it was ignored and the
    global frames_per_RUN was always used, so calls with custom weights were wrong.
    """
    thresholds = np.arange(1, max_threshold)
    cc_sizes = [get_percent_counts_in_largest_component(c_m, threshold, frames_per_class)
                for threshold in tqdm(thresholds)]
    plt.plot(thresholds, cc_sizes)
    # dashed reference line: the mass of the single largest class
    percent_counts_in_largest_class = 100.0 * np.max(frames_per_class) / np.sum(frames_per_class)
    plt.hlines(percent_counts_in_largest_class, thresholds[0], thresholds[-1], linestyle='--')
    plt.ylim(0, max(cc_sizes))
    plt.ylabel("% counts in largest strongly connected component")
    plt.xlabel("Threshold")
In [20]:
# now, let's do this for the full dataset, using the LDA model learned on a small subset of the data
y_full_true = []
y_full_pred = []
for fname in tqdm(fnames):
    run, clone = parse_run_and_clone_from_fname(fname)
    x = np.load(fname)  # every frame this time -- no subsampling
    y_full_pred.append(lda.predict(x))
    y_full_true.append(np.ones(len(x))*run)
In [21]:
# flatten the per-file label lists into single 1-D arrays
y_full_true = np.hstack(y_full_true)
y_full_pred = np.hstack(y_full_pred)
In [22]:
c_m = confusion_matrix(y_true=y_full_true, y_pred=y_full_pred)
In [23]:
plt.imshow(np.log(c_m), interpolation="none", cmap="Blues")
Out[23]:
In [24]:
plot_cc_size_as_fxn_of_threshold(c_m)
In [25]:
# which RUNs are strongly connected components
graph = nx.DiGraph(c_m > 10)  # edge i->j when >10 frames of RUN i are classified as RUN j
[g for g in nx.strongly_connected_components(graph)]
Out[25]:
In [26]:
def multimodal_potential(x):
    """Rugged potential: sinusoidal wells inside a quartic confining bowl."""
    sinusoidal_term = 5.0 * np.sin(10 * x).sum()
    confining_term = (x ** 4).sum()
    return sinusoidal_term + confining_term
def q(x):
    """Unnormalized target density: Boltzmann weight of the multimodal potential."""
    energy = multimodal_potential(x)
    return np.exp(-energy)
def rw_metropolis_hastings(x0, q, proposal_stdev=0.1, n_steps=1000):
    """Random-walk Metropolis-Hastings chain targeting the unnormalized density q.

    Parameters
    ----------
    x0 : array-like, initial state
    q : callable, unnormalized density q(x) -> float
    proposal_stdev : float, isotropic Gaussian proposal width
    n_steps : int, chain length (including the initial state)

    Returns
    -------
    (n_steps, dim) array of sampled states
    """
    dim = len(x0)
    chain = np.zeros((n_steps, dim))
    chain[0] = x0
    # pre-draw all randomness: Gaussian steps first, then uniform accept thresholds
    # (draw order matters for reproducibility with a fixed seed)
    steps = proposal_stdev * np.random.randn(n_steps, dim)
    thresholds = np.random.rand(n_steps)
    for t in range(1, n_steps):
        current = chain[t - 1]
        candidate = current + steps[t]
        # Metropolis criterion: accept when q(candidate)/q(current) exceeds U(0,1)
        if q(candidate) / q(current) > thresholds[t]:
            chain[t] = candidate
        else:
            chain[t] = current
    return chain
np.random.seed(0)  # reproducibility of starting points and chains
n_trajectories = 100
starting_conditions = [np.random.randn(2)*2 for _ in range(n_trajectories)]
trajectories = []
for x0 in tqdm(starting_conditions):
    trajectories.append(rw_metropolis_hastings(x0, q)[100:])  # discard the first 100 frames as burn-in
In [27]:
# overlay all toy trajectories in the 2-D plane
for traj in trajectories:
    plt.plot(traj[:,0], traj[:,1])
plt.xlim(-3,3)
plt.ylim(-3,3)
Out[27]:
In [28]:
# stack the toy trajectories; label each frame with its source-trajectory index
X_2d = np.vstack(trajectories)
y_2d = np.hstack([i * np.ones(len(trajectories[i])) for i in range(len(trajectories))])
In [29]:
# NOTE: rebinds `lda`, discarding the model fit to the protein data above
lda = LinearDiscriminantAnalysis()
lda.fit(X_2d, y_2d)
y_pred_2d = lda.predict(X_2d)
In [30]:
c_m = confusion_matrix(y_true=y_2d, y_pred=y_pred_2d)  # NOTE: rebinds c_m to the toy-data matrix
# log1p instead of log: log(0) on empty confusion cells gives -inf and a RuntimeWarning
plt.imshow(np.log1p(c_m), interpolation="none", cmap="Blues")
Out[30]:
In [31]:
graph = nx.DiGraph(c_m > 10)  # thresholded confusion graph for the 2-D toy data
ccs = [g for g in nx.strongly_connected_components(graph)]
sorted(ccs, key=len)[::-1]  # components, largest first
Out[31]:
In [32]:
# the largest strongly connected component = trajectories that mix with each other
largest_cc = sorted(ccs, key=len)[-1]
largest_cc
Out[32]:
In [33]:
plot_cc_size_as_fxn_of_threshold(c_m, [len(traj) for traj in trajectories])
In [34]:
# color inclusion in largest connected component
for traj_index, traj in enumerate(trajectories):
    in_active_set = traj_index in largest_cc
    # members of the active set drawn opaque blue; everything else faint grey
    color = "blue" if in_active_set else "grey"
    opacity = 1.0 if in_active_set else 0.1
    plt.plot(traj[:,0], traj[:,1], c=color, alpha=opacity)
plt.xlim(-3,3)
plt.ylim(-3,3)
plt.title("Identified active set")
Out[34]:
In [ ]: