There are two parameters to the algorithm, min_samples and eps, which define formally what we mean when we say dense. Higher min_samples or lower eps indicate higher density necessary to form a cluster.
More formally, we define a core sample as a sample in the dataset such that there exist min_samples other samples within a distance of eps; these are defined as neighbors of the core sample. This tells us that the core sample is in a dense area of the vector space. A cluster is a set of core samples that can be built recursively by taking a core sample, finding all of its neighbors that are core samples, finding all of their neighbors that are core samples, and so on. A cluster also has a set of non-core samples, which are samples that are neighbors of a core sample in the cluster but are not themselves core samples. Intuitively, these samples are on the fringes of the cluster.
Any core sample is, by definition, part of a cluster. Further, any cluster has at least min_samples points in it, following the definition of a core sample. Any sample that is not a core sample and is farther than eps from every core sample is considered an outlier by the algorithm.
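To make these definitions concrete, here is a minimal sketch (illustrative only, not part of the analysis below) that runs scikit-learn's DBSCAN on an invented one-dimensional dataset: two dense groups become clusters and the stray point is labeled as an outlier.
In [ ]:
import numpy as np
from sklearn.cluster import DBSCAN

# hypothetical toy data: a dense group near 0, a dense group near 10, and one stray point
toy = np.array([[0.0], [0.1], [0.2], [10.0], [10.1], [10.2], [50.0]])
db = DBSCAN(eps=0.5, min_samples=3).fit(toy)
print(db.labels_)               # [ 0  0  0  1  1  1 -1]; -1 marks the outlier
print(db.core_sample_indices_)  # indices of the core samples in the two dense groups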
In [1]:
import glob
import os
import numpy as np
import pandas as pd
np.set_printoptions(suppress=True)
In [2]:
from bokeh.io import output_notebook, show
from bokeh.resources import INLINE
output_notebook(resources=INLINE)
In [3]:
import bokeh.plotting
import bokeh.models # Legend, Range1d
import bokeh.layouts # gridplot
from bokeh.palettes import Dark2_7 as palette
In [4]:
import datashader
import collections
import xarray
from datashader.bokeh_ext import InteractiveImage
Euclidean Distance: $$ d_{ab} = \sqrt{\sum^N_{i=1}(a_i-b_i)^2}$$
Fractional difference used in SasCalc: $$ pd_{ab} = \frac{1}{N}\sum_{i=1}^N{\frac{\left|a_i-b_i\right|}{b_i}}$$
Modified fractional difference: $$ pd_{ab} = \frac{1}{N}\sum_{i=1}^N{\frac{\left|a_i-b_i\right|}{(b_i + a_i)/2}}$$
In [59]:
n = 100
a = 20 * np.ones(n)            # reference curve: constant value of 20
b = np.linspace(1, n, n)       # comparison values from 1 to 100
d = np.sqrt(np.square(a - b))  # element-wise Euclidean distance, i.e. |a - b|
pd_ab = np.abs(a - b) / b      # fractional difference relative to b
pd_ba = np.abs(a - b) / a      # fractional difference relative to a
mean_ab = (a + b) / 2
pd_mod = np.abs(a - b) / mean_ab  # modified fractional difference (symmetric in a and b); avoids shadowing the pandas alias pd
In [63]:
p = bokeh.plotting.figure(height=400)
p.line(b, d, color=palette[0], legend='Euclidean Distance', line_width=3)
p.line(b, pd_ab, color=palette[1], legend='Fractional Difference, AB', line_width=2)
p.line(b, pd_ba, color=palette[2], legend='Fractional Difference, BA', line_width=2)
p.line(b, pd_mod, color=palette[3], legend='Modified Fractional Difference', line_width=2)
bokeh.plotting.show(p)
In [50]:
def calc_pd(a, b):
    # modified fractional difference, symmetric in a and b
    mean_ab = (a + b) / 2.0
    pd = np.abs(a - b) / mean_ab
    return pd
In [55]:
calc_pd(2, 3)
Out[55]:
In [56]:
calc_pd(2, 1)
Out[56]:
In [49]:
np.savetxt
Out[49]:
Begin by designating where the theoretical SAXS and P(r) data files are located.
In [16]:
import sys
sys.platform
Out[16]:
In [17]:
if 'linux' in sys.platform:
    saxs_dir = '/home/schowell/data/scratch/docking/xray'
    pr_dir = '/home/schowell/data/scratch/docking/pr'
elif sys.platform == 'darwin':
    saxs_dir = '/Users/schowell/scratch/docking/sascalc'
    pr_dir = '/Users/schowell/scratch/docking/pr'
sas_ext = '*.iq'
saxs_search = os.path.join(saxs_dir, sas_ext)
print(saxs_search)
pr_ext = '*.pr'
pr_search = os.path.join(pr_dir, pr_ext)
print(pr_search)
In [18]:
saxs_files = glob.glob(saxs_search)
pr_files = glob.glob(pr_search)
saxs_files.sort()
pr_files.sort()
n_saxs = len(saxs_files)
n_pr = len(pr_files)
print('Found {} SAXS data files'.format(n_saxs))
print('Found {} P(r) data files'.format(n_pr))
In [27]:
n = 1000
In [19]:
saxs_data = []
# load in the first data set to setup the q-mask
first_data = np.loadtxt(saxs_files[0])
q_mask = first_data[:, 0] <= 0.18 # only use data up to 0.18 1/A
first_data[:, 1] /= first_data[0, 1] # normalize I(0), to 1
first_data = first_data[q_mask]
saxs_data.append(first_data[1:, 1]) # do not use I(0), it is the same for every dataset
# load in the rest of the data
for saxs_file in saxs_files[1:]:
    x_data = np.loadtxt(saxs_file)
    x_data[:, 1] /= x_data[0, 1]
    x_data = x_data[q_mask]
    assert np.allclose(x_data[0, 1], first_data[0, 1]), 'ERROR: data not normalized to I(0)'
    assert np.allclose(x_data[:, 0], first_data[:, 0]), 'ERROR: data not on same Q-grid'
    saxs_data.append(x_data[1:, 1])
saxs_data = np.array(saxs_data)
Store the Q-values
In [20]:
q_saxs = x_data[1:, 0]
print(q_saxs)
In [21]:
pr_data_l = []
n_pr = len(pr_files)
n_r = np.empty(n_pr, dtype=int)
# load in all of the data sets
for i, pr_file in enumerate(pr_files):
    n_data = np.loadtxt(pr_file, delimiter=',', dtype=int)
    pr_data_l.append(n_data[:, 1])
    n_r[i] = len(n_data)
r_max = n_r.max()
pr_data = np.zeros([n_pr, r_max], dtype=int)
# pad shorter P(r) curves with zeros so every row has the same length
for i, n_data in enumerate(pr_data_l):
    pr_data[i, :len(n_data)] = n_data
#pr_data = np.array(sans_data)
In [22]:
pr_data[:3, :8]
Out[22]:
Create a dataframe for plotting purposes (each column will be a different I(Q) or P(r) curve)
In [30]:
n = 1000
In [28]:
saxs_df = pd.DataFrame(data=saxs_data[:n].T)
saxs_cols = ['c{}'.format(c) for c in saxs_df.columns]
saxs_df.columns = saxs_cols
saxs_df['q'] = q_saxs
saxs_df.head()
Out[28]:
In [29]:
pr_df = pd.DataFrame(data=pr_data[:n].T)
pr_cols = ['c{}'.format(c) for c in pr_df.columns]
pr_df.columns = pr_cols
r = np.arange(r_max)
pr_df['r'] = r
pr_df.head()
Out[29]:
In [31]:
q_range = q_saxs.min(), q_saxs.max()
pq_range = saxs_data.min(), saxs_data.max()
In [32]:
r_range = 0, r_max
pr_range = pr_data.min(), pr_data.max()
In [ ]:
%%time
saxs_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range, plot_height=300, plot_width=300)
# create an ordered-dictionary of each line, aggregated
saxs_aggs = collections.OrderedDict((c, saxs_canvas.line(saxs_df, 'q', c)) for c in saxs_cols)
saxs_merged = xarray.concat(saxs_aggs.values(), dim=pd.Index(saxs_cols, name='cols'))
saxs_img = datashader.transfer_functions.shade(saxs_merged.sum(dim='cols'), how='eq_hist')
In [ ]:
%%time
pr_canvas = datashader.Canvas(x_range=r_range, y_range=pr_range, plot_height=300, plot_width=300)
# create an ordered-dictionary of each line, aggregated
pr_aggs = collections.OrderedDict((c, pr_canvas.line(pr_df, 'r', c)) for c in pr_cols)
pr_merged = xarray.concat(pr_aggs.values(), dim=pd.Index(pr_cols, name='cols'))
pr_img = datashader.transfer_functions.shade(pr_merged.sum(dim='cols'), how='eq_hist')
In [33]:
def saxs_image(q_range, pq_range, w, h):
    saxs_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range,
                                    plot_height=h, plot_width=w)
    saxs_aggs = collections.OrderedDict((c, saxs_canvas.line(saxs_df, 'q', c)) for c in saxs_cols)
    saxs_merged = xarray.concat(saxs_aggs.values(), dim=pd.Index(saxs_cols, name='cols'))
    saxs_img = datashader.transfer_functions.shade(saxs_merged.sum(dim='cols'), how='eq_hist')
    return saxs_img
p = bokeh.plotting.figure(x_range=q_range, y_range=pq_range,
plot_width=400, plot_height=300,
x_axis_label='Q (1/A)', y_axis_label='P(Q)')
InteractiveImage(p, saxs_image)
Out[33]:
In [34]:
def pr_image(r_range, pr_range, w, h):
    pr_canvas = datashader.Canvas(x_range=r_range, y_range=pr_range,
                                  plot_height=h, plot_width=w)
    pr_aggs = collections.OrderedDict((c, pr_canvas.line(pr_df, 'r', c)) for c in pr_cols)
    pr_merged = xarray.concat(pr_aggs.values(), dim=pd.Index(pr_cols, name='cols'))
    pr_img = datashader.transfer_functions.shade(pr_merged.sum(dim='cols'), how='eq_hist')
    return pr_img
p = bokeh.plotting.figure(x_range=r_range, y_range=pr_range,
plot_width=400, plot_height=300,
x_axis_label='r (A)', y_axis_label='P(r)')
InteractiveImage(p, pr_image)
Out[34]:
In [35]:
n = 10000
In [36]:
saxs_df = pd.DataFrame(data=saxs_data[:n].T)
saxs_cols = ['c{}'.format(c) for c in saxs_df.columns]
saxs_df.columns = saxs_cols
saxs_df['q'] = q_saxs
saxs_df.head()
Out[36]:
In [37]:
pr_df = pd.DataFrame(data=pr_data[:n].T)
pr_cols = ['c{}'.format(c) for c in pr_df.columns]
pr_df.columns = pr_cols
r = np.arange(r_max)
pr_df['r'] = r
pr_df.head()
Out[37]:
In [ ]:
def saxs_image(q_range, pq_range, w, h):
    saxs_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range,
                                    plot_height=h, plot_width=w)
    saxs_aggs = collections.OrderedDict((c, saxs_canvas.line(saxs_df, 'q', c)) for c in saxs_cols)
    saxs_merged = xarray.concat(saxs_aggs.values(), dim=pd.Index(saxs_cols, name='cols'))
    saxs_img = datashader.transfer_functions.shade(saxs_merged.sum(dim='cols'), how='eq_hist')
    return saxs_img
p = bokeh.plotting.figure(x_range=q_range, y_range=pq_range,
plot_width=400, plot_height=300,
x_axis_label='Q (1/A)', y_axis_label='P(Q)')
InteractiveImage(p, saxs_image)
Out[ ]:
In [ ]:
def pr_image(r_range, pr_range, w, h):
    pr_canvas = datashader.Canvas(x_range=r_range, y_range=pr_range,
                                  plot_height=h, plot_width=w)
    pr_aggs = collections.OrderedDict((c, pr_canvas.line(pr_df, 'r', c)) for c in pr_cols)
    pr_merged = xarray.concat(pr_aggs.values(), dim=pd.Index(pr_cols, name='cols'))
    pr_img = datashader.transfer_functions.shade(pr_merged.sum(dim='cols'), how='eq_hist')
    return pr_img
p = bokeh.plotting.figure(x_range=r_range, y_range=pr_range,
plot_width=400, plot_height=300,
x_axis_label='r (A)', y_axis_label='P(r)')
InteractiveImage(p, pr_image)
In [ ]:
sans_img  # NOTE: sans_img is not created above; only saxs_img and pr_img are defined in this notebook
In [ ]:
saxs_img
In [ ]:
pr_img
In [ ]:
n_samples, n_features = saxs_data.shape # for PCA, should be (n_samples, n_features)
print('# samples: {}\n# features: {}'.format(n_samples, n_features))
Each row in the NumPy array is a separate scattering pattern or pair-distance distribution curve. Each column will be a feature we explore.
In [ ]:
print(saxs_data[:3, :5])
print(pr_data[:3, :5])
In [ ]:
min_vals = saxs_data.min(axis=0)
max_vals = saxs_data.max(axis=0)
saxs_range = max_vals - min_vals
print(saxs_range)
Notice how the variation depends significantly on Q.
In [ ]:
min_vals = pr_data.min(axis=0)
max_vals = pr_data.max(axis=0)
pr_range = max_vals - min_vals
print(pr_range)
In [ ]:
range_pq = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Data Range')
range_pr = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Data Range')
range_pq.line(q_saxs, saxs_range, legend='SAXS range', color=palette[0])
range_pr.line(r, pr_range, legend='P(r) range', color=palette[3])
range_pr.legend.location = "bottom_center"
plots = bokeh.layouts.gridplot([[range_pq, range_pr]])
show(plots)
In [ ]:
from sklearn.preprocessing import StandardScaler, RobustScaler
saxs_scaler = RobustScaler()
sans_scaler = RobustScaler()
pr_scaler = RobustScaler()
In [ ]:
saxs_scaler.fit(saxs_data)
pr_scaler.fit(pr_data)
In [ ]:
scaled_saxs = saxs_scaler.transform(saxs_data)
scaled_pr = pr_scaler.transform(pr_data)
print(scaled_saxs[:3, :5])
print(scaled_pr[:3, :5])
In [ ]:
min_vals = scaled_saxs.min(axis=0)
max_vals = scaled_saxs.max(axis=0)
saxs_scaled_range = max_vals - min_vals
min_vals = scaled_pr.min(axis=0)
max_vals = scaled_pr.max(axis=0)
pr_scaled_range = max_vals - min_vals
print(saxs_scaled_range)
print(pr_scaled_range)
Notice how rescaling the data using the RobustScaler has removed the $Q$ or $r$ dependence from the ranges.
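As a reminder of how the RobustScaler works, here is a minimal, self-contained sketch on invented numbers (not part of the analysis): each column is centered on its median and divided by its interquartile range, which puts columns with very different spreads onto comparable scales.
In [ ]:
import numpy as np
from sklearn.preprocessing import RobustScaler

# hypothetical two-column data: the second column has 100x the spread of the first
toy = np.array([[1.0, 100.0],
                [2.0, 200.0],
                [3.0, 300.0],
                [4.0, 400.0]])
toy_scaler = RobustScaler().fit(toy)
print(toy_scaler.center_)         # per-column medians
print(toy_scaler.scale_)          # per-column interquartile ranges
print(toy_scaler.transform(toy))  # both columns now span the same range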
In [ ]:
range_pq0 = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Range Before')
range_pr0 = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Comparison')
range_pq1 = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Comparison')
range_pr1 = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Range After')
range_pq0.line(q_saxs, saxs_range, legend='SAXS before', color=palette[0])
range_pq1.line(q_saxs, saxs_range, legend='SAXS before', color=palette[0])
range_pq1.line(q_saxs, saxs_scaled_range, legend='SAXS after', color=palette[0], line_dash= [4, 2])
range_pr0.line(r, pr_range, legend='before', color=palette[3])
range_pr0.line(r, pr_scaled_range, legend='after', color=palette[3], line_dash= [4, 2])
range_pr1.line(r, pr_scaled_range, legend='after', color=palette[3], line_dash= [4, 2])
range_pr0.legend.location = "bottom_center"
range_pr1.legend.location = "top_left"
plots = bokeh.layouts.gridplot([[range_pq0, range_pr0],
[range_pq1, range_pr1]])
show(plots)
Note that for P(r), the last 5 points in almost every data set are zero, so the range of those columns did not change at all: their interquartile range is zero, so the scaler leaves them with a unit scale. This is shown below.
In [ ]:
pr_data[:, -5:]
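A quick check of the fitted scaler confirms this (assuming pr_scaler was fit above with its default settings): for those trailing columns the interquartile range is zero, so the scale falls back to one and the values pass through unchanged.
In [ ]:
print(pr_scaler.scale_[-5:])   # unit scale where the interquartile range is zero
print(pr_scaler.center_[-5:])  # per-column medians of the trailing columns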
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
def compare_q1_q2(q1, q2):
    plt.figure()
    plt.plot(saxs_data[:, q1], saxs_data[:, q2], 'bo', markersize=5)
    plt.title('original data')
    plt.xlabel(r'$Q_{}$'.format(q1))
    plt.ylabel(r'$Q_{}$'.format(q2))
    plt.figure()
    plt.plot(scaled_saxs[:, q1], scaled_saxs[:, q2], 'bo', markersize=5)
    plt.xlabel(r'$Q_{}$'.format(q1))
    plt.ylabel(r'$Q_{}$'.format(q2))
    plt.title('scaled data')
In [ ]:
compare_q1_q2(0, 8)
In [ ]:
plt.figure()
plt.plot(saxs_data[:,0], saxs_data[:,15], 'bo', markersize=5)
plt.title('original data')
plt.xlabel(r'$Q_{}$'.format(0))
plt.ylabel(r'$Q_{%d}$'%15)
plt.figure()
plt.plot(scaled_saxs[:,0], scaled_saxs[:,17], 'bo', markersize=5)
plt.xlabel(r'$Q_{}$'.format(0))
plt.ylabel(r'$Q_{}$'.format(17))
plt.title('scaled data')
In [ ]:
scaled_saxs.shape
In [ ]:
i0 = 2
i_compare = 0
for i0 in range(18):
    plt.figure()
    plt.plot(scaled_saxs[:, i0], scaled_saxs[:, i_compare], 'bo')
    plt.plot(scaled_saxs[112, i0], scaled_saxs[112, i_compare], 'rs')
    plt.plot(scaled_saxs[113, i0], scaled_saxs[113, i_compare], 'gs')
    plt.xlabel(r'$Q_{}$'.format(i0))
    plt.ylabel(r'$Q_{}$'.format(i_compare))
In [ ]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
We need to determine what distance, eps, to use as the neighborhood radius that defines core points. To do this we will use the $k$-dist plot: the sorted distance from each sample to its $k^{th}$ nearest neighbor:
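As an aside, the same $k$-dist curve can be computed directly with scikit-learn's NearestNeighbors rather than building a full pairwise-distance matrix. This is only a sketch (the choice of k here is an assumption; typically k is set to min_samples); the cells below use the explicit distance matrix instead.
In [ ]:
from sklearn.neighbors import NearestNeighbors

k = 2  # hypothetical choice
nn = NearestNeighbors(n_neighbors=k + 1).fit(scaled_saxs)  # +1 because each sample is its own nearest neighbor
nn_dist, _ = nn.kneighbors(scaled_saxs)
k_dist_alt = np.sort(nn_dist[:, k])  # distance to the k-th nearest neighbor, sorted for the k-dist plot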
In [ ]:
from scipy.spatial.distance import pdist
In [ ]:
dist = np.zeros([n_samples, n_samples])
i = np.arange(n_samples)
dist[i[:, None] < i] = pdist(scaled_saxs) # get the distances, consider other distance metrics
i_upper = np.triu_indices(n_samples, 1)
i_lower = np.tril_indices(n_samples, -1)
dist[i_lower] = dist[i_upper] # make the matrix symmetric
np.fill_diagonal(dist, np.inf) # fill the diagonal with large value for simplicity
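The comment above suggests considering other distance metrics. As one hedged example, the modified fractional difference defined earlier could be passed to pdist as a callable metric; this is only a sketch and is not what the cells below use (the raw intensities are used here because the scaled data can put values near zero in the denominator).
In [ ]:
def modified_frac_diff(u, v):
    # mean element-wise modified fractional difference between two curves
    return np.mean(np.abs(u - v) / ((u + v) / 2.0))

# frac_dist = pdist(saxs_data, metric=modified_frac_diff)  # slow for many samples; shown for illustration only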
In [ ]:
k_dist = dist.min(axis=1)  # distance to each sample's nearest neighbor (k = 1)
k_dist.sort()
In [ ]:
k_dist
In [ ]:
saxs_data
In [ ]:
raw_dist = np.zeros([n_samples, n_samples])
i = np.arange(n_samples)
raw_dist[i[:, None] < i] = pdist(saxs_data) # get the distances, consider other distance metrics
i_upper = np.triu_indices(n_samples, 1)
i_lower = np.tril_indices(n_samples, -1)
raw_dist[i_lower] = raw_dist[i_upper] # make the matrix symmetric
np.fill_diagonal(raw_dist, np.inf) # fill the diagonal with large value for simplicity
In [ ]:
raw_k_dist = raw_dist.min(axis=1)
raw_k_dist.sort()
In [ ]:
raw_k_dist
In [ ]:
k_plot = bokeh.plotting.figure(y_axis_label='distance')
k_plot.line(i, k_dist/k_dist.max(), color=palette[-1], legend='scaled')
k_plot.line(i, raw_k_dist/raw_k_dist.max(), color=palette[0], legend='raw')
'''
n = 16
exponents = np.polyfit(i, raw_k_dist/raw_k_dist.max(), n)
fit = np.poly1d(exponents)
y = fit(i)
k_plot.line(i, y, color=palette[1], legend='fit ({})'.format(n))
d1_norm = smooth_diff_n5(i, raw_k_dist/raw_k_dist.max())
k_plot.line(d1_norm[:, 0], d1_norm[:, 1], color=palette[2], legend='d1')
d1_fit = smooth_diff_n5(i, y)
k_plot.line(d1_fit[:, 0], d1_fit[:, 1], color=palette[3], legend='d1 fit')
d2 = smooth_diff_n5(d1_norm[:, 0], d1_norm[:, 1])
k_plot.line(d2[:, 0], d2[:, 1], color=palette[4], legend='d2')
d2_fit = smooth_diff_n5(d1_fit[:, 0], d1_fit[:, 1])
k_plot.line(d2_fit[:, 0], d2_fit[:, 1], color=palette[5], legend='d2 fit')
#k_plot.line(i[1:-1], d2_dist, color=palette[0], line_dash=[5,2], legend='scaled d2')
# k_plot.line(i[1:-1], d2_k_dist, color=palette[2], line_dash=[10,1], legend='scaled np d2')
# k_plot.line(i[1:], d_raw_k_dist, color=palette[3], line_dash=[10,1], legend='raw d')
# k_plot.line(i[1:-1], d2_raw_dist, color=palette[1], line_dash=[5,2], legend='raw d2')
'''
k_plot.legend.location = "top_left"
bokeh.plotting.show(k_plot)
In [ ]:
x = np.arange(10)
In [ ]:
xx = np.vstack([x,x]).T
In [ ]:
def smooth_diff_n5(x, y):
    # 5-point smoothed first derivative; requires evenly spaced x
    h_all = x[1:] - x[:-1]
    h = h_all[0]
    assert np.allclose(h_all, h), 'inconsistent dx'
    dy = (2 * (y[3:-1] - y[1:-3]) + y[4:] - y[:-4]) / (8 * h)
    dydx = np.vstack([x[2:-2], dy]).T
    return dydx
In [ ]:
def smooth_diff_n7(x, y):
    # 7-point smoothed first derivative; requires evenly spaced x
    h_all = x[1:] - x[:-1]
    h = h_all[0]
    assert np.allclose(h_all, h), 'inconsistent dx'
    dy = (5 * (y[4:-2] - y[2:-4]) + 4 * (y[5:-1] - y[1:-5]) + y[6:] - y[:-6]) / (32 * h)
    dydx = np.vstack([x[3:-3], dy]).T
    return dydx
In [ ]:
d1_norm = smooth_diff_n5(i, raw_k_dist/raw_k_dist.max())
In [ ]:
d1_raw = smooth_diff_n5(i, raw_k_dist)
In [ ]:
x = np.linspace(0, 2*np.pi)
y = np.sin(x)
dydx = smooth_diff_n5(x, y)
d2ydx2 = smooth_diff_n5(dydx[:, 0], dydx[:, 1])
p = bokeh.plotting.figure()
p.line(x, y, legend='sin(x)', color=palette[0])
p.line(dydx[:, 0], dydx[:, 1], legend='cos(x)', color=palette[1])
p.line(d2ydx2[:, 0], d2ydx2[:, 1], legend='-sin(x)', color=palette[2])
bokeh.plotting.show(p)
In [ ]:
# Compute DBSCAN
## Tune these parameters to adjust cluster size ##
distance = 1  # <-- need to explore tuning this parameter
min_samples = 2
##################################################
x_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_saxs)
x_core_samples_mask = np.zeros_like(x_db.labels_, dtype=bool)
x_core_samples_mask[x_db.core_sample_indices_] = True
saxs_labels = x_db.labels_ + 1  # 0's are independent structures (noise)
x_clusters_ = len(set(saxs_labels)) - (1 if 0 in saxs_labels else 0)
# NOTE: no scaled SANS data is created above, so the "neutron" fit also uses scaled_saxs
n_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_saxs)
n_core_samples_mask = np.zeros_like(n_db.labels_, dtype=bool)
n_core_samples_mask[n_db.core_sample_indices_] = True
sans_labels = n_db.labels_ + 1  # 0's are independent structures (noise)
n_clusters_ = len(set(sans_labels)) - (1 if 0 in sans_labels else 0)
p_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_pr)
p_core_samples_mask = np.zeros_like(p_db.labels_, dtype=bool)
p_core_samples_mask[p_db.core_sample_indices_] = True
pr_labels = p_db.labels_ + 1  # 0's are independent structures (noise)
p_clusters_ = len(set(pr_labels)) - (1 if 0 in pr_labels else 0)
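Since the eps value above still needs tuning, one simple way to explore it is to scan a few candidate values and report how many clusters and noise points each produces. The candidate values below are arbitrary; this is a sketch only.
In [ ]:
for eps_try in [0.5, 1.0, 2.0, 5.0]:  # arbitrary candidate values
    labels_try = DBSCAN(eps=eps_try, min_samples=min_samples).fit(scaled_saxs).labels_
    n_clusters_try = len(set(labels_try)) - (1 if -1 in labels_try else 0)
    n_noise_try = list(labels_try).count(-1)
    print('eps={}: {} clusters, {} noise points'.format(eps_try, n_clusters_try, n_noise_try))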
In [ ]:
# x-ray clusters
x_unique = set(saxs_labels)
x_unique.remove(0)
print('cluster labels: {}'.format(x_unique))
print('unique clusters: {}'.format(len(x_unique) + list(saxs_labels).count(0)))
for c in set(saxs_labels):
    print('{}: {}'.format(c, list(saxs_labels).count(c)))
In [ ]:
# neutron clusters
unique = set(sans_labels)
unique.remove(0)
total_clusters = len(unique) + list(sans_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(sans_labels):
    print('{}: {}'.format(c, list(sans_labels).count(c)))
In [ ]:
# pr clusters
unique = set(pr_labels)
unique.remove(0)
total_clusters = len(unique) + list(pr_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(pr_labels):
    print('{}: {}'.format(c, list(pr_labels).count(c)))
In [ ]:
np.savetxt('saxs_clusters.txt', saxs_labels, fmt='%d')
np.savetxt('sans_clusters.txt', sans_labels, fmt='%d')
np.savetxt('pr_clusters.txt', pr_labels, fmt='%d')
In [ ]:
print(sans_labels)
print(sans_labels.shape)
slabels = np.array(sans_labels, dtype='str')
# print(slabels)
# print(slabels.shape)
In [ ]:
from matplotlib import offsetbox
i_compare = 0
mn = scaled_saxs.min(axis=0)
mx = scaled_saxs.max(axis=0)
# for i0 in range(1):
for i0 in range(18):
    plt.figure()
    # plot points to make the correct box size
    plt.plot(mn[i0], mn[i_compare], 'w.')
    plt.plot(mx[i0], mx[i_compare], 'w.')
    for j in range(len(scaled_saxs)):
        if slabels[j] != '0':
            plt.text(scaled_saxs[j, i0], scaled_saxs[j, i_compare], slabels[j],
                     fontdict={'weight': 'bold', 'size': 15},
                     color='r')  # plt.cm.Set1(labels[i]/10.0))
        else:
            plt.plot(scaled_saxs[j, i0], scaled_saxs[j, i_compare], 'k.',
                     markersize=5)
    plt.xlabel(r'$Q_{}$'.format(i0))
    plt.ylabel(r'$Q_{}$'.format(i_compare))
In [ ]:
import hdbscan
In [ ]:
# Compute HDBSCAN (x-ray) and DBSCAN (neutron, P(r))
## Tune these parameters to adjust cluster size ##
distance = 1  # <-- need to explore tuning this parameter
min_samples = 2
##################################################
# HDBSCAN takes no eps parameter and exposes no core_sample_indices_;
# min_cluster_size plays a role comparable to min_samples here
x_db = hdbscan.HDBSCAN(min_cluster_size=min_samples, min_samples=min_samples).fit(scaled_saxs)
saxs_labels = x_db.labels_ + 1  # 0's are independent structures (noise)
x_clusters_ = len(set(saxs_labels)) - (1 if 0 in saxs_labels else 0)
# NOTE: no scaled SANS data is created above, so the "neutron" fit also uses scaled_saxs
n_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_saxs)
n_core_samples_mask = np.zeros_like(n_db.labels_, dtype=bool)
n_core_samples_mask[n_db.core_sample_indices_] = True
sans_labels = n_db.labels_ + 1  # 0's are independent structures (noise)
n_clusters_ = len(set(sans_labels)) - (1 if 0 in sans_labels else 0)
p_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_pr)
p_core_samples_mask = np.zeros_like(p_db.labels_, dtype=bool)
p_core_samples_mask[p_db.core_sample_indices_] = True
pr_labels = p_db.labels_ + 1  # 0's are independent structures (noise)
p_clusters_ = len(set(pr_labels)) - (1 if 0 in pr_labels else 0)
In [ ]:
# x-ray clusters
x_unique = set(saxs_labels)
x_unique.remove(0)
print('cluster labels: {}'.format(x_unique))
print('unique clusters: {}'.format(len(x_unique) + list(saxs_labels).count(0)))
for c in set(saxs_labels):
    print('{}: {}'.format(c, list(saxs_labels).count(c)))
In [ ]:
# neutron clusters
unique = set(sans_labels)
unique.remove(0)
total_clusters = len(unique) + list(sans_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(sans_labels):
    print('{}: {}'.format(c, list(sans_labels).count(c)))
In [ ]:
# pr clusters
unique = set(pr_labels)
unique.remove(0)
total_clusters = len(unique) + list(pr_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(pr_labels):
    print('{}: {}'.format(c, list(pr_labels).count(c)))
In [ ]:
np.savetxt('saxs_clusters.txt', saxs_labels, fmt='%d')
np.savetxt('sans_clusters.txt', sans_labels, fmt='%d')
np.savetxt('pr_clusters.txt', pr_labels, fmt='%d')
In [ ]:
print(sans_labels)
print(sans_labels.shape)
slabels = np.array(sans_labels, dtype='str')
# print(slabels)
# print(slabels.shape)
In [ ]:
from matplotlib import offsetbox
i_compare = 0
mn = scaled_saxs.min(axis=0)
mx = scaled_saxs.max(axis=0)
# for i0 in range(1):
for i0 in range(18):
    plt.figure()
    # plot points to make the correct box size
    plt.plot(mn[i0], mn[i_compare], 'w.')
    plt.plot(mx[i0], mx[i_compare], 'w.')
    for j in range(len(scaled_saxs)):
        if slabels[j] != '0':
            plt.text(scaled_saxs[j, i0], scaled_saxs[j, i_compare], slabels[j],
                     fontdict={'weight': 'bold', 'size': 15},
                     color='r')  # plt.cm.Set1(labels[i]/10.0))
        else:
            plt.plot(scaled_saxs[j, i0], scaled_saxs[j, i_compare], 'k.',
                     markersize=5)
    plt.xlabel(r'$Q_{}$'.format(i0))
    plt.ylabel(r'$Q_{}$'.format(i_compare))
In [ ]:
import sasmol.sasmol as sasmol
dcd_fname = glob.glob('*.dcd')
assert len(dcd_fname) == 1, 'ERROR: unsure which dcd file to use: {}'.format(dcd_fname)
dcd_fname = dcd_fname[0]
pdb_fname = glob.glob('*.pdb')
assert len(pdb_fname) == 1, 'ERROR: unsure which pdb file to use: {}'.format(pdb_fname)
pdb_fname = pdb_fname[0]
In [ ]:
mol = sasmol.SasMol(0)
mol.read_pdb(pdb_fname)
In [ ]:
if not np.all(sans_labels == saxs_labels):
    print('WARNING: labels do not match\nusing neutron labels')
labels = sans_labels
In [ ]:
dcd_fname
In [ ]:
# create a dcd for every cluster with >1 frame
dcd_fnames = []
cluster_out_files = [] # dcds for clusters
unique_out_fname = '{}_unique.dcd'.format(dcd_fname[:-4])
dcd_out_file = mol.open_dcd_write(unique_out_fname)  # dcd file for unique structures
dcd_in_file = mol.open_dcd_read(dcd_fname)
for i in range(len(unique)):
    dcd_fnames.append('{}_c{:02d}.dcd'.format(dcd_fname[:-4], i))
    cluster_out_files.append(mol.open_dcd_write(dcd_fnames[i]))
visited_cluster = set()
dcd_out_frame = 0
cluster_out_frame = np.zeros(len(unique), dtype=int)
for (i, label) in enumerate(labels):
    mol.read_dcd_step(dcd_in_file, i)
    if label == 0:
        # label 0 marks an independent structure, so it is always unique
        dcd_out_frame += 1
        mol.write_dcd_step(dcd_out_file, 0, dcd_out_frame)
    else:
        cluster_out_frame[label-1] += 1
        # print('adding frame to cluster {}'.format(label-1))
        # print(cluster_out_frame)
        mol.write_dcd_step(cluster_out_files[label-1], 0, cluster_out_frame[label-1])
        if label not in visited_cluster:
            # the first frame from each cluster also represents it in the unique dcd
            visited_cluster.add(label)
            dcd_out_frame += 1
            mol.write_dcd_step(dcd_out_file, 0, dcd_out_frame)
for cluster_out_file in cluster_out_files:
    mol.close_dcd_write(cluster_out_file)
mol.close_dcd_write(dcd_out_file)
mol.close_dcd_read(dcd_in_file[0])
In [ ]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
In [ ]:
pca_orig = PCA()
pca_orig.fit(saxs_data)
In [ ]:
pca_scaled = PCA()
pca_scaled.fit(scaled_saxs)
In [ ]:
print(pca_orig.explained_variance_ratio_)
print(pca_scaled.explained_variance_ratio_)
In [ ]:
plt.figure()
plt.plot(pca_orig.explained_variance_ratio_, 'o', label='unscaled')  # one value per principal component
plt.plot(pca_scaled.explained_variance_ratio_, 's', label='scaled')
plt.xlabel('principal component')
plt.ylabel('explained variance ratio')
plt.legend()
In [ ]:
from sklearn.datasets import make_blobs
In [ ]:
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
random_state=0)
In [ ]:
X_scaled = StandardScaler().fit_transform(X)
In [ ]:
X_range = X.max(axis=0) - X.min(axis=0)
print(X_range)
In [ ]:
X_scaled_range = X_scaled.max(axis=0) - X_scaled.min(axis=0)
print(X_scaled_range)
In [ ]:
X_s2 = StandardScaler().fit_transform(X)
In [ ]:
X_s2_range = X_s2.max(axis=0) - X_s2.min(axis=0)
print(X_s2_range)