There are two parameters to the algorithm, min_samples and eps, which formally define what we mean when we say dense. Higher min_samples or lower eps indicates that a higher density is necessary to form a cluster.
More formally, we define a core sample as a sample in the dataset such that there exist min_samples other samples within a distance of eps, which are defined as neighbors of the core sample. This tells us that the core sample is in a dense area of the vector space. A cluster is a set of core samples that can be built recursively by taking a core sample, finding all of its neighbors that are core samples, finding all of their neighbors that are core samples, and so on. A cluster also has a set of non-core samples, which are samples that are neighbors of a core sample in the cluster but are not themselves core samples. Intuitively, these samples are on the fringes of the cluster.
Any core sample is part of a cluster, by definition. Further, any cluster has at least min_samples points in it, following the definition of a core sample. Any sample that is not a core sample, and is at least eps in distance from every core sample, is considered an outlier by the algorithm.
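As a quick illustration of these definitions, here is a minimal sketch on toy 2-D points (not the scattering data used below): with eps=1 and min_samples=3, the four nearby points form a single cluster and the distant point is labeled as noise (-1).
In [ ]:
from sklearn.cluster import DBSCAN
import numpy as np

# four nearby points and one distant point
toy = np.array([[0.0, 0.0], [0.2, 0.1], [0.1, 0.3], [0.3, 0.2], [5.0, 5.0]])
toy_db = DBSCAN(eps=1.0, min_samples=3).fit(toy)
print(toy_db.labels_)               # expected: [ 0  0  0  0 -1]
print(toy_db.core_sample_indices_)  # indices of the core samples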
In [6]:
import glob
import os
import numpy as np
import pandas as pd
np.set_printoptions(suppress=True)
In [7]:
from bokeh.io import output_notebook, show
from bokeh.resources import INLINE
output_notebook(resources=INLINE)
In [8]:
import bokeh.plotting
import bokeh.models # Legend, Range1d
import bokeh.layouts # gridplot
from bokeh.palettes import Colorblind7 as palette
In [9]:
import datashader
import collections
import xarray
from datashader.bokeh_ext import InteractiveImage
Begin by designating where the theoretical SAXS and SANS data files are located.
In [10]:
import sys
sys.platform
Out[10]:
In [11]:
if sys.platform == 'linux':
sas_dir = '/home/schowell/data/scratch/sas_clustering/sascalc'
pr_dir = '/home/schowell/data/scratch/sas_clustering/pr'
elif sys.platform == 'darwin':
sas_dir = '/Users/schowell/scratch/sas_clustering/sascalc'
pr_dir = '/Users/schowell/scratch/sas_clustering/pr'
saxs_dir = 'xray'
sans_dir = 'neutron_D2Op_100'
sas_ext = '*.iq'
saxs_search = os.path.join(sas_dir, saxs_dir, sas_ext)
sans_search = os.path.join(sas_dir, sans_dir, sas_ext)
print(saxs_search)
print(sans_search)
pr_ext = '*.pr'
pr_search = os.path.join(pr_dir, pr_ext)
print(pr_search)
In [12]:
saxs_files = glob.glob(saxs_search)
sans_files = glob.glob(sans_search)
pr_files = glob.glob(pr_search)
saxs_files.sort()
sans_files.sort()
pr_files.sort()
n_saxs = len(saxs_files)
n_sans = len(sans_files)
n_pr = len(pr_files)
print('Found {} SAXS data files'.format(n_saxs))
print('Found {} SANS data files'.format(n_sans))
print('Found {} P(r) data files'.format(n_pr))
In [13]:
saxs_data = []
# load in the first data set to set up the q-mask
first_data = np.loadtxt(saxs_files[0])
q_mask = first_data[:, 0] <= 0.18 # only use data up to 0.18 1/A
first_data[:, 1] /= first_data[0, 1] # normalize so that I(0) = 1
first_data = first_data[q_mask]
saxs_data.append(first_data[1:, 1]) # do not use I(0), it is the same for every dataset
# load in the rest of the data
for saxs_file in saxs_files[1:]:
x_data = np.loadtxt(saxs_file)
x_data[:, 1] /= x_data[0, 1]
x_data = x_data[q_mask]
assert np.allclose(x_data[0, 1], first_data[0, 1]), 'ERROR: data not normalized to I(0)'
assert np.allclose(x_data[:, 0], first_data[:, 0]), 'ERROR: data not on same Q-grid'
saxs_data.append(x_data[1:, 1])
saxs_data = np.array(saxs_data)
In [14]:
sans_data = []
# load in the first data set to set up the q-mask
first_data = np.loadtxt(sans_files[0])
q_mask = first_data[:, 0] <= 0.18 # only use data up to 0.18 1/A
first_data[:, 1] /= first_data[0, 1] # normalize so that I(0) = 1
first_data = first_data[q_mask]
sans_data.append(first_data[1:, 1]) # do not use I(0), it is the same for every dataset
# load in the rest of the data
for sans_file in sans_files[1:]:
n_data = np.loadtxt(sans_file)
n_data[:, 1] /= n_data[0, 1]
n_data = n_data[q_mask]
assert np.allclose(n_data[0, 1], first_data[0, 1]), 'ERROR: data not normalized'
assert np.allclose(n_data[:, 0], first_data[:, 0]), 'ERROR: data not on same Q-grid'
sans_data.append(n_data[1:, 1])
sans_data = np.array(sans_data)
Store the Q-values
In [15]:
q_saxs = x_data[1:, 0]
q_sans = n_data[1:, 0]
print(q_saxs)
print(q_sans)
In [16]:
np.allclose(q_saxs, q_sans)
Out[16]:
In [17]:
pr_data_l = []
n_pr = len(pr_files)
n_r = np.empty(n_pr, dtype=int)
# load in all the data sets
for i, pr_file in enumerate(pr_files):
n_data = np.loadtxt(pr_file, delimiter=',', dtype=int)
pr_data_l.append(n_data[:, 1])
n_r[i] = len(n_data)
r_max = n_r.max()
pr_data = np.zeros([n_pr, r_max], dtype=int)
for i, n_data in enumerate(pr_data_l):
pr_data[i, :len(n_data)] = n_data
#pr_data = np.array(sans_data)
In [18]:
pr_data[:3, :8]
Out[18]:
Create a dataframe for plotting purposes (each column will be a different I(Q) or P(r) curve)
In [19]:
saxs_df = pd.DataFrame(data=saxs_data.T)
saxs_cols = ['c{}'.format(c) for c in saxs_df.columns]
saxs_df.columns = saxs_cols
saxs_df['q'] = q_saxs
saxs_df.head()
Out[19]:
In [20]:
sans_df = pd.DataFrame(data=sans_data.T)
sans_cols = ['c{}'.format(c) for c in sans_df.columns]
sans_df.columns = sans_cols
sans_df['q'] = q_sans
sans_df.head()
Out[20]:
In [21]:
pr_df = pd.DataFrame(data=pr_data.T)
pr_cols = ['c{}'.format(c) for c in pr_df.columns]
pr_df.columns = pr_cols
r = np.arange(r_max)
pr_df['r'] = r
pr_df.head()
Out[21]:
In [22]:
q_range = q_saxs.min(), q_saxs.max()
pq_range = saxs_data.min(), saxs_data.max()
r_range = 0, r_max
pr_range = pr_data.min(), pr_data.max()
In [28]:
%%time
sans_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range, plot_height=300, plot_width=300)
# create an ordered-dictionary of each line, aggregated
sans_aggs = collections.OrderedDict((c, sans_canvas.line(sans_df, 'q', c)) for c in sans_cols)
sans_merged = xarray.concat(sans_aggs.values(), dim=pd.Index(sans_cols, name='cols'))
sans_img = datashader.transfer_functions.shade(sans_merged.sum(dim='cols'), how='eq_hist')
In [29]:
%%time
saxs_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range, plot_height=300, plot_width=300)
# create an ordered-dictionary of each line, aggregated
saxs_aggs = collections.OrderedDict((c, saxs_canvas.line(saxs_df, 'q', c)) for c in saxs_cols)
saxs_merged = xarray.concat(saxs_aggs.values(), dim=pd.Index(saxs_cols, name='cols'))
saxs_img = datashader.transfer_functions.shade(saxs_merged.sum(dim='cols'), how='eq_hist')
In [30]:
%%time
pr_canvas = datashader.Canvas(x_range=r_range, y_range=pr_range, plot_height=300, plot_width=300)
# create an ordered-dictionary of each line, aggregated
pr_aggs = collections.OrderedDict((c, pr_canvas.line(pr_df, 'r', c)) for c in pr_cols)
pr_merged = xarray.concat(pr_aggs.values(), dim=pd.Index(pr_cols, name='cols'))
pr_img = datashader.transfer_functions.shade(pr_merged.sum(dim='cols'), how='eq_hist')
In [172]:
def saxs_image(q_range, pq_range, w, h):
saxs_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range,
plot_height=h, plot_width=w)
saxs_aggs = collections.OrderedDict((c, saxs_canvas.line(saxs_df, 'q', c)) for c in saxs_cols)
saxs_merged = xarray.concat(saxs_aggs.values(), dim=pd.Index(saxs_cols, name='cols'))
saxs_img = datashader.transfer_functions.shade(saxs_merged.sum(dim='cols'), how='eq_hist')
return saxs_img
p = bokeh.plotting.figure(x_range=q_range, y_range=pq_range,
plot_width=400, plot_height=300,
x_axis_label='Q (1/A)', y_axis_label='P(Q)')
InteractiveImage(p, saxs_image)
Out[172]:
In [173]:
def sans_image(q_range, pq_range, w, h):
sans_canvas = datashader.Canvas(x_range=q_range, y_range=pq_range,
plot_height=h, plot_width=w)
    sans_aggs = collections.OrderedDict((c, sans_canvas.line(sans_df, 'q', c)) for c in sans_cols)
sans_merged = xarray.concat(sans_aggs.values(), dim=pd.Index(sans_cols, name='cols'))
sans_img = datashader.transfer_functions.shade(sans_merged.sum(dim='cols'), how='eq_hist')
return sans_img
p = bokeh.plotting.figure(x_range=q_range, y_range=pq_range,
plot_width=400, plot_height=300,
x_axis_label='Q (1/A)', y_axis_label='P(Q)')
InteractiveImage(p, sans_image)
Out[173]:
In [33]:
def pr_image(r_range, pr_range, w, h):
pr_canvas = datashader.Canvas(x_range=r_range, y_range=pr_range,
plot_height=h, plot_width=w)
pr_aggs = collections.OrderedDict((c, pr_canvas.line(pr_df, 'r', c)) for c in pr_cols)
pr_merged = xarray.concat(pr_aggs.values(), dim=pd.Index(pr_cols, name='cols'))
pr_img = datashader.transfer_functions.shade(pr_merged.sum(dim='cols'), how='eq_hist')
return pr_img
p = bokeh.plotting.figure(x_range=r_range, y_range=pr_range,
plot_width=400, plot_height=300,
x_axis_label='r (A)', y_axis_label='P(r)')
InteractiveImage(p, pr_image)
Out[33]:
In [34]:
sans_img
Out[34]:
In [35]:
saxs_img
Out[35]:
In [36]:
pr_img
Out[36]:
In [23]:
n_samples, n_features = saxs_data.shape # for PCA, should be (n_samples, n_features)
print('# samples: {}\n# features: {}'.format(n_samples, n_features))
Each row in the NumPy array is a separate scattering pattern or pair-distance distribution curve. Each column will be a feature we explore.
In [24]:
print(saxs_data[:3, :5])
print(sans_data[:3, :5])
print(pr_data[:3, :5])
In [25]:
min_vals = saxs_data.min(axis=0)
max_vals = saxs_data.max(axis=0)
saxs_range = max_vals - min_vals
print(saxs_range)
In [26]:
min_vals = sans_data.min(axis=0)
max_vals = sans_data.max(axis=0)
sans_range = max_vals - min_vals
print(sans_range)
Notice how the variation depends significantly on Q.
In [27]:
min_vals = pr_data.min(axis=0)
max_vals = pr_data.max(axis=0)
pr_range = max_vals - min_vals
print(pr_range)
In [179]:
range_pq = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Data Range')
range_pr = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Data Range')
range_pq.line(q_saxs, saxs_range, legend='SAXS range', color=palette[0])
range_pq.line(q_sans, sans_range, legend='SANS range', color=palette[1])
range_pr.line(r, pr_range, legend='P(r) range', color=palette[3])
range_pr.legend.location = "bottom_center"
plots = bokeh.layouts.gridplot([[range_pq, range_pr]])
show(plots)
In [180]:
from sklearn.preprocessing import StandardScaler, RobustScaler
saxs_scaler = RobustScaler()
sans_scaler = RobustScaler()
pr_scaler = RobustScaler()
In [181]:
saxs_scaler.fit(saxs_data)
sans_scaler.fit(sans_data)
pr_scaler.fit(pr_data)
Out[181]:
In [3]:
import sklearn.preprocessing
In [4]:
scaler = sklearn.preprocessing.RobustScaler
In [182]:
scaled_saxs = saxs_scaler.transform(saxs_data)
scaled_sans = sans_scaler.transform(sans_data)
scaled_pr = pr_scaler.transform(pr_data)
print(scaled_saxs[:3, :5])
print(scaled_sans[:3, :5])
print(scaled_pr[:3, :5])
In [183]:
min_vals = scaled_saxs.min(axis=0)
max_vals = scaled_saxs.max(axis=0)
saxs_scaled_range = max_vals - min_vals
min_vals = scaled_sans.min(axis=0)
max_vals = scaled_sans.max(axis=0)
sans_scaled_range = max_vals - min_vals
min_vals = scaled_pr.min(axis=0)
max_vals = scaled_pr.max(axis=0)
pr_scaled_range = max_vals - min_vals
print(saxs_scaled_range)
print(sans_scaled_range)
print(pr_scaled_range)
Notice how rescaling the data using the RobustScaler has removed the $Q$ or $r$ dependence from the ranges.
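As a sanity check on what the scaler did, here is a minimal sketch assuming RobustScaler's default settings (center each feature on its median, scale by its interquartile range):
In [ ]:
# Manual version of RobustScaler's default transform: (x - median) / IQR, per feature.
median = np.median(saxs_data, axis=0)
iqr = np.percentile(saxs_data, 75, axis=0) - np.percentile(saxs_data, 25, axis=0)
iqr[iqr == 0] = 1  # sklearn falls back to a scale of 1 for constant features
print(np.allclose((saxs_data - median) / iqr, scaled_saxs))  # expected: True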
In [184]:
range_pq0 = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Range Before')
range_pr0 = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Comparison')
range_pq1 = bokeh.plotting.figure(x_axis_label='Q (1/A)', y_axis_label='P(Q) range',
width=400, height=300, title='Comparison')
range_pr1 = bokeh.plotting.figure(x_axis_label='r (A)', y_axis_label='P(r) range',
width=400, height=300, title='Range After')
range_pq0.line(q_saxs, saxs_range, legend='SAXS before', color=palette[0])
range_pq0.line(q_sans, sans_range, legend='SANS before', color=palette[1])
range_pq1.line(q_saxs, saxs_range, legend='SAXS before', color=palette[0])
range_pq1.line(q_sans, sans_range, legend='SANS before', color=palette[1])
range_pq1.line(q_saxs, saxs_scaled_range, legend='SAXS after', color=palette[0], line_dash= [4, 2])
range_pq1.line(q_sans, sans_scaled_range, legend='SANS after', color=palette[1], line_dash= [4, 2])
range_pr0.line(r, pr_range, legend='before', color=palette[3])
range_pr0.line(r, pr_scaled_range, legend='after', color=palette[3], line_dash= [4, 2])
range_pr1.line(r, pr_scaled_range, legend='after', color=palette[3], line_dash= [4, 2])
range_pr0.legend.location = "bottom_center"
range_pr1.legend.location = "top_left"
plots = bokeh.layouts.gridplot([[range_pq0, range_pr0],
[range_pq1, range_pr1]])
show(plots)
Note that for P(r), the last 5 points in almost every data set are zero, so the range did not change at all, as shown below:
In [48]:
pr_data[:, -5:]
Out[48]:
In [49]:
import matplotlib.pyplot as plt
%matplotlib inline
In [50]:
def compare_q1_q2(q1, q2):
plt.figure()
plt.plot(saxs_data[:,q1], saxs_data[:,q2], 'bo', markersize=5)
plt.title('original data')
plt.xlabel(r'$Q_{}$'.format(q1))
plt.ylabel(r'$Q_{}$'.format(q2))
plt.figure()
plt.plot(scaled_saxs[:,q1], scaled_saxs[:,q2], 'bo', markersize=5)
plt.xlabel(r'$Q_{}$'.format(q1))
plt.ylabel(r'$Q_{}$'.format(q2))
plt.title('scaled data')
In [51]:
compare_q1_q2(0, 8)
In [52]:
plt.figure()
plt.plot(saxs_data[:,0], saxs_data[:,15], 'bo', markersize=5)
plt.title('original data')
plt.xlabel(r'$Q_{}$'.format(0))
plt.ylabel(r'$Q_{%d}$'%15)
plt.figure()
plt.plot(scaled_saxs[:,0], scaled_saxs[:,17], 'bo', markersize=5)
plt.xlabel(r'$Q_{}$'.format(0))
plt.ylabel(r'$Q_{}$'.format(17))
plt.title('scaled data')
Out[52]:
In [53]:
scaled_saxs.shape
Out[53]:
In [54]:
i0 = 2
i_compare = 0
for i0 in range(18):
plt.figure()
plt.plot(scaled_saxs[:,i0], scaled_saxs[:, i_compare], 'bo')
plt.plot(scaled_saxs[112,i0], scaled_saxs[112, i_compare], 'rs')
plt.plot(scaled_saxs[113,i0], scaled_saxs[113, i_compare], 'gs')
    plt.xlabel(r'$Q_{{{}}}$'.format(i0))
    plt.ylabel(r'$Q_{{{}}}$'.format(i_compare))
In [185]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
We need to determine what distance, eps, to use when defining core points. To choose it we will use the $k$-dist plot: the sorted distance from each sample to its $k^{th}$ nearest neighbor.
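A minimal alternative sketch of the same idea uses scikit-learn's NearestNeighbors (the cells below instead build the distance matrices explicitly; k = 2 is assumed here to match the min_samples used later):
In [ ]:
from sklearn.neighbors import NearestNeighbors

k = 2  # assumed; should match the min_samples passed to DBSCAN
nn = NearestNeighbors(n_neighbors=k + 1).fit(scaled_saxs)  # +1 because each sample is its own nearest neighbor
nn_dist, _ = nn.kneighbors(scaled_saxs)
k_dist_curve = np.sort(nn_dist[:, -1])  # distance to the k-th neighbor, sorted for the elbow plot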
In [32]:
from scipy.spatial.distance import pdist, cdist
In [131]:
scaled_saxs
In [132]:
dist = np.zeros([n_samples, n_samples])
i = np.arange(n_samples)
dist[i[:, None] < i] = pdist(scaled_saxs) # get the distances, consider other distance metrics
i_upper = np.triu_indices(n_samples, 1)
i_lower = np.tril_indices(n_samples, -1)
dist[i_lower] = dist.T[i_lower] # make the matrix symmetric
np.fill_diagonal(dist, np.inf) # fill the diagonal with large value for simplicity
In [133]:
k_dist = dist.min(axis=1)
k_dist.sort()
In [134]:
k_dist[:10]
Out[134]:
In [135]:
raw_dist = np.zeros([n_samples, n_samples])
i = np.arange(n_samples)
raw_dist[i[:, None] < i] = pdist(saxs_data) # get the distances, consider other distance metrics
i_upper = np.triu_indices(n_samples, 1)
i_lower = np.tril_indices(n_samples, -1)
raw_dist[i_lower] = raw_dist.T[i_lower] # make the matrix symmetric
assert np.allclose(raw_dist.T, raw_dist)
np.fill_diagonal(raw_dist, np.inf) # fill the diagonal with large value for simplicity
In [136]:
raw_k_dist = raw_dist.min(axis=0)
raw_k_dist.sort()
In [137]:
raw_k_dist.shape
Out[137]:
In [138]:
raw_k_dist[:10]
Out[138]:
In [139]:
raw_dist[:, 4].min()
Out[139]:
In [140]:
raw_dist[:5, :5]
Out[140]:
In [141]:
raw_dist[4, :].min()
Out[141]:
In [142]:
raw_dist0 = np.zeros(n_samples)
ind = np.arange(n_samples)
for i in ind:
dist = cdist(saxs_data[i].reshape([1, -1]), saxs_data[ind!=i])
raw_dist0[i] = dist.min()
In [143]:
raw_dist0.sort()
raw_dist0[:10]
Out[143]:
In [144]:
cdist(saxs_data[4].reshape([1, -1]), saxs_data[ind!=4]).min()
Out[144]:
In [145]:
raw_k_dist-raw_dist0
Out[145]:
In [146]:
k_plot = bokeh.plotting.figure(y_axis_label='distance')
k_plot.line(ind, raw_k_dist, color=palette[0],
line_dash=[4,2], legend='pdist')
k_plot.line(ind, raw_dist0, color=palette[1],
line_dash=[8,4], legend='cdist')
k_plot.legend.location = "top_left"
bokeh.plotting.show(k_plot)
In [148]:
def smooth_diff_n5(x, y):
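    """Smoothed first derivative on a uniform grid (5-point stencil):
    dy_i = (2*(y[i+1] - y[i-1]) + (y[i+2] - y[i-2])) / (8*h).
    Returns an array of (x, dy/dx) pairs for the interior points x[2:-2]."""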
h_all = x[1:] - x[:-1]
h = h_all[0]
assert np.allclose(h_all, h), 'inconsistent dx'
dy = (2 * (y[3:-1] - y[1:-3]) + y[4:] - y[:-4]) / (8 * h)
dydx = np.vstack([x[2:-2], dy]).T
return dydx
In [149]:
def smooth_diff_n7(x, y):
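    """Smoothed first derivative on a uniform grid (7-point stencil):
    dy_i = (5*(y[i+1] - y[i-1]) + 4*(y[i+2] - y[i-2]) + (y[i+3] - y[i-3])) / (32*h).
    Returns an array of (x, dy/dx) pairs for the interior points x[3:-3]."""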
h_all = x[1:] - x[:-1]
h = h_all[0]
assert np.allclose(h_all, h), 'inconsistent dx'
dy = (5 * (y[4:-2] - y[2:-4]) + 4 * (y[5:-1] - y[1:-5]) + y[6:] - y[:-6]) / (32 * h)
dydx = np.vstack([x[3:-3], dy]).T
return dydx
In [153]:
ind
Out[153]:
In [165]:
raw_k_dist
Out[165]:
In [170]:
i2
Out[170]:
In [171]:
ind[::2]
Out[171]:
In [ ]:
# tip_to_tail =
In [175]:
k_plot = bokeh.plotting.figure(y_axis_label='distance')
# k_plot.line(i, k_dist/k_dist.max(), color=palette[0], legend='scaled')
k_plot.line(ind[::2], raw_k_dist[::2]/raw_k_dist.max(), color=palette[0],
line_dash=[4,2], legend='raw')
n = 1
i2 = ind[::2]
coeffs = np.polyfit(i2[:75], raw_k_dist[:150:2]/raw_k_dist.max(), n)  # polynomial coefficients for a degree-n fit
fit = np.poly1d(coeffs)
y = fit(ind[::2])
k_plot.line(i2, y, color=palette[1], legend='fit ({})'.format(n))
d1_norm = smooth_diff_n5(ind[::2], raw_k_dist[::2]/raw_k_dist.max())
# k_plot.line(d1_norm[:, 0], d1_norm[:, 1], color=palette[2], line_dash=[4,2], legend='d1')
d1_fit = np.empty([n_samples // 2 - 1, 2])
d1_fit[:, 0] = i2[:-1]
d1_fit[:, 1] = np.diff(y)/np.diff(i2)
k_plot.line(d1_fit[:, 0], d1_fit[:, 1], color=palette[3], legend='d1 fit')
d2 = smooth_diff_n5(d1_norm[:, 0], d1_norm[:, 1])
k_plot.line(d2[:, 0], d2[:, 1], color=palette[4], line_dash=[4,2], legend='d2')
d2_fit = np.empty([n_samples // 2 - 2, 2])
d2_fit[:, 0] = i2[1:-1]
d2_fit[:, 1] = np.diff(d1_fit[:, 1])/np.diff(d1_fit[:, 0])
k_plot.line(d2_fit[:, 0], d2_fit[:, 1], color=palette[5], legend='d2 fit')
d3 = smooth_diff_n5(d2[:, 0], d2[:, 1])
k_plot.line(d3[:, 0], d3[:, 1], color=palette[6], line_dash=[4,2], legend='d3')
d3_fit = smooth_diff_n5(d2_fit[:, 0], d2_fit[:, 1])
k_plot.line(d3_fit[:, 0], d3_fit[:, 1], color=palette[1], legend='d3 fit')
#k_plot.line(i[1:-1], d2_dist, color=palette[0], line_dash=[5,2], legend='scaled d2')
# k_plot.line(i[1:-1], d2_k_dist, color=palette[2], line_dash=[10,1], legend='scaled np d2')
# k_plot.line(i[1:], d_raw_k_dist, color=palette[3], line_dash=[10,1], legend='raw d')
# k_plot.line(i[1:-1], d2_raw_dist, color=palette[1], line_dash=[5,2], legend='raw d2')
k_plot.legend.location = "top_left"
bokeh.plotting.show(k_plot)
In [151]:
y
Out[151]:
In [231]:
x = np.arange(10)
In [238]:
xx = np.vstack([x,x]).T
In [249]:
d1_norm = smooth_diff_n5(ind, raw_k_dist/raw_k_dist.max())
In [253]:
d1_raw = smooth_diff_n5(ind, raw_k_dist)
In [273]:
x = np.linspace(0, 2*np.pi)
y = np.sin(x)
dydx = smooth_diff_n5(x, y)
d2ydx2 = smooth_diff_n5(dydx[:, 0], dydx[:, 1])
p = bokeh.plotting.figure()
p.line(x, y, legend='sin(x)', color=palette[0])
p.line(dydx[:, 0], dydx[:, 1], legend='cos(x)', color=palette[1])
p.line(d2ydx2[:, 0], d2ydx2[:, 1], legend='-sin(x)', color=palette[2])
bokeh.plotting.show(p)
In [303]:
# Compute DBSCAN
## Tune these parameters to adjust cluster size ##
distance = 1 # <--need to explore tuning this parameter
min_samples = 2
##################################################
x_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_saxs)
x_core_samples_mask = np.zeros_like(x_db.labels_, dtype=bool)
x_core_samples_mask[x_db.core_sample_indices_] = True
saxs_labels = x_db.labels_ + 1 # 0's are independent groups
x_clusters_ = len(set(saxs_labels)) - (1 if 0 in saxs_labels else 0)  # after the +1 shift, noise is labeled 0
n_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_sans)
n_core_samples_mask = np.zeros_like(n_db.labels_, dtype=bool)
n_core_samples_mask[n_db.core_sample_indices_] = True
sans_labels = n_db.labels_ + 1 # 0's are independent groups
n_clusters_ = len(set(sans_labels)) - (1 if 0 in sans_labels else 0)  # after the +1 shift, noise is labeled 0
p_db = DBSCAN(eps=distance, min_samples=min_samples).fit(scaled_pr)
p_core_samples_mask = np.zeros_like(p_db.labels_, dtype=bool)
p_core_samples_mask[p_db.core_sample_indices_] = True
pr_labels = p_db.labels_ + 1 # 0's are independent groups
p_clusters_ = len(set(pr_labels)) - (1 if 0 in pr_labels else 0)  # after the +1 shift, noise is labeled 0
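The sklearn.metrics module imported above is not otherwise used here; as a rough, hedged check of cluster quality one could compute a silhouette score on the scaled data (only meaningful when more than one cluster is found, and treating the noise label 0 as just another cluster):
In [ ]:
if 1 < x_clusters_ < n_samples:
    print('SAXS silhouette: {:.3f}'.format(metrics.silhouette_score(scaled_saxs, saxs_labels)))
if 1 < n_clusters_ < n_samples:
    print('SANS silhouette: {:.3f}'.format(metrics.silhouette_score(scaled_sans, sans_labels)))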
In [267]:
# x-ray clusters
x_unique = set(saxs_labels)
x_unique.remove(0)
print('cluster labels: {}'.format(x_unique))
print('unique clusters: {}'.format(len(x_unique) + list(saxs_labels).count(0)))
for c in set(saxs_labels):
    print('{}: {}'.format(c, list(saxs_labels).count(c)))
In [268]:
# neutron clusters
unique = set(sans_labels)
unique.remove(0)
total_clusters = len(unique) + list(sans_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(sans_labels):
print('{}: {}'.format(c, list(sans_labels).count(c)))
In [269]:
# pr clusters
unique = set(pr_labels)
unique.remove(0)
total_clusters = len(unique) + list(pr_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(pr_labels):
print('{}: {}'.format(c, list(pr_labels).count(c)))
In [ ]:
np.savetxt('saxs_clusters.txt', saxs_labels, fmt='%d')
np.savetxt('sans_clusters.txt', sans_labels, fmt='%d')
np.savetxt('pr_clusters.txt', pr_labels, fmt='%d')
In [ ]:
print(sans_labels)
print(sans_labels.shape)
slabels = np.array(sans_labels, dtype='str')
# print(slabels)
# print(slabels.shape)
In [ ]:
from matplotlib import offsetbox
i_compare = 0
mn = scaled_saxs.min(axis=0)
mx = scaled_saxs.max(axis=0)
# for i0 in range(1):
for i0 in range(18):
plt.figure()
# plot points to make the correct box size
plt.plot(mn[i0], mn[i_compare], 'w.')
plt.plot(mx[i0], mx[i_compare], 'w.')
for j in range(len(scaled_saxs)):
if slabels[j] != '0':
plt.text(scaled_saxs[j, i0], scaled_saxs[j, i_compare], slabels[j],
fontdict={'weight': 'bold', 'size': 15},
color='r') # plt.cm.Set1(labels[i]/10.0))
else:
plt.plot(scaled_saxs[j, i0], scaled_saxs[j, i_compare], 'k.',
markersize=5)
plt.xlabel(r'$Q_{}$'.format(i0))
plt.ylabel(r'$Q_{}$'.format(i_compare))
In [298]:
import hdbscan
In [326]:
# Compute HDBSCAN
## Tune these parameters to adjust cluster size ##
min_samples = 2
##################################################
x_db = hdbscan.HDBSCAN(min_cluster_size=min_samples).fit_predict(scaled_saxs)
In [323]:
x_db
Out[323]:
In [327]:
saxs_labels = x_db + 1 # 0's are independent groups
x_clusters_ = len(set(saxs_labels)) - (1 if 0 in saxs_labels else 0)  # after the +1 shift, noise is labeled 0
n_db = hdbscan.HDBSCAN(min_cluster_size=min_samples).fit_predict(scaled_sans)
sans_labels = n_db + 1 # 0's are independent groups
n_clusters_ = len(set(sans_labels)) - (1 if 0 in sans_labels else 0)  # after the +1 shift, noise is labeled 0
p_db = hdbscan.HDBSCAN(min_cluster_size=min_samples).fit_predict(scaled_pr)
pr_labels = p_db + 1 # 0's are independent groups
p_clusters_ = len(set(pr_labels)) - (1 if 0 in pr_labels else 0)  # after the +1 shift, noise is labeled 0
In [328]:
# x-ray clusters
x_unique = set(saxs_labels)
x_unique.remove(0)
print('cluster labels: {}'.format(x_unique))
print('unique clusters: {}'.format(len(x_unique) + list(saxs_labels).count(0)))
for c in set(saxs_labels):
print('{}: {}'.format(c, list(saxs_labels).count(c)))
In [329]:
# neutron clusters
unique = set(sans_labels)
unique.remove(0)
total_clusters = len(unique) + list(sans_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(sans_labels):
print('{}: {}'.format(c, list(sans_labels).count(c)))
In [330]:
# pr clusters
unique = set(pr_labels)
unique.remove(0)
total_clusters = len(unique) + list(pr_labels).count(0)
print('cluster labels: {}'.format(unique))
print('unique clusters: {}'.format(total_clusters))
for c in set(pr_labels):
print('{}: {}'.format(c, list(pr_labels).count(c)))
In [331]:
np.savetxt('saxs_clusters.txt', saxs_labels, fmt='%d')
np.savetxt('sans_clusters.txt', sans_labels, fmt='%d')
np.savetxt('pr_clusters.txt', pr_labels, fmt='%d')
In [332]:
print(sans_labels)
print(sans_labels.shape)
slabels = np.array(sans_labels, dtype='str')
# print(slabels)
# print(slabels.shape)
In [333]:
from matplotlib import offsetbox
i_compare = 0
mn = scaled_saxs.min(axis=0)
mx = scaled_saxs.max(axis=0)
# for i0 in range(1):
for i0 in range(18):
plt.figure()
# plot points to make the correct box size
plt.plot(mn[i0], mn[i_compare], 'w.')
plt.plot(mx[i0], mx[i_compare], 'w.')
for j in range(len(scaled_saxs)):
if slabels[j] != '0':
plt.text(scaled_saxs[j, i0], scaled_saxs[j, i_compare], slabels[j],
                 fontdict={'weight': 'bold', 'size': 15},
color='r') # plt.cm.Set1(labels[i]/10.0))
else:
plt.plot(scaled_saxs[j, i0], scaled_saxs[j, i_compare], 'k.',
markersize=5)
plt.xlabel(r'$Q_{}$'.format(i0))
plt.ylabel(r'$Q_{}$'.format(i_compare))
In [ ]:
import sasmol.sasmol as sasmol
dcd_fname = glob.glob('*.dcd')
assert len(dcd_fname) == 1, 'ERROR: unsure which dcd file to use: {}'.format(dcd_fname)
dcd_fname = dcd_fname[0]
pdb_fname = glob.glob('*.pdb')
assert len(pdb_fname) == 1, 'ERROR: unsure which pdb file to use: {}'.format(pdb_fname)
pdb_fname = pdb_fname[0]
In [ ]:
mol = sasmol.SasMol(0)
mol.read_pdb(pdb_fname)
In [ ]:
if not np.alltrue(sans_labels == saxs_labels):
print('WARNING: labels do not match\nusing neutron labels')
labels = sans_labels
In [ ]:
In [ ]:
dcd_fname
In [ ]:
# create a dcd for every cluster with >1 frame
dcd_fnames = []
cluster_out_files = [] # dcds for clusters
unique_out_fname = '{}_unique.dcd'.format(dcd_fname[:-4])
dcd_out_file = mol.open_dcd_write(unique_out_fname) # dcd file for unique structures
dcd_in_file = mol.open_dcd_read(dcd_fname)
for i in xrange(len(unique)):
dcd_fnames.append('{}_c{:02d}.dcd'.format(dcd_fname[:-4], i))
cluster_out_files.append(mol.open_dcd_write(dcd_fnames[i]))
visited_cluster = set()
dcd_out_frame = 0
cluster_out_frame = np.zeros(len(unique), dtype=int)
for (i, label) in enumerate(labels):
mol.read_dcd_step(dcd_in_file, i)
if label == 0:
dcd_out_frame += 1
mol.write_dcd_step(dcd_out_file, 0, dcd_out_frame)
else:
cluster_out_frame[label-1] += 1
# print('adding frame to cluster {}'.format(label-1))
# print(cluster_out_frame)
mol.write_dcd_step(cluster_out_files[label-1], 0, cluster_out_frame[label-1])
if label not in visited_cluster:
            visited_cluster.add(label)
dcd_out_frame += 1
mol.write_dcd_step(dcd_out_file, 0, dcd_out_frame)
for cluster_out_file in cluster_out_files:
mol.close_dcd_write(cluster_out_file)
mol.close_dcd_write(dcd_out_file)
mol.close_dcd_read(dcd_in_file[0])
In [ ]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
In [ ]:
pca_orig = PCA()
pca_orig.fit(saxs_data)
In [ ]:
pca_scaled = PCA()
pca_scaled.fit(scaled_saxs)
In [ ]:
print(pca_orig.explained_variance_ratio_)
print(pca_scaled.explained_variance_ratio_)
In [ ]:
plt.figure()
plt.plot(q_saxs, pca_orig.explained_variance_ratio_, 'o', label='unscaled')
plt.plot(q_saxs, pca_scaled.explained_variance_ratio_, 's', label='scaled')
plt.legend()
In [ ]:
In [ ]:
from sklearn.datasets.samples_generator import make_blobs
In [ ]:
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
random_state=0)
In [ ]:
X_scaled = StandardScaler().fit_transform(X)
In [ ]:
X_range = X.max(axis=0) - X.min(axis=0)
print(X_range)
In [ ]:
X_scaled_range = X_scaled.max(axis=0) - X_scaled.min(axis=0)
print(X_scaled_range)
In [ ]:
X_s2 = StandardScaler().fit_transform(X)
In [ ]:
X_s2_range = X_s2.max(axis=0) - X_s2.min(axis=0)
print(X_s2_range)
In [ ]:
In [ ]: