In [1]:
import os
import numpy as np
import scipy.stats as stats
from statsmodels.robust.scale import mad
from statsmodels.graphics.gofplots import qqplot
from statsmodels.distributions.empirical_distribution import ECDF
import pandas as pd
from matplotlib import pyplot
import flowio
%matplotlib inline
In [34]:
time_channel = "Time"
data_channel = "Blue B-A"
data_dir = "/home/swhite/Projects/flowClean_testing/data"
diff_roll = 0.01
final_roll = 0.02
k = 1.0
k2 = 1.0
figure_out_dir = "flow_rate_qc_ks"
fig_size = (16, 4)
In [35]:
def plot_channel(fcs_file, x_channel_index, y_channel_index, x_biex=True, y_biex=True, fig_size=(16, 4)):
fd = flowio.FlowData("/".join([data_dir, fcs_file]))
events = np.reshape(fd.events, (-1, fd.channel_count))
x_label = find_channel_label(fd.channels, x_channel_index)
y_label = find_channel_label(fd.channels, y_channel_index)
pre_scale = 0.003
if x_biex:
x = np.arcsinh(events[:, x_channel_index] * pre_scale)
else:
x = events[:, x_channel_index]
if y_biex:
y = np.arcsinh(events[:, y_channel_index] * pre_scale)
else:
y = events[:, y_channel_index]
my_cmap = pyplot.cm.get_cmap('jet')
my_cmap.set_under('w', alpha=0)
bins = int(np.sqrt(x.shape[0]))
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(fd.name, fontsize=16)
ax.set_xlabel(x_label, fontsize=14)
ax.set_ylabel(y_label, fontsize=14)
ax.hist2d(
x,
y,
bins=[bins, bins],
cmap=my_cmap,
vmin=0.9
)
fig.tight_layout()
pyplot.show()
In [36]:
def find_channel_index(channel_dict, pnn_text):
for k, v in channel_dict.iteritems():
if v['PnN'] == pnn_text:
index = int(k) - 1
return index
In [37]:
def find_channel_label(channel_dict, channel_index):
label = ''
for k, v in channel_dict.iteritems():
if k == str(channel_index + 1):
if v.has_key('PnS'):
label = " ".join([v['PnN'], v['PnS']])
else:
label = v['PnN']
return label
In [38]:
def calculate_flow_rate(events, time_index, roll):
time_diff = np.diff(events[:, time_index])
time_diff = np.insert(time_diff, 0, 0)
time_diff_mean = pd.rolling_mean(time_diff, roll, min_periods=1)
min_diff = time_diff_mean[time_diff_mean > 0].min()
time_diff_mean[time_diff_mean == 0] = min_diff
flow_rate = 1 / time_diff_mean
return flow_rate
In [39]:
def plot_flow_rate(
file_name,
flow_rate,
event_idx,
hline=None,
trendline=None,
x_lim=None,
y_lim=None,
save_figure=False
):
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(file_name, fontsize=16)
ax.set_xlabel("Event", fontsize=14)
ax.set_ylabel("Flow rate (events/ms)", fontsize=14)
if x_lim is None:
ax.set_xlim([0, max(event_idx)])
else:
ax.set_xlim([x_lim[0], x_lim[1]])
if y_lim is not None:
ax.set_ylim([y_lim[0], y_lim[1]])
ax.plot(
event_idx,
flow_rate,
c='darkslateblue'
)
if hline is not None:
ax.axhline(hline, linestyle='-', linewidth=1, c='coral')
if trendline is not None:
ax.plot(event_idx, trendline, c='cornflowerblue')
fig.tight_layout()
if save_figure:
fig_name = "".join([file_name, '_a_', 'flow_rate.png'])
fig_path = "/".join([figure_out_dir, fig_name])
pyplot.savefig(fig_path)
pyplot.show()
In [40]:
def plot_deviation(
file_name,
flow_rate,
event_indices,
diff,
stable_diff,
smooth_stable_diff,
threshold,
save_figure=False
):
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(file_name, fontsize=16)
ax.set_xlim([0, len(flow_rate)])
#pyplot.ylim([0, 5])
ax.set_xlabel("Event", fontsize=14)
ax.set_ylabel("Deviation (log)", fontsize=14)
ax.plot(
event_indices,
np.log10(1 + diff),
c='coral',
alpha=0.6,
linewidth=1
)
ax.plot(
event_indices,
np.log10(1 + stable_diff),
c='cornflowerblue',
alpha=0.6,
linewidth=1
)
ax.plot(
event_indices,
np.log10(1 + smooth_stable_diff),
c='darkslateblue',
alpha=1.0,
linewidth=1
)
ax.axhline(np.log10(1 + threshold), linestyle='dashed', linewidth=2, c='crimson')
fig.tight_layout()
if save_figure:
fig_name = "".join([file_name, '_b_', 'deviation.png'])
fig_path = "/".join([figure_out_dir, fig_name])
pyplot.savefig(fig_path)
pyplot.show()
In [41]:
def get_false_bounds(bool_array):
diff = np.diff(np.hstack((0, ~bool_array, 0)))
start = np.where(diff==1)
end = np.where(diff==-1)
return start[0], end[0]
In [42]:
def plot_channel_good_vs_bad(
file_name,
channel_data,
time_data,
channel_name,
good_event_map,
bi_ex=True,
save_figure=False,
drop_negative=False
):
pre_scale = 0.003
if bi_ex:
channel_data = np.arcsinh(channel_data * pre_scale)
starts, ends = get_false_bounds(good_event_map)
good_cmap = pyplot.cm.get_cmap('jet')
good_cmap.set_under('w', alpha=0)
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(file_name, fontsize=16)
ax.set_xlabel('Time', fontsize=14)
ax.set_ylabel(channel_name, fontsize=14)
bins_good = int(np.sqrt(channel_data.shape[0]))
ax.hist2d(
time_data,
channel_data,
bins=bins_good,
cmap=good_cmap,
vmin=0.9
)
for i, s in enumerate(starts):
ax.axvspan(time_data[s], time_data[ends[i] - 1], facecolor='pink', alpha=0.3, edgecolor='deeppink')
fig.tight_layout()
if drop_negative:
ax.set_ylim(ymin=0)
if save_figure:
fig_name = "".join([file_name, '_c_', 'filtered_events.png'])
fig_path = "/".join([figure_out_dir, fig_name])
pyplot.savefig(fig_path)
pyplot.show()
In [43]:
def clean_stage1(fcs_file):
fd = flowio.FlowData("/".join([data_dir, fcs_file]))
events = np.reshape(fd.events, (-1, fd.channel_count))
time_index = find_channel_index(fd.channels, time_channel)
diff_roll_count = int(diff_roll * events.shape[0])
flow_rate = calculate_flow_rate(events, time_index, diff_roll_count)
median = np.median(flow_rate)
median_diff = np.abs(flow_rate - median)
threshold = k * mad(median_diff)
#threshold = 1.0 * np.std(median_diff)
initial_good_events = median_diff < threshold
return flow_rate, median, initial_good_events, median_diff
In [44]:
def clean_stage2(flow_rate, event_indices, initial_good_events, fit_method='lin'):
good_event_indices = event_indices[initial_good_events]
if fit_method == 'lin':
line_regress = stats.linregress(good_event_indices, flow_rate[initial_good_events])
fit = (line_regress.slope * event_indices) + line_regress.intercept
elif fit_method == 'quad':
poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 2)
p = np.poly1d(poly)
fit = p(event_indices)
elif fit_method == 'cubic':
poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 3)
p = np.poly1d(poly)
fit = p(event_indices)
else:
return
return fit
In [45]:
def clean_stage3(stable_diff):
final_threshold = k2 * mad(stable_diff)
final_w = int(final_roll * stable_diff.shape[0])
smoothed_diff = pd.rolling_mean(stable_diff, window=final_w, min_periods=1, center=True)
final_good_events = smoothed_diff < final_threshold
return final_good_events, smoothed_diff, final_threshold
In [46]:
def clean(fcs_file, fit_method='lin', roll=5000):
flow_rate, median, good_events_stage1, median_diff = clean_stage1(fcs_file)
event_indices = np.arange(0, len(flow_rate))
flow_rate_fit = clean_stage2(flow_rate, event_indices, good_events_stage1, fit_method=fit_method)
plot_flow_rate(
fcs_file,
flow_rate,
event_indices,
hline=median,
trendline=flow_rate_fit
)
stable_diff = np.abs(flow_rate - flow_rate_fit)
final_good_events, smoothed_diff, final_threshold = clean_stage3(stable_diff)
plot_deviation(
fcs_file,
flow_rate,
event_indices,
median_diff,
stable_diff,
smoothed_diff,
final_threshold
)
fd2 = flowio.FlowData("/".join([data_dir, fcs_file]))
events2 = np.reshape(fd2.events, (-1, fd2.channel_count))
time_index = find_channel_index(fd2.channels, time_channel)
data_channel_index = find_channel_index(fd2.channels, data_channel)
plot_channel_good_vs_bad(
fd2.name,
events2[:, data_channel_index],
events2[:, time_index],
data_channel,
final_good_events,
drop_negative=False
)
channels = sorted([int(channel) - 1 for channel in fd2.channels])
for channel in channels[:-1]:
good_chan_events = np.arcsinh(events2[final_good_events, channel] * 0.003)
channel_name = fd2.channels[str(channel + 1)]['PnN']
cdf = ECDF(good_chan_events)
# std_chan = pd.rolling_apply(
# np.arcsinh(events2[:, channel] * 0.003),
# 10000,
# #lambda x: my_kstest(x, cdf),
# np.std,
# min_periods=1,
# center=True
# )
# mean_chan = pd.rolling_apply(
# np.arcsinh(events2[:, channel] * 0.003),
# 10000,
# #lambda x: my_kstest(x, cdf),
# np.mean,
# min_periods=1,
# center=True
# )
std_chan = pd.rolling_std(
np.arcsinh(events2[:, channel] * 0.003),
roll,
min_periods=1,
center=True
)
mean_chan = pd.rolling_mean(
np.arcsinh(events2[:, channel] * 0.003),
roll,
min_periods=1,
center=True
)
kurt_chan = pd.rolling_kurt(
np.arcsinh(events2[:, channel] * 0.003),
roll,
min_periods=1,
center=True
)
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(" - ".join([str(channel + 1), channel_name, "StdDev"]), fontsize=16)
ax.set_xlim([0, len(flow_rate)])
pyplot.scatter(np.arange(0, len(flow_rate)), std_chan, s=1, edgecolors='none')
fig.tight_layout()
pyplot.show()
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(" - ".join([str(channel + 1), channel_name, "Mean"]), fontsize=16)
ax.set_xlim([0, len(flow_rate)])
pyplot.scatter(np.arange(0, len(flow_rate)), mean_chan, s=1, edgecolors='none')
fig.tight_layout()
pyplot.show()
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(" - ".join([str(channel + 1), channel_name, "Kurt"]), fontsize=16)
ax.set_xlim([0, len(flow_rate)])
pyplot.scatter(np.arange(0, len(flow_rate)), kurt_chan, s=1, edgecolors='none')
fig.tight_layout()
pyplot.show()
fig = pyplot.figure(figsize=fig_size)
ax = fig.add_subplot(1, 1, 1)
ax.set_title(" - ".join([str(channel + 1), channel_name, "StdDev / Mean"]), fontsize=16)
ax.set_xlim([0, len(flow_rate)])
pyplot.scatter(np.arange(0, len(flow_rate)), std_chan/mean_chan, s=1, edgecolors='none')
fig.tight_layout()
pyplot.show()
In [47]:
def my_kstest(data, cdf):
ks = stats.kstest(data, cdf)
return ks.pvalue
In [48]:
fcs_files = os.listdir(data_dir)
fcs_files = sorted(fcs_files)
In [49]:
plot_channel(fcs_files[7], x_channel_index=14, y_channel_index=11, x_biex=False, y_biex=True, fig_size=(16, 12))
In [50]:
weird_ones = [7] #10, 15, 24, 26, 27]
for weirdo in weird_ones:
clean(fcs_files[weirdo], fit_method='lin', roll=2000)
In [ ]: