In [1]:
import os
import numpy as np
import scipy.stats as stats
from statsmodels.robust.scale import mad
from statsmodels.graphics.gofplots import qqplot
import pandas as pd
from matplotlib import pyplot
import flowio
%matplotlib inline
In [2]:
time_channel = "Time"
data_channel = "SSC-A"
data_dir = "/home/swhite/Projects/flowClean_testing/data"
diff_roll = 0.01
final_roll = 0.02
k = 10.0
k2 = 3.0
figure_out_dir = "flow_rate_qc_cubic_square"
fig_size = (16, 4)
In [3]:
def find_channel_index(channel_dict, pnn_text):
    # Return the zero-based event column index whose PnN label matches pnn_text.
    for k, v in channel_dict.items():
        if v['PnN'] == pnn_text:
            index = int(k) - 1
            return index
In [4]:
def calculate_flow_rate(events, time_index, roll):
    # Flow rate is the reciprocal of a rolling mean of the time deltas
    # between consecutive events.
    time_diff = np.diff(events[:, time_index])
    time_diff = np.insert(time_diff, 0, 0)
    time_diff_mean = pd.Series(time_diff).rolling(window=roll, min_periods=1).mean().values
    # Avoid division by zero where the rolling mean is still zero.
    min_diff = time_diff_mean[time_diff_mean > 0].min()
    time_diff_mean[time_diff_mean == 0] = min_diff
    flow_rate = 1 / time_diff_mean
    return flow_rate
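# Quick illustrative check of calculate_flow_rate on toy data (a single
# "Time" column with a constant gap of 2 time units between events): once
# the rolling window fills, the estimated rate should settle at 0.5 events
# per time unit.
toy_events = np.arange(0, 20, 2, dtype=float).reshape(-1, 1)
print(calculate_flow_rate(toy_events, 0, 3))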
In [5]:
def plot_flow_rate(
        file_name,
        flow_rate,
        event_idx,
        hline=None,
        trendline=None,
        x_lim=None,
        y_lim=None,
        save_figure=False
):
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Flow rate (events/ms)", fontsize=14)

    if x_lim is None:
        ax.set_xlim([0, max(event_idx)])
    else:
        ax.set_xlim([x_lim[0], x_lim[1]])
    if y_lim is not None:
        ax.set_ylim([y_lim[0], y_lim[1]])

    ax.plot(
        event_idx,
        flow_rate,
        c='darkslateblue'
    )

    if hline is not None:
        ax.axhline(hline, linestyle='-', linewidth=1, c='coral')
    if trendline is not None:
        ax.plot(event_idx, trendline, c='cornflowerblue')

    fig.tight_layout()

    if save_figure:
        fig_name = "".join([file_name, '_a_', 'flow_rate.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [6]:
def plot_deviation(
        file_name,
        flow_rate,
        event_indices,
        diff,
        stable_diff,
        smooth_stable_diff,
        threshold,
        save_figure=False
):
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlim([0, len(flow_rate)])
    #pyplot.ylim([0, 5])
    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Deviation (log)", fontsize=14)

    ax.plot(
        event_indices,
        np.log10(1 + diff),
        c='coral',
        alpha=0.6,
        linewidth=1
    )
    ax.plot(
        event_indices,
        np.log10(1 + stable_diff),
        c='cornflowerblue',
        alpha=0.6,
        linewidth=1
    )
    ax.plot(
        event_indices,
        np.log10(1 + smooth_stable_diff),
        c='darkslateblue',
        alpha=1.0,
        linewidth=1
    )
    ax.axhline(np.log10(1 + threshold), linestyle='dashed', linewidth=2, c='crimson')

    fig.tight_layout()

    if save_figure:
        fig_name = "".join([file_name, '_b_', 'deviation.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [7]:
def get_false_bounds(bool_array):
    # Return the start (inclusive) and end (exclusive) indices of each
    # contiguous run of False values in bool_array.
    diff = np.diff(np.hstack((0, ~bool_array, 0)))
    start = np.where(diff == 1)
    end = np.where(diff == -1)
    return start[0], end[0]
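# Small sanity check on a toy mask: the False runs start at indices 1 and 4,
# and the returned end indices are exclusive (3 and 5).
demo_mask = np.array([True, False, False, True, False])
print(get_false_bounds(demo_mask))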
In [8]:
def plot_channel_good_vs_bad(
        file_name,
        channel_data,
        time_data,
        channel_name,
        good_event_map,
        bi_ex=True,
        save_figure=False,
        drop_negative=False
):
    pre_scale = 0.003

    if bi_ex:
        channel_data = np.arcsinh(channel_data * pre_scale)

    # The flagged (bad) regions are the contiguous runs of False in good_event_map.
    starts, ends = get_false_bounds(good_event_map)

    good_cmap = pyplot.cm.get_cmap('jet')
    good_cmap.set_under('w', alpha=0)

    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylabel(channel_name, fontsize=14)

    bins_good = int(np.sqrt(channel_data.shape[0]))

    ax.hist2d(
        time_data,
        channel_data,
        bins=bins_good,
        cmap=good_cmap,
        vmin=0.9
    )

    # Shade each region of filtered (bad) events.
    for i, s in enumerate(starts):
        ax.axvspan(time_data[s], time_data[ends[i] - 1], facecolor='pink', alpha=0.3, edgecolor='deeppink')

    fig.tight_layout()

    if drop_negative:
        ax.set_ylim(bottom=0)

    if save_figure:
        fig_name = "".join([file_name, '_c_', 'filtered_events.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [9]:
def clean_stage1(fcs_file):
    # Stage 1: flag events whose flow rate deviates from the median flow rate
    # by more than a k * MAD (robust) threshold.
    fd = flowio.FlowData("/".join([data_dir, fcs_file]))
    events = np.reshape(fd.events, (-1, fd.channel_count))
    time_index = find_channel_index(fd.channels, time_channel)

    diff_roll_count = int(diff_roll * events.shape[0])
    flow_rate = calculate_flow_rate(events, time_index, diff_roll_count)

    median = np.median(flow_rate)
    median_diff = np.abs(flow_rate - median)
    threshold = k * mad(median_diff)
    #threshold = 1.0 * np.std(median_diff)
    initial_good_events = median_diff < threshold

    return flow_rate, median, initial_good_events, median_diff
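# Note on the threshold above: statsmodels' mad() rescales the raw median
# absolute deviation by ~1/0.6745, so for roughly normal data it is comparable
# to a standard deviation; k * mad(...) therefore acts as a robust "k sigma"
# cutoff. Rough check on synthetic data:
z = np.random.normal(0, 2.0, 100000)
print(mad(z), z.std())  # both should be close to 2.0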
In [10]:
def clean_stage2(flow_rate, event_indices, initial_good_events, fit_method='lin'):
    # Stage 2: fit a trend to the flow rate using only the events kept by
    # stage 1, then evaluate that fit at every event index.
    good_event_indices = event_indices[initial_good_events]

    if fit_method == 'lin':
        line_regress = stats.linregress(good_event_indices, flow_rate[initial_good_events])
        fit = (line_regress.slope * event_indices) + line_regress.intercept
    elif fit_method == 'quad':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 2)
        p = np.poly1d(poly)
        fit = p(event_indices)
    elif fit_method == 'cubic':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 3)
        p = np.poly1d(poly)
        fit = p(event_indices)
    else:
        return None

    return fit
In [11]:
def clean_stage3(stable_diff):
    # Stage 3: smooth the deviation from the fitted trend with a centered
    # rolling mean, then keep events below a k2 * MAD threshold.
    final_threshold = k2 * mad(stable_diff)
    final_w = int(final_roll * stable_diff.shape[0])
    smoothed_diff = pd.Series(stable_diff).rolling(window=final_w, min_periods=1, center=True).mean().values
    final_good_events = smoothed_diff < final_threshold
    return final_good_events, smoothed_diff, final_threshold
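# Minor illustration of the centered rolling mean used above: with center=True
# the smoothing window is aligned on each event rather than trailing it, so an
# isolated spike is spread symmetrically.
print(pd.Series([0., 0., 10., 0., 0.]).rolling(window=3, min_periods=1, center=True).mean().values)
# -> [0.         3.33333333 3.33333333 3.33333333 0.        ]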
In [12]:
def clean(fcs_file, fit_method='lin'):
    # Stage 1: median / MAD screen on the flow rate
    flow_rate, median, good_events_stage1, median_diff = clean_stage1(fcs_file)
    event_indices = np.arange(0, len(flow_rate))

    # Stage 2: trend fit on the events kept by stage 1
    flow_rate_fit = clean_stage2(flow_rate, event_indices, good_events_stage1, fit_method=fit_method)

    plot_flow_rate(
        fcs_file,
        flow_rate,
        event_indices,
        hline=median,
        trendline=flow_rate_fit
    )

    # Stage 3: smoothed deviation from the trend, thresholded at k2 * MAD
    stable_diff = np.abs(flow_rate - flow_rate_fit)
    final_good_events, smoothed_diff, final_threshold = clean_stage3(stable_diff)

    plot_deviation(
        fcs_file,
        flow_rate,
        event_indices,
        median_diff,
        stable_diff,
        smoothed_diff,
        final_threshold
    )

    # Re-read the file to plot the data channel vs time, highlighting the
    # regions of filtered events.
    fd2 = flowio.FlowData("/".join([data_dir, fcs_file]))
    events2 = np.reshape(fd2.events, (-1, fd2.channel_count))
    time_index = find_channel_index(fd2.channels, time_channel)
    data_channel_index = find_channel_index(fd2.channels, data_channel)

    plot_channel_good_vs_bad(
        fd2.name,
        events2[:, data_channel_index],
        events2[:, time_index],
        data_channel,
        final_good_events,
        drop_negative=True
    )
In [13]:
fcs_files = os.listdir(data_dir)
fcs_files = sorted(fcs_files)
In [14]:
weird_ones = [7, 10, 15, 24, 26, 27]

for weirdo in weird_ones:
    clean(fcs_files[weirdo], fit_method='lin')
    clean(fcs_files[weirdo], fit_method='quad')
    clean(fcs_files[weirdo], fit_method='cubic')
    #np.savetxt(".".join([fcs_files[weirdo], 'csv']), flow_rate, delimiter=",")
In [ ]: