notebook.community

Edit and run



In [1]:

    
import os
import numpy as np
import scipy.stats as stats
from statsmodels.robust.scale import mad
from statsmodels.graphics.gofplots import qqplot
from statsmodels.distributions.empirical_distribution import ECDF
import pandas as pd
from matplotlib import pyplot

import flowio

%matplotlib inline



In [34]:

    
time_channel = "Time"
data_channel = "Blue B-A"
data_dir = "/home/swhite/Projects/flowClean_testing/data"
diff_roll = 0.01
final_roll = 0.02
k = 1.0
k2 = 1.0

figure_out_dir = "flow_rate_qc_ks"
fig_size = (16, 4)



In [35]:

    
def plot_channel(fcs_file, x_channel_index, y_channel_index, x_biex=True, y_biex=True, fig_size=(16, 4)):
    fd = flowio.FlowData("/".join([data_dir, fcs_file]))
    events = np.reshape(fd.events, (-1, fd.channel_count))
    
    x_label = find_channel_label(fd.channels, x_channel_index)
    y_label = find_channel_label(fd.channels, y_channel_index)
    
    pre_scale = 0.003
    
    if x_biex:
        x = np.arcsinh(events[:, x_channel_index] * pre_scale)
    else:
        x = events[:, x_channel_index]
    
    if y_biex:
        y = np.arcsinh(events[:, y_channel_index] * pre_scale)
    else:
        y = events[:, y_channel_index]
    
    my_cmap = pyplot.cm.get_cmap('jet')
    my_cmap.set_under('w', alpha=0)
    
    bins = int(np.sqrt(x.shape[0]))
    
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)
    
    ax.set_title(fd.name, fontsize=16)
    
    ax.set_xlabel(x_label, fontsize=14)
    ax.set_ylabel(y_label, fontsize=14)
    
    ax.hist2d(
        x,
        y,
        bins=[bins, bins],
        cmap=my_cmap,
        vmin=0.9
    )
    
    fig.tight_layout()
    
    pyplot.show()



In [36]:

    
def find_channel_index(channel_dict, pnn_text):
    for k, v in channel_dict.iteritems():
        if v['PnN'] == pnn_text:
            index = int(k) - 1
    
    return index



In [37]:

    
def find_channel_label(channel_dict, channel_index):
    label = ''
    
    for k, v in channel_dict.iteritems():
        if k == str(channel_index + 1):
            if v.has_key('PnS'):
                label = " ".join([v['PnN'], v['PnS']])
            else:
                label = v['PnN']
    
    return label



In [38]:

    
def calculate_flow_rate(events, time_index, roll):
    time_diff = np.diff(events[:, time_index])
    time_diff = np.insert(time_diff, 0, 0)
    
    time_diff_mean = pd.rolling_mean(time_diff, roll, min_periods=1)
    min_diff = time_diff_mean[time_diff_mean > 0].min()
    time_diff_mean[time_diff_mean == 0] = min_diff
    
    flow_rate = 1 / time_diff_mean
    
    return flow_rate



In [39]:

    
def plot_flow_rate(
        file_name, 
        flow_rate, 
        event_idx, 
        hline=None, 
        trendline=None, 
        x_lim=None, 
        y_lim=None,
        save_figure=False
    ):
    
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)
    
    ax.set_title(file_name, fontsize=16)
    
    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Flow rate (events/ms)", fontsize=14)
    
    if x_lim is None:
        ax.set_xlim([0, max(event_idx)])
    else:
        ax.set_xlim([x_lim[0], x_lim[1]])
    
    if y_lim is not None:
        ax.set_ylim([y_lim[0], y_lim[1]])
    
    ax.plot(
        event_idx,
        flow_rate,
        c='darkslateblue'
    )
    
    if hline is not None:
        ax.axhline(hline, linestyle='-', linewidth=1, c='coral')
    
    if trendline is not None:
        ax.plot(event_idx, trendline, c='cornflowerblue')
    
    fig.tight_layout()
        
    if save_figure:
        fig_name = "".join([file_name, '_a_', 'flow_rate.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()



In [40]:

    
def plot_deviation(
        file_name, 
        flow_rate, 
        event_indices, 
        diff, 
        stable_diff, 
        smooth_stable_diff, 
        threshold,
        save_figure=False
    ):
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)
    
    ax.set_title(file_name, fontsize=16)

    ax.set_xlim([0, len(flow_rate)])
    #pyplot.ylim([0, 5])

    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Deviation (log)", fontsize=14)

    ax.plot(
        event_indices,
        np.log10(1 + diff),
        c='coral',
        alpha=0.6,
        linewidth=1
    )

    ax.plot(
        event_indices,
        np.log10(1 + stable_diff),
        c='cornflowerblue',
        alpha=0.6,
        linewidth=1
    )

    ax.plot(
        event_indices,
        np.log10(1 + smooth_stable_diff),
        c='darkslateblue',
        alpha=1.0,
        linewidth=1
    )

    ax.axhline(np.log10(1 + threshold), linestyle='dashed', linewidth=2, c='crimson')
    
    fig.tight_layout()
        
    if save_figure:
        fig_name = "".join([file_name, '_b_', 'deviation.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()



In [41]:

    
def get_false_bounds(bool_array):
    diff = np.diff(np.hstack((0, ~bool_array, 0)))
    
    start = np.where(diff==1)
    end = np.where(diff==-1)
    
    return start[0], end[0]



In [42]:

    
def plot_channel_good_vs_bad(
        file_name,
        channel_data,
        time_data,
        channel_name,
        good_event_map,
        bi_ex=True,
        save_figure=False,
        drop_negative=False
    ):

    pre_scale = 0.003
    
    if bi_ex:
        channel_data = np.arcsinh(channel_data * pre_scale)
    
    starts, ends = get_false_bounds(good_event_map)

    good_cmap = pyplot.cm.get_cmap('jet')
    good_cmap.set_under('w', alpha=0)

    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylabel(channel_name, fontsize=14)

    bins_good = int(np.sqrt(channel_data.shape[0]))

    ax.hist2d(
        time_data,
        channel_data,
        bins=bins_good,
        cmap=good_cmap,
        vmin=0.9
    )

    for i, s in enumerate(starts):
        ax.axvspan(time_data[s], time_data[ends[i] - 1], facecolor='pink', alpha=0.3, edgecolor='deeppink')

    fig.tight_layout()

    if drop_negative:
        ax.set_ylim(ymin=0)
        
    if save_figure:
        fig_name = "".join([file_name, '_c_', 'filtered_events.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()



In [43]:

    
def clean_stage1(fcs_file):
    fd = flowio.FlowData("/".join([data_dir, fcs_file]))
    events = np.reshape(fd.events, (-1, fd.channel_count))
    
    time_index = find_channel_index(fd.channels, time_channel)
    
    diff_roll_count = int(diff_roll * events.shape[0])
    flow_rate = calculate_flow_rate(events, time_index, diff_roll_count)
    
    median = np.median(flow_rate)
    median_diff = np.abs(flow_rate - median)
    
    threshold = k * mad(median_diff)
    #threshold = 1.0 * np.std(median_diff)
        
    initial_good_events = median_diff < threshold
    
    return flow_rate, median, initial_good_events, median_diff



In [44]:

    
def clean_stage2(flow_rate, event_indices, initial_good_events, fit_method='lin'):
    good_event_indices = event_indices[initial_good_events]
    
    if fit_method == 'lin':
        line_regress = stats.linregress(good_event_indices, flow_rate[initial_good_events])
        fit = (line_regress.slope * event_indices) + line_regress.intercept
    elif fit_method == 'quad':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 2)
        p = np.poly1d(poly)
        fit = p(event_indices)
    elif fit_method == 'cubic':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 3)
        p = np.poly1d(poly)
        fit = p(event_indices)
    else:
        return
    
    return fit



In [45]:

    
def clean_stage3(stable_diff):
    final_threshold = k2 * mad(stable_diff)
    
    final_w = int(final_roll * stable_diff.shape[0])
    smoothed_diff = pd.rolling_mean(stable_diff, window=final_w, min_periods=1, center=True)
    final_good_events = smoothed_diff < final_threshold
    
    return final_good_events, smoothed_diff, final_threshold



In [46]:

    
def clean(fcs_file, fit_method='lin', roll=5000):
    flow_rate, median, good_events_stage1, median_diff = clean_stage1(fcs_file)

    event_indices = np.arange(0, len(flow_rate))
    
    flow_rate_fit = clean_stage2(flow_rate, event_indices, good_events_stage1, fit_method=fit_method)
    
    plot_flow_rate(
        fcs_file, 
        flow_rate, 
        event_indices, 
        hline=median, 
        trendline=flow_rate_fit
    )
    
    stable_diff = np.abs(flow_rate - flow_rate_fit)
    
    final_good_events, smoothed_diff, final_threshold = clean_stage3(stable_diff)
    
    plot_deviation(
        fcs_file, 
        flow_rate, 
        event_indices, 
        median_diff, 
        stable_diff, 
        smoothed_diff, 
        final_threshold
    )
    
    fd2 = flowio.FlowData("/".join([data_dir, fcs_file]))
    events2 = np.reshape(fd2.events, (-1, fd2.channel_count))

    time_index = find_channel_index(fd2.channels, time_channel)
    data_channel_index = find_channel_index(fd2.channels, data_channel)
    
    plot_channel_good_vs_bad(
        fd2.name, 
        events2[:, data_channel_index], 
        events2[:, time_index], 
        data_channel, 
        final_good_events,
        drop_negative=False
    )
    
    channels = sorted([int(channel) - 1 for channel in fd2.channels])
    
    for channel in channels[:-1]:
        good_chan_events = np.arcsinh(events2[final_good_events, channel] * 0.003)
        
        channel_name = fd2.channels[str(channel + 1)]['PnN']
        
        cdf = ECDF(good_chan_events)
                
#         std_chan = pd.rolling_apply(
#             np.arcsinh(events2[:, channel] * 0.003), 
#             10000, 
#             #lambda x: my_kstest(x, cdf),
#             np.std,
#             min_periods=1,
#             center=True
#         )
        
#         mean_chan = pd.rolling_apply(
#             np.arcsinh(events2[:, channel] * 0.003), 
#             10000, 
#             #lambda x: my_kstest(x, cdf),
#             np.mean,
#             min_periods=1,
#             center=True
#         )
        
        std_chan = pd.rolling_std(
            np.arcsinh(events2[:, channel] * 0.003), 
            roll, 
            min_periods=1,
            center=True
        )
        
        mean_chan = pd.rolling_mean(
            np.arcsinh(events2[:, channel] * 0.003), 
            roll, 
            min_periods=1,
            center=True
        )
        
        kurt_chan = pd.rolling_kurt(
            np.arcsinh(events2[:, channel] * 0.003), 
            roll, 
            min_periods=1,
            center=True
        )
        
        fig = pyplot.figure(figsize=fig_size)
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(" - ".join([str(channel + 1), channel_name, "StdDev"]), fontsize=16)
        ax.set_xlim([0, len(flow_rate)])
        pyplot.scatter(np.arange(0, len(flow_rate)), std_chan, s=1, edgecolors='none')
        fig.tight_layout()
        pyplot.show()
        
        fig = pyplot.figure(figsize=fig_size)
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(" - ".join([str(channel + 1), channel_name, "Mean"]), fontsize=16)
        ax.set_xlim([0, len(flow_rate)])
        pyplot.scatter(np.arange(0, len(flow_rate)), mean_chan, s=1, edgecolors='none')
        fig.tight_layout()
        pyplot.show()
        
        fig = pyplot.figure(figsize=fig_size)
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(" - ".join([str(channel + 1), channel_name, "Kurt"]), fontsize=16)
        ax.set_xlim([0, len(flow_rate)])
        pyplot.scatter(np.arange(0, len(flow_rate)), kurt_chan, s=1, edgecolors='none')
        fig.tight_layout()
        pyplot.show()
        
        fig = pyplot.figure(figsize=fig_size)
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(" - ".join([str(channel + 1), channel_name, "StdDev / Mean"]), fontsize=16)
        ax.set_xlim([0, len(flow_rate)])
        pyplot.scatter(np.arange(0, len(flow_rate)), std_chan/mean_chan, s=1, edgecolors='none')
        fig.tight_layout()
        pyplot.show()



In [47]:

    
def my_kstest(data, cdf):
    ks = stats.kstest(data, cdf)
    
    return ks.pvalue



In [48]:

    
fcs_files = os.listdir(data_dir)
fcs_files = sorted(fcs_files)



In [49]:

    
plot_channel(fcs_files[7], x_channel_index=14, y_channel_index=11, x_biex=False, y_biex=True, fig_size=(16, 12))



In [50]:

    
weird_ones = [7] #10, 15, 24, 26, 27]

for weirdo in weird_ones:
    clean(fcs_files[weirdo], fit_method='lin', roll=2000)



In [ ]: