In [1]:
import os
import numpy as np
import scipy.stats as stats
from statsmodels.robust.scale import mad
from statsmodels.graphics.gofplots import qqplot
import pandas as pd
from matplotlib import pyplot
import flowio
%matplotlib inline
In [2]:
time_channel = "Time"
data_channel = "SSC-A"
data_dir = "/home/swhite/Projects/flowClean_testing/data"
diff_roll = 0.01
final_roll = 0.02
k = 10.0
k2 = 3.0
figure_out_dir = "flow_rate_qc_cubic_square"
fig_size = (16, 4)
In [3]:
def find_channel_index(channel_dict, pnn_text):
    # Return the zero-based event column index whose PnN label matches pnn_text.
    for k, v in channel_dict.items():
        if v['PnN'] == pnn_text:
            index = int(k) - 1
            return index
In [4]:
def calculate_flow_rate(events, time_index, roll):
    # Flow rate is the reciprocal of a rolling mean of the time deltas
    # between consecutive events.
    time_diff = np.diff(events[:, time_index])
    time_diff = np.insert(time_diff, 0, 0)
    time_diff_mean = pd.Series(time_diff).rolling(window=roll, min_periods=1).mean().values
    # Avoid division by zero where the rolling mean is still zero.
    min_diff = time_diff_mean[time_diff_mean > 0].min()
    time_diff_mean[time_diff_mean == 0] = min_diff
    flow_rate = 1 / time_diff_mean
    return flow_rate
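# Quick illustrative check of calculate_flow_rate on toy data (a single
# "Time" column with a constant gap of 2 time units between events): once
# the rolling window fills, the estimated rate should settle at 0.5 events
# per time unit.
toy_events = np.arange(0, 20, 2, dtype=float).reshape(-1, 1)
print(calculate_flow_rate(toy_events, 0, 3))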
In [5]:
def plot_flow_rate(
        file_name,
        flow_rate,
        event_idx,
        hline=None,
        trendline=None,
        x_lim=None,
        y_lim=None,
        save_figure=False
):
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Flow rate (events/ms)", fontsize=14)

    if x_lim is None:
        ax.set_xlim([0, max(event_idx)])
    else:
        ax.set_xlim([x_lim[0], x_lim[1]])
    if y_lim is not None:
        ax.set_ylim([y_lim[0], y_lim[1]])

    ax.plot(
        event_idx,
        flow_rate,
        c='darkslateblue'
    )

    if hline is not None:
        ax.axhline(hline, linestyle='-', linewidth=1, c='coral')
    if trendline is not None:
        ax.plot(event_idx, trendline, c='cornflowerblue')

    fig.tight_layout()

    if save_figure:
        fig_name = "".join([file_name, '_a_', 'flow_rate.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [6]:
def plot_deviation(
        file_name,
        flow_rate,
        event_indices,
        diff,
        stable_diff,
        smooth_stable_diff,
        threshold,
        save_figure=False
):
    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlim([0, len(flow_rate)])
    #pyplot.ylim([0, 5])
    ax.set_xlabel("Event", fontsize=14)
    ax.set_ylabel("Deviation (log)", fontsize=14)

    ax.plot(
        event_indices,
        np.log10(1 + diff),
        c='coral',
        alpha=0.6,
        linewidth=1
    )
    ax.plot(
        event_indices,
        np.log10(1 + stable_diff),
        c='cornflowerblue',
        alpha=0.6,
        linewidth=1
    )
    ax.plot(
        event_indices,
        np.log10(1 + smooth_stable_diff),
        c='darkslateblue',
        alpha=1.0,
        linewidth=1
    )
    ax.axhline(np.log10(1 + threshold), linestyle='dashed', linewidth=2, c='crimson')

    fig.tight_layout()

    if save_figure:
        fig_name = "".join([file_name, '_b_', 'deviation.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [7]:
def get_false_bounds(bool_array):
    # Return the start (inclusive) and end (exclusive) indices of each
    # contiguous run of False values in bool_array.
    diff = np.diff(np.hstack((0, ~bool_array, 0)))
    start = np.where(diff == 1)
    end = np.where(diff == -1)
    return start[0], end[0]
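# Small sanity check on a toy mask: the False runs start at indices 1 and 4,
# and the returned end indices are exclusive (3 and 5).
demo_mask = np.array([True, False, False, True, False])
print(get_false_bounds(demo_mask))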
In [8]:
def plot_channel_good_vs_bad(
        file_name,
        channel_data,
        time_data,
        channel_name,
        good_event_map,
        bi_ex=True,
        save_figure=False,
        drop_negative=False
):
    pre_scale = 0.003

    if bi_ex:
        channel_data = np.arcsinh(channel_data * pre_scale)

    # The flagged (bad) regions are the contiguous runs of False in good_event_map.
    starts, ends = get_false_bounds(good_event_map)

    good_cmap = pyplot.cm.get_cmap('jet')
    good_cmap.set_under('w', alpha=0)

    fig = pyplot.figure(figsize=fig_size)
    ax = fig.add_subplot(1, 1, 1)

    ax.set_title(file_name, fontsize=16)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylabel(channel_name, fontsize=14)

    bins_good = int(np.sqrt(channel_data.shape[0]))

    ax.hist2d(
        time_data,
        channel_data,
        bins=bins_good,
        cmap=good_cmap,
        vmin=0.9
    )

    # Shade each region of filtered (bad) events.
    for i, s in enumerate(starts):
        ax.axvspan(time_data[s], time_data[ends[i] - 1], facecolor='pink', alpha=0.3, edgecolor='deeppink')

    fig.tight_layout()

    if drop_negative:
        ax.set_ylim(bottom=0)

    if save_figure:
        fig_name = "".join([file_name, '_c_', 'filtered_events.png'])
        fig_path = "/".join([figure_out_dir, fig_name])
        pyplot.savefig(fig_path)

    pyplot.show()
In [9]:
def clean_stage1(fcs_file):
    # Stage 1: flag events whose flow rate deviates from the median flow rate
    # by more than a k * MAD (robust) threshold.
    fd = flowio.FlowData("/".join([data_dir, fcs_file]))
    events = np.reshape(fd.events, (-1, fd.channel_count))
    time_index = find_channel_index(fd.channels, time_channel)

    diff_roll_count = int(diff_roll * events.shape[0])
    flow_rate = calculate_flow_rate(events, time_index, diff_roll_count)

    median = np.median(flow_rate)
    median_diff = np.abs(flow_rate - median)
    threshold = k * mad(median_diff)
    #threshold = 1.0 * np.std(median_diff)
    initial_good_events = median_diff < threshold

    return flow_rate, median, initial_good_events, median_diff
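# Note on the threshold above: statsmodels' mad() rescales the raw median
# absolute deviation by ~1/0.6745, so for roughly normal data it is comparable
# to a standard deviation; k * mad(...) therefore acts as a robust "k sigma"
# cutoff. Rough check on synthetic data:
z = np.random.normal(0, 2.0, 100000)
print(mad(z), z.std())  # both should be close to 2.0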
In [10]:
def clean_stage2(flow_rate, event_indices, initial_good_events, fit_method='lin'):
    # Stage 2: fit a trend to the flow rate using only the events kept by
    # stage 1, then evaluate that fit at every event index.
    good_event_indices = event_indices[initial_good_events]

    if fit_method == 'lin':
        line_regress = stats.linregress(good_event_indices, flow_rate[initial_good_events])
        fit = (line_regress.slope * event_indices) + line_regress.intercept
    elif fit_method == 'quad':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 2)
        p = np.poly1d(poly)
        fit = p(event_indices)
    elif fit_method == 'cubic':
        poly = np.polyfit(good_event_indices, flow_rate[initial_good_events], 3)
        p = np.poly1d(poly)
        fit = p(event_indices)
    else:
        return None

    return fit
In [11]:
def clean_stage3(stable_diff):
    # Stage 3: smooth the deviation from the fitted trend with a centered
    # rolling mean, then keep events below a k2 * MAD threshold.
    final_threshold = k2 * mad(stable_diff)
    final_w = int(final_roll * stable_diff.shape[0])
    smoothed_diff = pd.Series(stable_diff).rolling(window=final_w, min_periods=1, center=True).mean().values
    final_good_events = smoothed_diff < final_threshold
    return final_good_events, smoothed_diff, final_threshold
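# Minor illustration of the centered rolling mean used above: with center=True
# the smoothing window is aligned on each event rather than trailing it, so an
# isolated spike is spread symmetrically.
print(pd.Series([0., 0., 10., 0., 0.]).rolling(window=3, min_periods=1, center=True).mean().values)
# -> [0.         3.33333333 3.33333333 3.33333333 0.        ]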
In [12]:
def clean(fcs_file, fit_method='lin'):
    # Stage 1: median / MAD screen on the flow rate
    flow_rate, median, good_events_stage1, median_diff = clean_stage1(fcs_file)
    event_indices = np.arange(0, len(flow_rate))

    # Stage 2: trend fit on the events kept by stage 1
    flow_rate_fit = clean_stage2(flow_rate, event_indices, good_events_stage1, fit_method=fit_method)

    plot_flow_rate(
        fcs_file,
        flow_rate,
        event_indices,
        hline=median,
        trendline=flow_rate_fit
    )

    # Stage 3: smoothed deviation from the trend, thresholded at k2 * MAD
    stable_diff = np.abs(flow_rate - flow_rate_fit)
    final_good_events, smoothed_diff, final_threshold = clean_stage3(stable_diff)

    plot_deviation(
        fcs_file,
        flow_rate,
        event_indices,
        median_diff,
        stable_diff,
        smoothed_diff,
        final_threshold
    )

    # Re-read the file to plot the data channel vs time, highlighting the
    # regions of filtered events.
    fd2 = flowio.FlowData("/".join([data_dir, fcs_file]))
    events2 = np.reshape(fd2.events, (-1, fd2.channel_count))
    time_index = find_channel_index(fd2.channels, time_channel)
    data_channel_index = find_channel_index(fd2.channels, data_channel)

    plot_channel_good_vs_bad(
        fd2.name,
        events2[:, data_channel_index],
        events2[:, time_index],
        data_channel,
        final_good_events,
        drop_negative=True
    )
In [13]:
fcs_files = os.listdir(data_dir)
fcs_files = sorted(fcs_files)
In [14]:
weird_ones = [7, 10, 15, 24, 26, 27]

for weirdo in weird_ones:
    clean(fcs_files[weirdo], fit_method='lin')
    clean(fcs_files[weirdo], fit_method='quad')
    clean(fcs_files[weirdo], fit_method='cubic')
    #np.savetxt(".".join([fcs_files[weirdo], 'csv']), flow_rate, delimiter=",")
In [ ]: