Alignment report


In [1]:
# From SO: https://stackoverflow.com/a/28073228/2512851

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to show/hide raw code."></form>''')


Out[1]:

In [2]:
import json
import matplotlib.pyplot as plt
import bokeh
import pandas as pd
import numpy as np

from bokeh.plotting import figure, show
from bokeh.io import output_notebook, gridplot
from bokeh.models import ColumnDataSource, HoverTool

output_notebook()

%matplotlib inline


Loading BokehJS ...

In [3]:
d = json.load(open('inputs.json'))
fname = d['csvA']
df = pd.DataFrame.from_csv(fname, index_col=None, sep=',')

Read distribution by MQ


In [4]:
def aa_mq(df):    
    correct_0 = df[df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
    correct_0.columns = ['correct_0']
    
    correct_50 = df[(df['derr']=='0 < d <= 50') | (df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
    correct_50.columns = ['correct_50']
    
    total = df[['MQ', 'count']].groupby('MQ').sum()    
    total.columns = ['total']
    
    data = pd.concat((correct_0, correct_50, total), axis=1)

    data['perr_0'] = 1 - data['correct_0'] / data['total']
    data['perr_50'] = 1 - data['correct_50'] / data['total']
    data['perr_ideal'] = 10 ** (-data.index / 10)
    
    return data

def plot_aa_mq(data):
    max_y = 10 ** np.ceil(np.log10(data['perr_0'].max()))
    min_y = 10 ** np.floor(np.log10(data['perr_50'].min()))

    source = ColumnDataSource(data)

    hover = HoverTool(tooltips=[
        ('perr 0', '@perr_0'),
        ('perr 50', '@perr_50'),
        ('perr ideal', '@perr_ideal'),        
        ('Reads', '@total')        
    ])
        
    p = figure(plot_height=200, plot_width=500, 
               x_axis_label='MQ',
               y_axis_label='p_err',
               tools=[hover, 'reset'], 
               y_axis_type="log", y_range=[min_y, max_y])

    h1 = p.circle(x='MQ', y='perr_0', size=2, source=source)
    h1 = p.circle(x='MQ', y='perr_0', size=10, alpha=0.5, color='yellow', source=source)    
    h3 = p.line(x='MQ', y='perr_ideal', source=source)

    return p

def plot_read_hist(data):
    max_y = 10 ** np.ceil(np.log10(data['total'].max()))
    min_y = 10 ** np.floor(np.log10(data['total'].min()))

    source = ColumnDataSource(data)

    hover = HoverTool(tooltips=[
        ('Reads', '@total')        
    ])
        
    p = figure(plot_height=200, plot_width=500, 
               x_axis_label='MQ',
               y_axis_label='Reads',
               tools=[hover, 'reset'], 
               y_axis_type="log", y_range=[min_y, max_y])

    h1 = p.vbar(x='MQ', bottom=min_y, top='total', width=0.7, source=source)

    return p
    

data = aa_mq(df)
s = [
    [plot_aa_mq(data)],    
    [plot_read_hist(data)]    
]

p = gridplot(s)
show(p)



In [5]:
# read_count_mq = df.groupby('MQ').sum()
# max_y = 10 ** np.ceil(np.log10(read_count_mq['count'].max()))
# min_y = 10 ** np.floor(np.log10(read_count_mq['count'].min()))

# source = ColumnDataSource(read_count_mq)


# tools = ["reset"]
# p = figure(plot_height=200, plot_width=500, 
#            x_axis_label='MQ',
#            y_axis_label='Reads',
#            tools=tools, 
#            y_axis_type="log", y_range=[min_y, max_y])

# h1 = p.vbar(x='MQ', bottom=min_y, top='count', width=0.7, source=source)

# p.add_tools(HoverTool(renderers=[h1], tooltips=[("reads", "@count")]))

# show(p)

Read distribution by alignment fate


In [6]:
read_count_by_fate = df.groupby('derr').sum()
read_count_by_fate['y'] = read_count_by_fate.index

max_x = 10 ** np.ceil(np.log10(read_count_by_fate['count'].max()))
min_x = 10 ** np.floor(np.log10(read_count_by_fate['count'].min()))

source = ColumnDataSource(read_count_by_fate)


tools = ["reset"]
p = figure(plot_height=200, plot_width=500, 
           x_axis_label='Reads',
           y_axis_label='Read fate',
           tools=tools, 
           y_range=read_count_by_fate.index.tolist(),
           x_axis_type="log", 
           x_range=[min_x, max_x])

h1 = p.hbar(y='y', left=min_x, right='count', height=0.7, source=source)

p.add_tools(HoverTool(renderers=[h1], tooltips=[("reads", "@count")]))

show(p)


Mapped rate and Alignment accuracy parametrized by MQ


In [7]:
# Matplotlib version of the plots

def heng_li_plot(df, category, ax):
    sub_df = df[df['category']==category]

    #correct = sub_df[(sub_df['derr']=='0 < d <= 50') | (sub_df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
    correct = sub_df[sub_df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
    correct.columns = ['correct']
    mapped = sub_df[sub_df['derr']!='unmapped'][['MQ', 'count']].groupby('MQ').sum()
    mapped.columns = ['mapped']
    total = sub_df[['MQ', 'count']].groupby('MQ').sum()
    total.columns = ['total']

    data = pd.concat((correct, mapped, total), axis=1)

    x = np.zeros(61, dtype=float)
    y = np.zeros(61, dtype=float)
    for mq in range(61):
        data_sub = data.iloc[mq:70]
        x[mq] = 100 * data_sub['mapped'].sum() / total.sum()
        y[mq] = 100 * data_sub['correct'].sum() / data_sub['mapped'].sum()
    ax.plot(x, y)
    ax.plot(x[0], y[0], 'ko')
    plt.setp(ax, 
             xlim=(95, 101), xticks=range(96,101), 
             ylim=(79, 101), yticks=range(80,101, 5),
             title=category)
    
    
def plot_heng_li_panels(df):
    fig = plt.figure(figsize=(10, 20))
    for n, cat in enumerate(['Ref', 'SNP', 'Multi',
                'INS <= 10', 'INS 11-50', 'INS > 50',
                'DEL <= 10', 'DEL 11-50', 'DEL > 50']):
        ax = plt.subplot(4, 3, n + 1)
        heng_li_plot(df, cat, ax)
        #plt.setp(ax, yscale='log')
        if n != 6:
            plt.setp(ax, xticklabels=[], yticklabels=[])
        else:
            plt.setp(ax, xlabel='% Mapped', ylabel='% Correct')

#plot_heng_li_panels(df)

In [8]:
def heng_li_plot(df, category):
    sub_df = df[df['category']==category]

    #correct = sub_df[(sub_df['derr']=='0 < d <= 50') | (sub_df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
    correct = sub_df[sub_df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
    correct.columns = ['correct']
    mapped = sub_df[sub_df['derr']!='unmapped'][['MQ', 'count']].groupby('MQ').sum()
    mapped.columns = ['mapped']
    total = sub_df[['MQ', 'count']].groupby('MQ').sum()
    total.columns = ['total']

    data = pd.concat((correct, mapped, total), axis=1)

    x = np.zeros(61, dtype=float)
    y = np.zeros(61, dtype=float)
    for mq in range(61):
        data_sub = data.iloc[mq:70]
        x[mq] = 100 * data_sub['mapped'].sum() / total.sum()
        y[mq] = 100 * data_sub['correct'].sum() / data_sub['mapped'].sum()
    
    source = ColumnDataSource(data=dict(
        mapped=x,
        correct=y,
        mq=range(61)
    ))
    
    hover = HoverTool(tooltips=[
        ('MQ', '≥ @mq'),
        ('Map', '@mapped %'),
        ('Correct', '@correct %')
    ])
    
    s1 = figure(width=250, plot_height=250, 
                tools=[hover, 'pan', 'reset'],
                title=category)
    
    s1.circle(x[0], y[0], size=10)
    s1.line('mapped', 'correct', source=source)
    
    return s1
    
def plot_heng_li_panels(df):
    s = []
    row_cnt = 3
    row_s = []
    for n, cat in enumerate(['Ref', 'SNP', 'Multi',
                'INS <= 10', 'INS 11-50', 'INS > 50',
                'DEL <= 10', 'DEL 11-50', 'DEL > 50']):
        if n % row_cnt == 0:
            if len(row_s):
                s.append(row_s)
            row_s = []
        row_s.append(heng_li_plot(df, cat))

    if len(row_s):
        s.append(row_s)

    p = gridplot(s)
    show(p)
    
plot_heng_li_panels(df)


NOTES

AAF_CSV -> PLOT

Takes in a summary CSV file produced from an Alignment Analysis Format (AAF) file and creates alignment analysis plots.

Expects a file called inputs.json located in the working directory of the form

{
    "csvA": <path to my csv file>
}