In [1]:
# From SO: https://stackoverflow.com/a/28073228/2512851
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to show/hide raw code."></form>''')
Out[1]:
In [2]:
import json
import matplotlib.pyplot as plt
import bokeh
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, gridplot
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()
%matplotlib inline
In [3]:
d = json.load(open('inputs.json'))
fname = d['csvA']
df = pd.DataFrame.from_csv(fname, index_col=None, sep=',')
In [4]:
def aa_mq(df):
correct_0 = df[df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
correct_0.columns = ['correct_0']
correct_50 = df[(df['derr']=='0 < d <= 50') | (df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
correct_50.columns = ['correct_50']
total = df[['MQ', 'count']].groupby('MQ').sum()
total.columns = ['total']
data = pd.concat((correct_0, correct_50, total), axis=1)
data['perr_0'] = 1 - data['correct_0'] / data['total']
data['perr_50'] = 1 - data['correct_50'] / data['total']
data['perr_ideal'] = 10 ** (-data.index / 10)
return data
def plot_aa_mq(data):
max_y = 10 ** np.ceil(np.log10(data['perr_0'].max()))
min_y = 10 ** np.floor(np.log10(data['perr_50'].min()))
source = ColumnDataSource(data)
hover = HoverTool(tooltips=[
('perr 0', '@perr_0'),
('perr 50', '@perr_50'),
('perr ideal', '@perr_ideal'),
('Reads', '@total')
])
p = figure(plot_height=200, plot_width=500,
x_axis_label='MQ',
y_axis_label='p_err',
tools=[hover, 'reset'],
y_axis_type="log", y_range=[min_y, max_y])
h1 = p.circle(x='MQ', y='perr_0', size=2, source=source)
h1 = p.circle(x='MQ', y='perr_0', size=10, alpha=0.5, color='yellow', source=source)
h3 = p.line(x='MQ', y='perr_ideal', source=source)
return p
def plot_read_hist(data):
max_y = 10 ** np.ceil(np.log10(data['total'].max()))
min_y = 10 ** np.floor(np.log10(data['total'].min()))
source = ColumnDataSource(data)
hover = HoverTool(tooltips=[
('Reads', '@total')
])
p = figure(plot_height=200, plot_width=500,
x_axis_label='MQ',
y_axis_label='Reads',
tools=[hover, 'reset'],
y_axis_type="log", y_range=[min_y, max_y])
h1 = p.vbar(x='MQ', bottom=min_y, top='total', width=0.7, source=source)
return p
data = aa_mq(df)
s = [
[plot_aa_mq(data)],
[plot_read_hist(data)]
]
p = gridplot(s)
show(p)
In [5]:
# read_count_mq = df.groupby('MQ').sum()
# max_y = 10 ** np.ceil(np.log10(read_count_mq['count'].max()))
# min_y = 10 ** np.floor(np.log10(read_count_mq['count'].min()))
# source = ColumnDataSource(read_count_mq)
# tools = ["reset"]
# p = figure(plot_height=200, plot_width=500,
# x_axis_label='MQ',
# y_axis_label='Reads',
# tools=tools,
# y_axis_type="log", y_range=[min_y, max_y])
# h1 = p.vbar(x='MQ', bottom=min_y, top='count', width=0.7, source=source)
# p.add_tools(HoverTool(renderers=[h1], tooltips=[("reads", "@count")]))
# show(p)
In [6]:
read_count_by_fate = df.groupby('derr').sum()
read_count_by_fate['y'] = read_count_by_fate.index
max_x = 10 ** np.ceil(np.log10(read_count_by_fate['count'].max()))
min_x = 10 ** np.floor(np.log10(read_count_by_fate['count'].min()))
source = ColumnDataSource(read_count_by_fate)
tools = ["reset"]
p = figure(plot_height=200, plot_width=500,
x_axis_label='Reads',
y_axis_label='Read fate',
tools=tools,
y_range=read_count_by_fate.index.tolist(),
x_axis_type="log",
x_range=[min_x, max_x])
h1 = p.hbar(y='y', left=min_x, right='count', height=0.7, source=source)
p.add_tools(HoverTool(renderers=[h1], tooltips=[("reads", "@count")]))
show(p)
In [7]:
# Matplotlib version of the plots
def heng_li_plot(df, category, ax):
sub_df = df[df['category']==category]
#correct = sub_df[(sub_df['derr']=='0 < d <= 50') | (sub_df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
correct = sub_df[sub_df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
correct.columns = ['correct']
mapped = sub_df[sub_df['derr']!='unmapped'][['MQ', 'count']].groupby('MQ').sum()
mapped.columns = ['mapped']
total = sub_df[['MQ', 'count']].groupby('MQ').sum()
total.columns = ['total']
data = pd.concat((correct, mapped, total), axis=1)
x = np.zeros(61, dtype=float)
y = np.zeros(61, dtype=float)
for mq in range(61):
data_sub = data.iloc[mq:70]
x[mq] = 100 * data_sub['mapped'].sum() / total.sum()
y[mq] = 100 * data_sub['correct'].sum() / data_sub['mapped'].sum()
ax.plot(x, y)
ax.plot(x[0], y[0], 'ko')
plt.setp(ax,
xlim=(95, 101), xticks=range(96,101),
ylim=(79, 101), yticks=range(80,101, 5),
title=category)
def plot_heng_li_panels(df):
fig = plt.figure(figsize=(10, 20))
for n, cat in enumerate(['Ref', 'SNP', 'Multi',
'INS <= 10', 'INS 11-50', 'INS > 50',
'DEL <= 10', 'DEL 11-50', 'DEL > 50']):
ax = plt.subplot(4, 3, n + 1)
heng_li_plot(df, cat, ax)
#plt.setp(ax, yscale='log')
if n != 6:
plt.setp(ax, xticklabels=[], yticklabels=[])
else:
plt.setp(ax, xlabel='% Mapped', ylabel='% Correct')
#plot_heng_li_panels(df)
In [8]:
def heng_li_plot(df, category):
sub_df = df[df['category']==category]
#correct = sub_df[(sub_df['derr']=='0 < d <= 50') | (sub_df['derr']=='d = 0')][['MQ', 'count']].groupby('MQ').sum()
correct = sub_df[sub_df['derr']=='d = 0'][['MQ', 'count']].groupby('MQ').sum()
correct.columns = ['correct']
mapped = sub_df[sub_df['derr']!='unmapped'][['MQ', 'count']].groupby('MQ').sum()
mapped.columns = ['mapped']
total = sub_df[['MQ', 'count']].groupby('MQ').sum()
total.columns = ['total']
data = pd.concat((correct, mapped, total), axis=1)
x = np.zeros(61, dtype=float)
y = np.zeros(61, dtype=float)
for mq in range(61):
data_sub = data.iloc[mq:70]
x[mq] = 100 * data_sub['mapped'].sum() / total.sum()
y[mq] = 100 * data_sub['correct'].sum() / data_sub['mapped'].sum()
source = ColumnDataSource(data=dict(
mapped=x,
correct=y,
mq=range(61)
))
hover = HoverTool(tooltips=[
('MQ', '≥ @mq'),
('Map', '@mapped %'),
('Correct', '@correct %')
])
s1 = figure(width=250, plot_height=250,
tools=[hover, 'pan', 'reset'],
title=category)
s1.circle(x[0], y[0], size=10)
s1.line('mapped', 'correct', source=source)
return s1
def plot_heng_li_panels(df):
s = []
row_cnt = 3
row_s = []
for n, cat in enumerate(['Ref', 'SNP', 'Multi',
'INS <= 10', 'INS 11-50', 'INS > 50',
'DEL <= 10', 'DEL 11-50', 'DEL > 50']):
if n % row_cnt == 0:
if len(row_s):
s.append(row_s)
row_s = []
row_s.append(heng_li_plot(df, cat))
if len(row_s):
s.append(row_s)
p = gridplot(s)
show(p)
plot_heng_li_panels(df)