In [13]:
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="show/hide"></form>''')
Out[13]:
In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Figure, Layout, Histogram
import os, glob
from os import listdir
from os.path import isfile, join
import yaml
init_notebook_mode(connected=True)
In [2]:
def prepend_file(filename, line):
with open(filename, 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(line.rstrip('\r\n') + '\n' + content)
In [3]:
config_file = "pipeline.config"
target_bed = ""
with open(config_file, 'r') as stream:
try:
target_bed = yaml.load(stream)['target_bed']
except yaml.YAMLError as exc:
print("Error with config file: " + exc)
In [4]:
bam_files = []
sample_names = []
current_dir = os.getcwd()
mypath = current_dir + "/" + "alignments/"
# for filename in glob.glob("bams/*.primary.primerclipped.bam"):
# files.append(filename)
# names.append(os.path.basename(filename))
for root, dirs, files in os.walk(mypath):
for cfile in files:
if cfile.endswith(".primary.primerclipped.bam"):
current_file = mypath + str(cfile)
bam_files.append(os.path.join(root, cfile))
temp_bam_file = os.path.basename(os.path.join(root, cfile))
sample_names.append(os.path.basename(cfile))
# file = None
output = "coverage/multicov.txt"
# # command = "bedtools multicov -bams " + ' '.join(files) + " " + "-bed CRC_10g_23May16.final.rover.bed > " + output
command = "bedtools multicov -bams " + ' '.join(bam_files) + " " + "-bed " + target_bed + " > " + output
# print command
!$command
sample_names.insert(0, "chr")
sample_names.insert(1, "start")
sample_names.insert(2, "end")
sample_names.insert(3, "block")
sample_names.insert(4, "name2")
# with file(output, 'r') as original: data = original.read()
new_header = str('\t'.join(sample_names)) + "\n"
# with file(output, 'w') as modified: modified.write(new_header + data)
prepend_file(output, new_header)
In [5]:
# GET GENE LIST
gene_list = sorted(set([x.split('\t')[3].split('_')[0] for x in open(target_bed).readlines()]))
In [6]:
coverage_file = output
# gene_list = ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]
# gene_list = [ 'FAN1', 'FAT1', 'GMPS', 'NTHL1', 'OGG1', 'POLD1', 'POLE', 'POLQ', 'RNF43', 'TRIP4' ]
# gene_list = [ 'APC', 'APCProm1a', 'APCProm1b;APC', 'AXIN2', 'BMPR1A', 'BRCA1', 'BRCA2', 'GREM1', 'HOXB13G84E', 'MLH1', 'MSH2', 'MSH3', 'MSH6', 'MUTYH', 'NTHL1', 'PMS2', 'POLD1', 'POLE', 'POLE2', 'PTEN', 'RNF43', 'RNF43WRE1', 'RNF43WRE2', 'RPS20', 'SMAD4', 'STK11', 'TP53' ]
df_cov = pd.read_csv(coverage_file, sep='\t', header=0, index_col="block")
df_cov = df_cov.drop('chr', 1)
df_cov = df_cov.drop('start', 1)
df_cov = df_cov.drop('end', 1)
df_cov = df_cov.drop('name2', 1)
df_cov = df_cov.fillna(-1)
#samples = names[5:]
samples = list(df_cov)
In [7]:
# for gene in ['POLD1', 'POLE', 'RNF43']:
# for gene in ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]:
df_temp = df_cov.copy()
for gene in gene_list:
df_temp = df_cov[[gene in s for s in df_cov.index]]
#df_temp = np.log(df_temp)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']
for index, row in df_temp.iterrows():
for sample in samples:
if(row[sample]<=30):
df_temp.set_value(index, sample, 0)
elif(row[sample]>30 and row[sample]<=100):
df_temp.set_value(index, sample, 50)
else:
df_temp.set_value(index, sample, 100)
#for column in df_temp.columns:
#categories = pd.cut(df_temp[column], bins, labels=group_names)
#df_temp[column] = pd.cut(df_temp[column], bins, labels=group_names)
trace0 = go.Heatmap(
z = df_temp.values.tolist(),
y = df_temp.index.values,
x = list(df_temp),
opacity = 0.7,
colorscale = [
# Let first 10% (0.1) of the values have color rgb(0, 0, 0)
[0.0, 'rgb(77,0,75)'],
[0.1, 'rgb(77,0,75)'],
# Let values between 10-20% of the min and max of z
# have color rgb(20, 20, 20)
[0.10, 'rgb(129,15,124)'],
[0.20, 'rgb(129,15,124)'],
# Values between 20-30% of the min and max of z
[0.20, 'rgb(136,65,157)'],
[0.30, 'rgb(136,65,157)'],
[0.30, 'rgb(140,107,177)'],
[0.40, 'rgb(140,107,177)'],
[0.40, 'rgb(140,150,198)'],
[0.50, 'rgb(140,150,198)'],
[0.50, 'rgb(158,188,218)'],
[0.60, 'rgb(158,188,218)'],
[0.60, 'rgb(191,211,230)'],
[0.70, 'rgb(191,211,230)'],
[0.70, 'rgb(224,236,244)'],
[0.80, 'rgb(224,236,244)'],
[0.80, 'rgb(247,252,253)'],
[1.0, 'rgb(247,252,253)']
],
showscale = False
)
data = [trace0]
title_text = gene + " - Per Block Coverage (source: BAM file)"
layout = go.Layout(
title = title_text,
#autosize=True,
width=800,
height=600,
xaxis=dict(
autorange=True,
title='Samples',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
),
yaxis=dict(
#type='log',
title='Coverage',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
)
)
fig = {
'data': data,
'layout': layout,
}
iplot(fig)
In [17]:
df = pd.read_csv(coverage_file, sep='\t', header=0, index_col="block")
df = df.drop('chr', 1)
df = df.drop('start', 1)
df = df.drop('end', 1)
df = df.drop('name2', 1)
df = df.fillna(-1)
# minimum coverage
min_cov = 30
# df2 = df.head(15)
df2 = df
# sLength = len(df2['0151030001-B069P34_S1.sort.bam'])
############
# ADD COLUMN "l_10" = PERCENT OF SAMPLES WITH LOW COVERAGE
df2 = df2.assign(l_10=df2.apply(lambda x: (1.0*(x < min_cov).sum())/len(x)*100.0, axis=1))
df2.sort_values(['l_10'], ascending=[0], inplace=True)
############
# df2
trace0 = go.Bar(
x = df2.index.values[:-1],
y = df2.l_10,
)
############
# ADD ROW "l_10" = PERCENT OF SAMPLES WITH LOW COVERAGE
df2.drop('l_10', axis=1, inplace=True)
# df2.sum(axis=0)
# df2.sort_values(['l_10'], ascending=[0], inplace=True)
df2.loc['total'] = pd.Series(df2.apply(lambda x: 100*((x < min_cov).sum())/(len(x)), axis=0))
############
df3 = pd.DataFrame(df2.loc['total'])
df3.sort_values('total', ascending=[0], inplace=True)
trace1 = go.Bar(
x = df3.index,
y = df3.total,
)
data = [trace0]
title_text_1 = "Per amplicon coverage"
title_text_2 = "Per sample coverage"
layout1 = go.Layout(
title = title_text_1,
#autosize=True,
width=800,
height=600,
xaxis=dict(
autorange=True,
title='Amplicons',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
),
yaxis=dict(
#type='log',
title='Percentage of samples with low coverage (<30)',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
)
)
fig = {
'data': data,
'layout': layout1,
}
data = [trace1]
layout2 = go.Layout(
title = title_text_2,
#autosize=True,
width=800,
height=600,
xaxis=dict(
autorange=True,
title='Amplicons',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
),
yaxis=dict(
#type='log',
title='Percentage of amplicons with low coverage (<30)',
titlefont=dict(
family='Courier New, monospace',
size=12,
color='#7f7f7f'
)
)
)
fig2 = {
'data': data,
'layout': layout2,
}
iplot(fig)
iplot(fig2)
In [9]:
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
# group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']
# for index, row in df_cov.iterrows():
# for sample in samples:
# if(row[sample]<=30):
# df_cov.set_value(index, sample, 0)
# elif(row[sample]>30 and row[sample]<50):
# df_cov.set_value(index, sample, 50)
# else:
# df_cov.set_value(index, sample, 100)
# # for column in df_cov.columns:
# # categories = pd.cut(df_cov[column], bins, labels=group_names)
# # df_cov[column] = pd.cut(df_cov[column], bins, labels=group_names)
# trace0 = go.Heatmap(
# z = df_cov.values.tolist(),
# y = df_cov.index.values,
# x = list(df_cov),
# opacity = 0.7,
# colorscale = [
# [0.0, 'rgb(0,0,0)'],
# [0.5, 'rgb(0.5,0.094,0.48)'],
# [1.0, 'rgb(1.0,1.0,1.0)']
# ],
# showscale = False
# )
# data = [trace0]
# title_text = "Per Block Coverage (source: BAM file)"
# layout = go.Layout(
# title = title_text,
# #autosize=True,
# width=800,
# height=600,
# xaxis=dict(
# autorange=True,
# title='Samples',
# titlefont=dict(
# family='Courier New, monospace',
# size=12,
# color='#7f7f7f'
# )
# ),
# yaxis=dict(
# #type='log',
# title='Coverage',
# titlefont=dict(
# family='Courier New, monospace',
# size=12,
# color='#7f7f7f'
# )
# )
# )
# fig = {
# 'data': data,
# 'layout': layout,
# }
# iplot(fig)
In [16]:
# # for gene in ['POLD1', 'POLE', 'RNF43']:
# # for gene in ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]:
# df_temp = df_cov.copy()
# for gene in gene_list:
# df_temp = df_cov[[gene in s for s in df_cov.index]]
# #df_temp = np.log(df_temp)
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
# group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']
# for index, row in df_temp.iterrows():
# for sample in samples:
# if(row[sample]<=30):
# df_temp.set_value(index, sample, 0)
# elif(row[sample]>30 and row[sample]<=100):
# df_temp.set_value(index, sample, 50)
# else:
# df_temp.set_value(index, sample, 100)
# #for column in df_temp.columns:
# #categories = pd.cut(df_temp[column], bins, labels=group_names)
# #df_temp[column] = pd.cut(df_temp[column], bins, labels=group_names)
# trace0 = go.Table(
# header=dict(values=df_temp.columns,
# fill = dict(color='#C2D4FF'),
# align = ['left'] * 5),
# cells=dict(values=[df_temp.index.values],
# fill = dict(color='#F5F8FF'),
# align = ['left'] * 5)
# )
# data = [trace0]
# title_text = gene + " - Per Block Coverage (source: BAM file)"
# # layout = go.Layout(
# # title = title_text,
# # #autosize=True,
# # width=800,
# # height=600,
# # xaxis=dict(
# # autorange=True,
# # title='Samples',
# # titlefont=dict(
# # family='Courier New, monospace',
# # size=12,
# # color='#7f7f7f'
# # )
# # ),
# # yaxis=dict(
# # #type='log',
# # title='Coverage',
# # titlefont=dict(
# # family='Courier New, monospace',
# # size=12,
# # color='#7f7f7f'
# # )
# # )
# # )
# fig = {
# 'data': data
# # 'layout': layout,
# }
# iplot(fig)
In [ ]: