notebook.community

Edit and run



In [13]:

    
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="show/hide"></form>''')









    Out[13]:



In [1]:

    
import pandas as pd
import numpy as np

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Figure, Layout, Histogram

import os, glob
from os import listdir
from os.path import isfile, join

import yaml

init_notebook_mode(connected=True)



In [2]:

    
def prepend_file(filename, line):
    with open(filename, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(line.rstrip('\r\n') + '\n' + content)



In [3]:

    
config_file = "pipeline.config"
target_bed = ""
with open(config_file, 'r') as stream:
    try:
        target_bed = yaml.load(stream)['target_bed']
    except yaml.YAMLError as exc:
        print("Error with config file: " + exc)



In [4]:

    
bam_files = []
sample_names = []

current_dir = os.getcwd()
mypath = current_dir + "/" + "alignments/"

# for filename in glob.glob("bams/*.primary.primerclipped.bam"):
#     files.append(filename)    
#     names.append(os.path.basename(filename))

for root, dirs, files in os.walk(mypath):
    for cfile in files:
        if cfile.endswith(".primary.primerclipped.bam"):
            current_file = mypath + str(cfile)
            bam_files.append(os.path.join(root, cfile))
            temp_bam_file = os.path.basename(os.path.join(root, cfile))
            sample_names.append(os.path.basename(cfile))
# file = None            

output = "coverage/multicov.txt"
# # command = "bedtools multicov -bams " + ' '.join(files) + " " + "-bed CRC_10g_23May16.final.rover.bed > " + output
command = "bedtools multicov -bams " + ' '.join(bam_files) + " " + "-bed " + target_bed + " > " + output
# print command
!$command
sample_names.insert(0, "chr")
sample_names.insert(1, "start")
sample_names.insert(2, "end")
sample_names.insert(3, "block")
sample_names.insert(4, "name2")
# with file(output, 'r') as original: data = original.read()
new_header = str('\t'.join(sample_names)) + "\n"
# with file(output, 'w') as modified: modified.write(new_header + data)
prepend_file(output, new_header)



In [5]:

    
# GET GENE LIST
gene_list = sorted(set([x.split('\t')[3].split('_')[0] for x in open(target_bed).readlines()]))



In [6]:

    
coverage_file = output
# gene_list = ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]
# gene_list = [ 'FAN1', 'FAT1', 'GMPS', 'NTHL1', 'OGG1', 'POLD1', 'POLE', 'POLQ', 'RNF43', 'TRIP4' ]
# gene_list = [ 'APC', 'APCProm1a', 'APCProm1b;APC', 'AXIN2', 'BMPR1A', 'BRCA1', 'BRCA2', 'GREM1', 'HOXB13G84E', 'MLH1', 'MSH2', 'MSH3', 'MSH6', 'MUTYH', 'NTHL1', 'PMS2', 'POLD1', 'POLE', 'POLE2', 'PTEN', 'RNF43', 'RNF43WRE1', 'RNF43WRE2', 'RPS20', 'SMAD4', 'STK11', 'TP53' ]
df_cov = pd.read_csv(coverage_file, sep='\t', header=0, index_col="block")
df_cov = df_cov.drop('chr', 1)
df_cov = df_cov.drop('start', 1)
df_cov = df_cov.drop('end', 1)
df_cov = df_cov.drop('name2', 1)
df_cov = df_cov.fillna(-1)
#samples = names[5:]
samples = list(df_cov)



In [7]:

    
# for gene in ['POLD1', 'POLE', 'RNF43']:        
# for gene in ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]:
df_temp = df_cov.copy()
for gene in gene_list:
    df_temp = df_cov[[gene in s for s in df_cov.index]]
    #df_temp = np.log(df_temp)
    
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
    group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']    
    
    for index, row in df_temp.iterrows():
        for sample in samples:
            if(row[sample]<=30):
                df_temp.set_value(index, sample, 0)
            elif(row[sample]>30 and row[sample]<=100):
                df_temp.set_value(index, sample, 50)
            else:
                df_temp.set_value(index, sample, 100)
    
    #for column in df_temp.columns:
        #categories = pd.cut(df_temp[column], bins, labels=group_names)
        #df_temp[column] = pd.cut(df_temp[column], bins, labels=group_names)
    
    trace0 = go.Heatmap(
        z = df_temp.values.tolist(),
        y = df_temp.index.values,
        x = list(df_temp),
        opacity = 0.7,        
        colorscale = [
            # Let first 10% (0.1) of the values have color rgb(0, 0, 0)
            [0.0, 'rgb(77,0,75)'],
            [0.1, 'rgb(77,0,75)'],
            
            # Let values between 10-20% of the min and max of z
            # have color rgb(20, 20, 20)
            [0.10, 'rgb(129,15,124)'],
            [0.20, 'rgb(129,15,124)'],
            
            # Values between 20-30% of the min and max of z
            [0.20, 'rgb(136,65,157)'],
            [0.30, 'rgb(136,65,157)'],
            
            [0.30, 'rgb(140,107,177)'],
            [0.40, 'rgb(140,107,177)'],
            
            [0.40, 'rgb(140,150,198)'],
            [0.50, 'rgb(140,150,198)'],
            
            [0.50, 'rgb(158,188,218)'],
            [0.60, 'rgb(158,188,218)'],
            
            [0.60, 'rgb(191,211,230)'],
            [0.70, 'rgb(191,211,230)'],
            
            [0.70, 'rgb(224,236,244)'],
            [0.80, 'rgb(224,236,244)'],
            
            [0.80, 'rgb(247,252,253)'],
            [1.0, 'rgb(247,252,253)']
        ],
        showscale = False
    )

    data = [trace0]

    title_text = gene + " - Per Block Coverage (source: BAM file)"

    layout = go.Layout(
        title = title_text,
        #autosize=True,
        width=800,
        height=600,    
        xaxis=dict(
            autorange=True,
            title='Samples',
            titlefont=dict(
                family='Courier New, monospace',
                size=12,
                color='#7f7f7f'
            )
        ),
        yaxis=dict(
            #type='log',
            title='Coverage',
            titlefont=dict(
                family='Courier New, monospace',
                size=12,
                color='#7f7f7f'
            )
        )
    )
    
    fig = {
        'data': data,
        'layout': layout,        
    }

    iplot(fig)



In [17]:

    
df = pd.read_csv(coverage_file, sep='\t', header=0, index_col="block")
df = df.drop('chr', 1)
df = df.drop('start', 1)
df = df.drop('end', 1)
df = df.drop('name2', 1)
df = df.fillna(-1)

# minimum coverage
min_cov = 30

# df2 = df.head(15)
df2 = df
# sLength = len(df2['0151030001-B069P34_S1.sort.bam'])

############
# ADD COLUMN "l_10" = PERCENT OF SAMPLES WITH LOW COVERAGE
df2 = df2.assign(l_10=df2.apply(lambda x: (1.0*(x < min_cov).sum())/len(x)*100.0, axis=1))
df2.sort_values(['l_10'], ascending=[0], inplace=True)
############
# df2

trace0 = go.Bar(
    x = df2.index.values[:-1],
    y = df2.l_10,
)

############
# ADD ROW "l_10" = PERCENT OF SAMPLES WITH LOW COVERAGE
df2.drop('l_10', axis=1, inplace=True)
# df2.sum(axis=0)
# df2.sort_values(['l_10'], ascending=[0], inplace=True)
df2.loc['total'] = pd.Series(df2.apply(lambda x: 100*((x < min_cov).sum())/(len(x)), axis=0))
############
df3 = pd.DataFrame(df2.loc['total'])
df3.sort_values('total', ascending=[0], inplace=True)

trace1 = go.Bar(
    x = df3.index,
    y = df3.total,
)

data = [trace0]

title_text_1 = "Per amplicon coverage"
title_text_2 = "Per sample coverage"

layout1 = go.Layout(
    title = title_text_1,
    #autosize=True,
    width=800,
    height=600,    
    xaxis=dict(
        autorange=True,
        title='Amplicons',
        titlefont=dict(
            family='Courier New, monospace',
            size=12,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        #type='log',
        title='Percentage of samples with low coverage (<30)',
        titlefont=dict(
            family='Courier New, monospace',
            size=12,
            color='#7f7f7f'
        )
    )
)

fig = {
    'data': data,
    'layout': layout1,        
}

data = [trace1]

layout2 = go.Layout(
    title = title_text_2,
    #autosize=True,
    width=800,
    height=600,    
    xaxis=dict(
        autorange=True,
        title='Amplicons',
        titlefont=dict(
            family='Courier New, monospace',
            size=12,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        #type='log',
        title='Percentage of amplicons with low coverage (<30)',
        titlefont=dict(
            family='Courier New, monospace',
            size=12,
            color='#7f7f7f'
        )
    )
)


fig2 = {
    'data': data,
    'layout': layout2,        
}

iplot(fig)
iplot(fig2)



In [9]:

    
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
# group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']


# for index, row in df_cov.iterrows():
#     for sample in samples:
#         if(row[sample]<=30):
#             df_cov.set_value(index, sample, 0)
#         elif(row[sample]>30 and row[sample]<50):
#             df_cov.set_value(index, sample, 50)
#         else:
#             df_cov.set_value(index, sample, 100)

# # for column in df_cov.columns:
# #     categories = pd.cut(df_cov[column], bins, labels=group_names)
# #     df_cov[column] = pd.cut(df_cov[column], bins, labels=group_names)

# trace0 = go.Heatmap(
#     z = df_cov.values.tolist(),
#     y = df_cov.index.values,
#     x = list(df_cov),
#     opacity = 0.7,
#     colorscale = [
#         [0.0, 'rgb(0,0,0)'],
#         [0.5, 'rgb(0.5,0.094,0.48)'], 
#         [1.0, 'rgb(1.0,1.0,1.0)']
#     ],
#     showscale = False
# )
# data = [trace0]

# title_text = "Per Block Coverage (source: BAM file)"

# layout = go.Layout(
#     title = title_text,
#     #autosize=True,
#     width=800,
#     height=600,    
#     xaxis=dict(
#         autorange=True,
#         title='Samples',
#         titlefont=dict(
#             family='Courier New, monospace',
#             size=12,
#             color='#7f7f7f'
#         )
#     ),
#     yaxis=dict(
#         #type='log',
#         title='Coverage',
#         titlefont=dict(
#             family='Courier New, monospace',
#             size=12,
#             color='#7f7f7f'
#         )
#     )
# )

# fig = {
#     'data': data,
#     'layout': layout,        
# }

# iplot(fig)



In [16]:

    
# # for gene in ['POLD1', 'POLE', 'RNF43']:        
# # for gene in ['POLD1', 'POLE', 'RNF43', 'FAN1', 'FAT1' ]:
# df_temp = df_cov.copy()
# for gene in gene_list:
#     df_temp = df_cov[[gene in s for s in df_cov.index]]
#     #df_temp = np.log(df_temp)
    
#     bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100000]
#     group_names = ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']    
    
#     for index, row in df_temp.iterrows():
#         for sample in samples:
#             if(row[sample]<=30):
#                 df_temp.set_value(index, sample, 0)
#             elif(row[sample]>30 and row[sample]<=100):
#                 df_temp.set_value(index, sample, 50)
#             else:
#                 df_temp.set_value(index, sample, 100)
    
#     #for column in df_temp.columns:
#         #categories = pd.cut(df_temp[column], bins, labels=group_names)
#         #df_temp[column] = pd.cut(df_temp[column], bins, labels=group_names)
    
#     trace0 = go.Table(
#         header=dict(values=df_temp.columns,
#                     fill = dict(color='#C2D4FF'),
#                     align = ['left'] * 5),
#         cells=dict(values=[df_temp.index.values],
#                    fill = dict(color='#F5F8FF'),
#                    align = ['left'] * 5)
#     )

#     data = [trace0]

#     title_text = gene + " - Per Block Coverage (source: BAM file)"

# #     layout = go.Layout(
# #         title = title_text,
# #         #autosize=True,
# #         width=800,
# #         height=600,    
# #         xaxis=dict(
# #             autorange=True,
# #             title='Samples',
# #             titlefont=dict(
# #                 family='Courier New, monospace',
# #                 size=12,
# #                 color='#7f7f7f'
# #             )
# #         ),
# #         yaxis=dict(
# #             #type='log',
# #             title='Coverage',
# #             titlefont=dict(
# #                 family='Courier New, monospace',
# #                 size=12,
# #                 color='#7f7f7f'
# #             )
# #         )
# #     )
    
#     fig = {
#         'data': data
# #         'layout': layout,        
#     }

#     iplot(fig)



In [ ]: