In [ ]:
from __future__ import division
%matplotlib inline
import os
import math
import numpy as np
import matplotlib.pyplot as plt
import pylab
from matplotlib import rc
import scipy as sc
import pandas as pd
import scipy.stats as stats
from clean_data import CANCER_TYPES

from IPython.html.widgets import interact
from IPython.html import widgets
from IPython.display import display

from mutation_analysis import get_pancancer_mutation_summary

Correlation Between Silent and Non-Silent Mutations

  • Count number of silent and non-silent mutations for every SAMPLE across all genes and compute correlation
  • Count number of silent and non-silent mutations for every GENE across samples and compute correlation

In [ ]:
can_types = []
for can in CANCER_TYPES:
    f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
    f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
    if os.path.exists(f1) and os.path.exists(f2):
        can_types.append(can)

print "There are %d cancer types ready to be analysed" % len(can_types)

@interact(cancer_type=widgets.DropdownWidget(values=can_types), Gene_or_Sample=widgets.DropdownWidget(values={'GENE':1, 'SAMPLE':0}))
def plot_corr(cancer_type, Gene_or_Sample):
    can = cancer_type
    f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
    nsdf = pd.read_table(f1, header=0, index_col=0)
    nsilent = nsdf.sum(axis=Gene_or_Sample)

    f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
    sdf = pd.read_table(f2, header=0, index_col=0)
    silent = sdf.sum(axis=Gene_or_Sample)


    
    print "# Total non-silent mutation: %d, # total silent mutation: %d" %(sum(nsilent), sum(silent))
    print "# Per SAMPLE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/sdf.shape[1], sum(silent)/sdf.shape[1])
    print "# Per GENE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/nsdf.shape[0], sum(silent)/nsdf.shape[0])
    print nsdf.shape, sdf.shape

    total = silent + nsilent
    
    if Gene_or_Sample:
        txt = 'GENE'
    else:
        txt = 'SAMPLE'
            
    pylab.rcParams['figure.figsize'] = (12.0, 8.0)
    plt.subplot(3, 1, 1)
    plt.plot(range(1, len(silent)+1), silent/sdf.count(axis=Gene_or_Sample), label='Silent Mutations')
    plt.plot(range(1, len(nsilent)+1), nsilent/nsdf.count(axis=Gene_or_Sample), label='Non-Silent Mutation')
    plt.ylabel("Mutation Frequency")
    plt.title(can + " silent vs non-silent mutations for every " + txt + " Spearman's corr.: " + str(silent.corr(nsilent, 'spearman')) + "\n")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#     plt.yscale('log')
    plt.grid()
    
    # High frequency mutations
    plt.subplot(3, 1, 2)
    nhigh_freq = nsilent.order(ascending=False).iloc[:20]*100/nsdf.shape[int(Gene_or_Sample)]
    nhigh_freq.plot(kind='bar', rot=30)
    plt.ylabel("Non-Silent Mutation Freq(%)")
    plt.xlabel(txt)
    plt.grid()

    # High frequency mutations silent + non-silent
    plt.subplot(3, 1, 3)
    thigh_freq = total.order(ascending=False).iloc[:20]*100/sdf.shape[int(Gene_or_Sample)]
    thigh_freq.plot(kind='bar', rot=30)
    plt.ylabel("Overall Mutation Freq(%)")
    plt.xlabel(txt)
    plt.grid()

    plt.tight_layout(pad=0.2, w_pad=0.5, h_pad=0.2)
    #print df.shape[int(Gene_or_Sample)]

Observations

  1. A very high correlation between the total number of silent and non-silent mutation is observed for every patient. This implies that if one considers each patient individually then it will be hard to separate drivers from passenger mutations. This appears reasonable as very often cancer messes up the DNA repair mechanisms.
  2. In contrast to the above, the gene-wise correlation between the total number of silent and non-silent mutations across patient population is weak. This suggests that different patients harobor different set of mutations. In other words, only very few genes show high frequency of mutation across patients which makes it tough to identify them.
  3. Genes with very high silent mutation freq should be neglected????
  4. Several genes have very high mutation frequency in LUAD and LUSC. Whats wrong?

TODO

  1. Does silent mutations have higher probability because of codon usage reduandancy? Read Identifying cancer driver genes in tumor genome sequencing studies, Richard simon, bioinformatics
  2. Box plot mutation per gene and mutation per sample across cancer types

In [ ]:
gene = 0; refresh = True    
ns_summary, s_summary = get_pancancer_mutation_summary(gene, refresh)
outdir = '../data/pancancer'
if gene:
    ns_outf = outdir + os.sep + 'non_silent_mutations_genes_all.txt'               
    s_outf = outdir + os.sep + 'silent_mutations_genes_all.txt' 
    outf='pancancer_genewise_mutation_boxplot.png'
else:
    ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'               
    s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt' 
    outf='pancancer_samplewise_mutation_boxplot.png'
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)
fig = plotly_boxplots(ns_summary, s_summary, outf)
#     plot_url = py.plot(fig, filename='box-grouped')
#     py.iplot(fig, filename = 'basic-line')

In [ ]:
import plotly
import plotly.plotly as py
from plotly.graph_objs import *

plotly.tools.set_credentials_file(username='extra.nitin', api_key='auykqr8hfa', stream_ids=['kvd6yka2q7', 'u6wv80rlbm'])

def plotly_boxplots(ns_summary, s_summary, outf):
    """ Plot Non-Silent and Silent Mutations box-plot across all
    cancer types
    """
    s_stacked = s_summary.stack()
    s_stacked.index = s_stacked.index.droplevel(0)
    s_stacked = pd.DataFrame(s_stacked).sort_index()
    silent_x = list(s_stacked.index)
    silent_y = list(s_stacked.values)

    ns_stacked = ns_summary.stack()
    ns_stacked.index = ns_stacked.index.droplevel(0)
    ns_stacked = pd.DataFrame(ns_stacked).sort_index()
    nsilent_x = list(ns_stacked.index)
    nsilent_y = list(ns_stacked.values)

    # Now plot the box plots
    trace1 = Box(
        y=silent_y,
        x=silent_x,
        name='Silent',
        marker=Marker(
            color='#3D9970'
        )
    )
    trace2 = Box(
        y=nsilent_y,
        x=nsilent_x,
        name='Non-Silent',
        marker=Marker(
            color='#FF4136'
        )
    )

    data = Data([trace1, trace2])
    layout = Layout(
        title='Mutation Landscape of the TCGA cancer cohorts',
        titlefont=Font(
                size=26,
            ),
        xaxis=XAxis(
            tickangle=-45,
            tickfont=Font(
                size=18,
            ),
        ),
        yaxis=YAxis(
            title='Number of Mutations',
            titlefont=Font(
                size=22,
            ),
            zeroline=False,
            type='log',
            autorange=True
        ),
        autosize=False,
        width=1800,
        height=1200,
        boxmode='group'
    )
    fig = Figure(data=data, layout=layout)

    pylab.rcParams['figure.figsize'] = (18.0, 12.0)
    py.image.ishow(fig)
    out_png = '../figures/' + outf
    py.image.save_as(fig, out_png)

    return fig

In [ ]:
gene = 0; refresh = True    
ns_summary, s_summary = get_pancancer_mutation_summary(gene, refresh)
outdir = '../data/pancancer'                                                   
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'               
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt' 
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)


pylab.rcParams['figure.figsize'] = (18.0, 12.0)
fig, axes = plt.subplots(nrows=2, ncols=1)
ns_summary.plot(ax=axes[0], kind='box', logy=True, rot=30); axes[0].set_ylabel('Number of Non-Silent Mutation in Samples')
s_summary.plot(ax=axes[1], kind='box', logy=True, rot=30); axes[1].set_ylabel('Number of Silent Mutation in Samples');

In [ ]:
import brewer2mpl

outdir = '../data/pancancer'                                                   
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'               
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt' 
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)

both_df = ns_summary.join(s_summary, lsuffix='_NS')

plt.close('all')
fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(18, 18), dpi=1200)
pylab.rcParams['figure.max_open_warning'] = 0
set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
for idx, c in enumerate(ns_summary.columns):
    tmp_df = both_df[[c+'_NS', c]].dropna()#.fillna(0)
    x = tmp_df[c].values
    y = tmp_df[c+'_NS'].values
    i = idx//5; j = idx%5
    axes[i, j].scatter(x, y, color=set2[idx%8])
    axes[i, j].set_xscale('log')
    axes[i, j].set_yscale('log')
    axes[i, j].set_title('%s %.2f, %.2f' %(c, stats.pearsonr(x, y)[0], stats.spearmanr(x,y)[0]))
    
    if i==2 and j == 0:
        axes[i, j].set_ylabel('Non-Silent Mutations', size=20)
    if i==4 and j == 2:
        axes[i, j].set_xlabel('Silent Mutations', size=20)

    axes[i, j].grid()
# fig.suptitle('Non-Silent v/s Silent Mutations: Pearson and Spearman Correlations\n', size=12)
plt.tight_layout()
plt.show()
fig.savefig('../figures/pancancer_mutation_individual_correlation.eps', format='eps')

In [ ]:
import brewer2mpl

outdir = '../data/pancancer'                                                   
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'               
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt' 
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)


both_df = ns_summary.join(s_summary, lsuffix='_NS')

# pylab.rcParams['figure.max_open_warning'] = 0
# pylab.rcParams['figure.figsize'] = (22.0, 12.0)

set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
markers = ['x', 'o', '*']
fig, ax = plt.subplots(1, figsize=(16, 14), dpi=1200)
for i, c in enumerate(ns_summary.columns):
    tmp_df = both_df[[c+'_NS', c]].dropna()#.fillna(0)
    x = tmp_df[c].values
    y = tmp_df[c+'_NS'].values
    ax.scatter(x, y, color=set2[i%8], marker=markers[i%3], label='%s, P: %.2f, S: %.2f' %(c, stats.pearsonr(x, y)[0], stats.spearmanr(x,y)[0]))
ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlabel('Silent Mutations', size=18)
ax.set_ylabel('Non-Silent Mutations', size=18)

# The overall correlataion
xa = ns_summary.fillna(0).sum(1)
ya = s_summary.fillna(0).sum(1)
common = xa.index.intersection(ya.index)
xa = xa[common].values
ya = ya[common].values
plt.title('Pancancer Non-Silent v/s Silent Mutations Correlation Pearson: %.2f, Spearman: %.2f'  %(stats.pearsonr(xa, ya)[0], stats.spearmanr(xa,ya)[0]), size=20)
# ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
#           ncol=5, fancybox=True, shadow=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.legend(loc='upper left')
plt.grid()

fig.savefig('../figures/pancancer_mutation_aggregated_correlation.eps', format='eps')

Check gene level enrichments

Apply Fisher's exact test. Null Hypothesis: Silent and Non silent mutations have same distribution


In [ ]:
for can in CANCER_TYPES:
    f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
    f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
    if os.path.exists(f1) and os.path.exists(f2):
        can_types.append(can)

print "There are %d cancer types ready to be analysed" % len(can_types)

#@interact(cancer_type=widgets.DropdownWidget(values=can_types), Gene_or_Sample=widgets.DropdownWidget(values={'GENE':1, 'SAMPLE':0}))
def plot_corr(cancer_type, Gene_or_Sample):
    can = cancer_type
    f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
    nsdf = pd.read_table(f1, header=0, index_col=0)
    nsilent = nsdf.sum(axis=Gene_or_Sample)*-1
    
    f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
    sdf = pd.read_table(f2, header=0, index_col=0)
    silent = sdf.sum(axis=Gene_or_Sample)

    nsamples = nsdf.shape[1]
    return (nsilent, silent, nsamples)

    
#     print "# Total non-silent mutation: %d, # total silent mutation: %d" %(sum(nsilent), sum(silent))
#     print "# Per SAMPLE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/sdf.shape[1], sum(silent)/sdf.shape[1])
#     print "# Per GENE non-silent mutation: %f, # total silent mutation: %f" %(sum(nsilent)/nsdf.shape[0], sum(silent)/nsdf.shape[0])
#     print nsdf.shape, sdf.shape

#     total = silent + nsilent
    
#     if Gene_or_Sample:
#         txt = 'GENE'
#     else:
#         txt = 'SAMPLE'
            
#     pylab.rcParams['figure.figsize'] = (12.0, 8.0)
#     plt.subplot(3, 1, 1)
#     plt.plot(range(1, len(silent)+1), silent, label='Silent Mutations')
#     plt.plot(range(1, len(nsilent)+1), nsilent, label='Non-Silent Mutation')
#     plt.ylabel("Mutated gene count")
#     plt.title(can + " silent vs non-silent mutations for every " + txt + " Spearman's corr.: " + str(silent.corr(nsilent, 'spearman')) + "\n")
#     plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#     plt.grid()
    
#     # High frequency mutations
#     plt.subplot(3, 1, 2)
#     nhigh_freq = nsilent.order(ascending=False).iloc[:20]*100/nsdf.shape[int(Gene_or_Sample)]
#     nhigh_freq.plot(kind='bar', rot=30)
#     plt.ylabel("Non-Silent Mutation Freq(%)")
#     plt.xlabel(txt)
#     plt.grid()

#     # High frequency mutations silent + non-silent
#     plt.subplot(3, 1, 3)
#     thigh_freq = total.order(ascending=False).iloc[:20]*100/sdf.shape[int(Gene_or_Sample)]
#     thigh_freq.plot(kind='bar', rot=30)
#     plt.ylabel("Overall Mutation Freq(%)")
#     plt.xlabel(txt)
#     plt.grid()

#     plt.tight_layout(pad=0.2, w_pad=0.5, h_pad=0.2)
#     print df.shape[int(Gene_or_Sample)]