In [ ]:
from __future__ import division
%matplotlib inline
import os
import math
import numpy as np
import matplotlib.pyplot as plt
import pylab
from matplotlib import rc
import scipy as sc
import pandas as pd
import scipy.stats as stats
from clean_data import CANCER_TYPES
from IPython.html.widgets import interact
from IPython.html import widgets
from IPython.display import display
from mutation_analysis import get_pancancer_mutation_summary
In [ ]:
can_types = []
for can in CANCER_TYPES:
f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
if os.path.exists(f1) and os.path.exists(f2):
can_types.append(can)
print "There are %d cancer types ready to be analysed" % len(can_types)
@interact(cancer_type=widgets.DropdownWidget(values=can_types), Gene_or_Sample=widgets.DropdownWidget(values={'GENE':1, 'SAMPLE':0}))
def plot_corr(cancer_type, Gene_or_Sample):
can = cancer_type
f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
nsdf = pd.read_table(f1, header=0, index_col=0)
nsilent = nsdf.sum(axis=Gene_or_Sample)
f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
sdf = pd.read_table(f2, header=0, index_col=0)
silent = sdf.sum(axis=Gene_or_Sample)
print "# Total non-silent mutation: %d, # total silent mutation: %d" %(sum(nsilent), sum(silent))
print "# Per SAMPLE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/sdf.shape[1], sum(silent)/sdf.shape[1])
print "# Per GENE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/nsdf.shape[0], sum(silent)/nsdf.shape[0])
print nsdf.shape, sdf.shape
total = silent + nsilent
if Gene_or_Sample:
txt = 'GENE'
else:
txt = 'SAMPLE'
pylab.rcParams['figure.figsize'] = (12.0, 8.0)
plt.subplot(3, 1, 1)
plt.plot(range(1, len(silent)+1), silent/sdf.count(axis=Gene_or_Sample), label='Silent Mutations')
plt.plot(range(1, len(nsilent)+1), nsilent/nsdf.count(axis=Gene_or_Sample), label='Non-Silent Mutation')
plt.ylabel("Mutation Frequency")
plt.title(can + " silent vs non-silent mutations for every " + txt + " Spearman's corr.: " + str(silent.corr(nsilent, 'spearman')) + "\n")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# plt.yscale('log')
plt.grid()
# High frequency mutations
plt.subplot(3, 1, 2)
nhigh_freq = nsilent.order(ascending=False).iloc[:20]*100/nsdf.shape[int(Gene_or_Sample)]
nhigh_freq.plot(kind='bar', rot=30)
plt.ylabel("Non-Silent Mutation Freq(%)")
plt.xlabel(txt)
plt.grid()
# High frequency mutations silent + non-silent
plt.subplot(3, 1, 3)
thigh_freq = total.order(ascending=False).iloc[:20]*100/sdf.shape[int(Gene_or_Sample)]
thigh_freq.plot(kind='bar', rot=30)
plt.ylabel("Overall Mutation Freq(%)")
plt.xlabel(txt)
plt.grid()
plt.tight_layout(pad=0.2, w_pad=0.5, h_pad=0.2)
#print df.shape[int(Gene_or_Sample)]
In [ ]:
gene = 0; refresh = True
ns_summary, s_summary = get_pancancer_mutation_summary(gene, refresh)
outdir = '../data/pancancer'
if gene:
ns_outf = outdir + os.sep + 'non_silent_mutations_genes_all.txt'
s_outf = outdir + os.sep + 'silent_mutations_genes_all.txt'
outf='pancancer_genewise_mutation_boxplot.png'
else:
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt'
outf='pancancer_samplewise_mutation_boxplot.png'
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)
fig = plotly_boxplots(ns_summary, s_summary, outf)
# plot_url = py.plot(fig, filename='box-grouped')
# py.iplot(fig, filename = 'basic-line')
In [ ]:
import plotly
import plotly.plotly as py
from plotly.graph_objs import *
plotly.tools.set_credentials_file(username='extra.nitin', api_key='auykqr8hfa', stream_ids=['kvd6yka2q7', 'u6wv80rlbm'])
def plotly_boxplots(ns_summary, s_summary, outf):
""" Plot Non-Silent and Silent Mutations box-plot across all
cancer types
"""
s_stacked = s_summary.stack()
s_stacked.index = s_stacked.index.droplevel(0)
s_stacked = pd.DataFrame(s_stacked).sort_index()
silent_x = list(s_stacked.index)
silent_y = list(s_stacked.values)
ns_stacked = ns_summary.stack()
ns_stacked.index = ns_stacked.index.droplevel(0)
ns_stacked = pd.DataFrame(ns_stacked).sort_index()
nsilent_x = list(ns_stacked.index)
nsilent_y = list(ns_stacked.values)
# Now plot the box plots
trace1 = Box(
y=silent_y,
x=silent_x,
name='Silent',
marker=Marker(
color='#3D9970'
)
)
trace2 = Box(
y=nsilent_y,
x=nsilent_x,
name='Non-Silent',
marker=Marker(
color='#FF4136'
)
)
data = Data([trace1, trace2])
layout = Layout(
title='Mutation Landscape of the TCGA cancer cohorts',
titlefont=Font(
size=26,
),
xaxis=XAxis(
tickangle=-45,
tickfont=Font(
size=18,
),
),
yaxis=YAxis(
title='Number of Mutations',
titlefont=Font(
size=22,
),
zeroline=False,
type='log',
autorange=True
),
autosize=False,
width=1800,
height=1200,
boxmode='group'
)
fig = Figure(data=data, layout=layout)
pylab.rcParams['figure.figsize'] = (18.0, 12.0)
py.image.ishow(fig)
out_png = '../figures/' + outf
py.image.save_as(fig, out_png)
return fig
In [ ]:
gene = 0; refresh = True
ns_summary, s_summary = get_pancancer_mutation_summary(gene, refresh)
outdir = '../data/pancancer'
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt'
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)
pylab.rcParams['figure.figsize'] = (18.0, 12.0)
fig, axes = plt.subplots(nrows=2, ncols=1)
ns_summary.plot(ax=axes[0], kind='box', logy=True, rot=30); axes[0].set_ylabel('Number of Non-Silent Mutation in Samples')
s_summary.plot(ax=axes[1], kind='box', logy=True, rot=30); axes[1].set_ylabel('Number of Silent Mutation in Samples');
In [ ]:
import brewer2mpl
outdir = '../data/pancancer'
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt'
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)
both_df = ns_summary.join(s_summary, lsuffix='_NS')
plt.close('all')
fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(18, 18), dpi=1200)
pylab.rcParams['figure.max_open_warning'] = 0
set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
for idx, c in enumerate(ns_summary.columns):
tmp_df = both_df[[c+'_NS', c]].dropna()#.fillna(0)
x = tmp_df[c].values
y = tmp_df[c+'_NS'].values
i = idx//5; j = idx%5
axes[i, j].scatter(x, y, color=set2[idx%8])
axes[i, j].set_xscale('log')
axes[i, j].set_yscale('log')
axes[i, j].set_title('%s %.2f, %.2f' %(c, stats.pearsonr(x, y)[0], stats.spearmanr(x,y)[0]))
if i==2 and j == 0:
axes[i, j].set_ylabel('Non-Silent Mutations', size=20)
if i==4 and j == 2:
axes[i, j].set_xlabel('Silent Mutations', size=20)
axes[i, j].grid()
# fig.suptitle('Non-Silent v/s Silent Mutations: Pearson and Spearman Correlations\n', size=12)
plt.tight_layout()
plt.show()
fig.savefig('../figures/pancancer_mutation_individual_correlation.eps', format='eps')
In [ ]:
import brewer2mpl
outdir = '../data/pancancer'
ns_outf = outdir + os.sep + 'non_silent_mutations_samples_all.txt'
s_outf = outdir + os.sep + 'silent_mutations_samples_all.txt'
ns_summary = pd.read_table(ns_outf, index_col=0, header=0)
s_summary = pd.read_table(s_outf, index_col=0, header=0)
both_df = ns_summary.join(s_summary, lsuffix='_NS')
# pylab.rcParams['figure.max_open_warning'] = 0
# pylab.rcParams['figure.figsize'] = (22.0, 12.0)
set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
markers = ['x', 'o', '*']
fig, ax = plt.subplots(1, figsize=(16, 14), dpi=1200)
for i, c in enumerate(ns_summary.columns):
tmp_df = both_df[[c+'_NS', c]].dropna()#.fillna(0)
x = tmp_df[c].values
y = tmp_df[c+'_NS'].values
ax.scatter(x, y, color=set2[i%8], marker=markers[i%3], label='%s, P: %.2f, S: %.2f' %(c, stats.pearsonr(x, y)[0], stats.spearmanr(x,y)[0]))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Silent Mutations', size=18)
ax.set_ylabel('Non-Silent Mutations', size=18)
# The overall correlataion
xa = ns_summary.fillna(0).sum(1)
ya = s_summary.fillna(0).sum(1)
common = xa.index.intersection(ya.index)
xa = xa[common].values
ya = ya[common].values
plt.title('Pancancer Non-Silent v/s Silent Mutations Correlation Pearson: %.2f, Spearman: %.2f' %(stats.pearsonr(xa, ya)[0], stats.spearmanr(xa,ya)[0]), size=20)
# ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
# ncol=5, fancybox=True, shadow=True)
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.legend(loc='upper left')
plt.grid()
fig.savefig('../figures/pancancer_mutation_aggregated_correlation.eps', format='eps')
In [ ]:
for can in CANCER_TYPES:
f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
if os.path.exists(f1) and os.path.exists(f2):
can_types.append(can)
print "There are %d cancer types ready to be analysed" % len(can_types)
#@interact(cancer_type=widgets.DropdownWidget(values=can_types), Gene_or_Sample=widgets.DropdownWidget(values={'GENE':1, 'SAMPLE':0}))
def plot_corr(cancer_type, Gene_or_Sample):
can = cancer_type
f1 = '../data/processed/' + can + os.sep + can + '_nsilent_mutation.txt'
nsdf = pd.read_table(f1, header=0, index_col=0)
nsilent = nsdf.sum(axis=Gene_or_Sample)*-1
f2 = '../data/processed/' + can + os.sep + can + '_silent_mutation.txt'
sdf = pd.read_table(f2, header=0, index_col=0)
silent = sdf.sum(axis=Gene_or_Sample)
nsamples = nsdf.shape[1]
return (nsilent, silent, nsamples)
# print "# Total non-silent mutation: %d, # total silent mutation: %d" %(sum(nsilent), sum(silent))
# print "# Per SAMPLE non-silent mutation: %f, # silent mutation: %f" %(sum(nsilent)/sdf.shape[1], sum(silent)/sdf.shape[1])
# print "# Per GENE non-silent mutation: %f, # total silent mutation: %f" %(sum(nsilent)/nsdf.shape[0], sum(silent)/nsdf.shape[0])
# print nsdf.shape, sdf.shape
# total = silent + nsilent
# if Gene_or_Sample:
# txt = 'GENE'
# else:
# txt = 'SAMPLE'
# pylab.rcParams['figure.figsize'] = (12.0, 8.0)
# plt.subplot(3, 1, 1)
# plt.plot(range(1, len(silent)+1), silent, label='Silent Mutations')
# plt.plot(range(1, len(nsilent)+1), nsilent, label='Non-Silent Mutation')
# plt.ylabel("Mutated gene count")
# plt.title(can + " silent vs non-silent mutations for every " + txt + " Spearman's corr.: " + str(silent.corr(nsilent, 'spearman')) + "\n")
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# plt.grid()
# # High frequency mutations
# plt.subplot(3, 1, 2)
# nhigh_freq = nsilent.order(ascending=False).iloc[:20]*100/nsdf.shape[int(Gene_or_Sample)]
# nhigh_freq.plot(kind='bar', rot=30)
# plt.ylabel("Non-Silent Mutation Freq(%)")
# plt.xlabel(txt)
# plt.grid()
# # High frequency mutations silent + non-silent
# plt.subplot(3, 1, 3)
# thigh_freq = total.order(ascending=False).iloc[:20]*100/sdf.shape[int(Gene_or_Sample)]
# thigh_freq.plot(kind='bar', rot=30)
# plt.ylabel("Overall Mutation Freq(%)")
# plt.xlabel(txt)
# plt.grid()
# plt.tight_layout(pad=0.2, w_pad=0.5, h_pad=0.2)
# print df.shape[int(Gene_or_Sample)]