In [1]:
from __future__ import division
import os
import math
%matplotlib inline
import numpy as np
import pylab
import matplotlib.pyplot as plt
from matplotlib import rc
import scipy.stats as stats
import pandas as pd
from clean_data import CANCER_TYPES
from IPython.html.widgets import interact
from IPython.html import widgets
from IPython.display import display
It is well known that the accumulation of the deleterious somatic mutations cause cancer. However, it has been observed that only very small number of genes are mutated with high frequency across tumor samples. The rest of mutations are found only in small number of patients. In other words mutational landscape of cancer has only few mountains and many hills (Vogelestien et. al., Science, 2013). It is hypothesized that there are only a few "driver" mutations that provide selective advantage to the tumor cells and the remaining mutations are passengers. The passenger mutations do not offer any selection benefit happen due to the random mutation process. The genes with driver and passenger mutations are also referred to as drivers and passengers as well.
However, there are many "hallmark" characteristics such as uncontrolled proliferation, evading immune response and programmed cell death, showing angiogenesis and metastasizing to remote tissues that are shared not only across all the tumors of a particular type but also across different cancers. Note that each of these characteristics requires cancer to manipulated multiple cellular pathways. Could it be achieved by alteration in only a handful number of gene? This seems unlikely. Then how to reconcile the both of these observations?
We wish to propose an alternative hypothesis. In addition to the "drivers" and "passengers" there is another class of mutations: "conductors". The conductor mutations are not drivers, but they assist driver mutations to cause cancers. They are distinct from passengers too. In contrast to the passenger mutations, the conductor mutations offer small selection advantage. In contrast to the driver mutations, they are larger in number and can have a great variety. On functional side they assist driver genes to acquire "hallmark" functionalities. Intuitively, since these conductor mutations need to provide only small selective advantage, there exists a wider range of options to choose from the genes involved in the "hallmark" pathways. The same idea can be understood from the perspective of pathways. The cancer is outcome of mis-regulated pathways that comprise several potential conductor genes in addition to a few driver and large number of potential passenger genes. Therefore, we propose to investigate mutation patterns aggregated at the level of pathways. At the minimum, such an analysis will complement extant gene-centric notion of driver genes.
With the above background, it is natural to ask if there are some pathways that show statistically significant difference between "silent" and "non-silent" mutation rates. In the following section, we attempt to identify pathways that indeed are enriched for non-silent mutations. This pan-cancer analysis is based on TCGA data.
The MAF files were downloaded from Broad Institute's GDAC website http://gdac.broadinstitute.org website. The MAF files were preprocessed with Firehose pipeline.
In [2]:
can_types = []
for c in CANCER_TYPES:
f1 = '../results/' + c + os.sep + 'silent_mutation_pathway_score.txt'
f2 = '../results/' + c + os.sep + 'nsilent_mutation_pathway_score.txt'
if os.path.exists(f1) and os.path.exists(f2):
can_types.append(c)
print "There are %d cancer types ready to be analysed" % len(can_types)
can_type_wid = widgets.DropdownWidget(description="Select Cancer Type", values=can_types)
display(can_type_wid)
In [11]:
can = can_type_wid.value
opt = 'silent'
input_fpath = '../results/' + can + os.sep + opt +'_mutation_pathway_score.txt'
sdf = pd.read_table(input_fpath, sep='\t', header=0, index_col=0)
opt = 'nsilent'
input_fpath = '../results/' + can + os.sep + opt +'_mutation_pathway_score.txt'
nsdf = pd.read_table(input_fpath, sep='\t', header=0, index_col=0)
# Take care of the fact that non-silent mutation has a prior probability that is
# twice of the silent mutation probability
nsdf = nsdf/1
print nsdf.shape
common = nsdf.index.intersection(sdf.index)
res = pd.Series(index=sdf.index)
fwer = nsdf.shape[0]
for p in common:
res.loc[p] = -1*math.log10(stats.ttest_ind(sdf.loc[p], nsdf.loc[p], equal_var=False)[1])
@interact(pval=widgets.FloatSliderWidget(min=res.min(), max=res.max(), value=max(res.max()-2, res.min()), step=1))
def plot_entiched(pval):
pylab.rcParams['figure.figsize'] = (12.0, 8.0)
res[res > pval].order().plot(title=can + " Enriched Pathways vs Boneferroni t-test P-values (-log10)", kind='barh', rot=0)
print('Common pathways: %d, Patients: %d' %(len(common), sdf.shape[1]))
In [46]:
res.order(ascending=False).head(10)
Out[46]:
In [48]:
from pathway_analysis import preprocess_pathway_data
pdf = pd.DataFrame(preprocess_pathway_data().count(), columns=['Genes'])
x = pd.DataFrame(res.order(ascending=False).head(20), columns=['minus-log10-pval']).join(pdf, how='inner')
x.sort_index(by=['minus-log10-pval'], ascending=False)
Out[48]:
In [22]:
from pathway_analysis import preprocess_pathway_data
pdf = preprocess_pathway_data()
pdf.count().head()
Out[22]:
In [48]:
nsdf.ix['KEGG_MELANOMA'].plot(kind='hist', bins=50)
Out[48]:
In [50]:
sdf.ix['KEGG_MELANOMA'].plot(kind='hist', bins=50)