Are there specific genes in which a significant portion of the mutations fall? We want to answer this by finding the distribution of the number of mutations per gene. That is, for each integer, we want to know how many genes have that number of mutations.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import optimize
sns.set()
We first map genes to the number of mutations they harbor (read from a random sample of 100,000 mutations)
In [2]:
from collections import Counter
from ICGC_data_parser import SSM_Reader
mutations_per_gene = Counter()
mutations = SSM_Reader(filename='/home/ad115/Downloads/simple_somatic_mutation.aggregated.vcf.gz')
# Fix weird bug due to malformed description headers
mutations.infos['studies'] = mutations.infos['studies']._replace(type='String')
consequences = mutations.subfield_parser('CONSEQUENCE')
for i, record in enumerate(mutations):
if i % 100000 == 0:
print(i)
affected_genes = [c.gene_symbol for c in consequences(record) if c.gene_affected]
mutations_per_gene.update(affected_genes)
mutations_per_gene.most_common(5)
Out[2]:
In [3]:
len(mutations_per_gene)
Out[3]:
Now we want to group by number of mutations
In [4]:
distribution = Counter(mutations_per_gene.values())
distribution.most_common(10)
Out[4]:
Now we plot the data...
In [5]:
x = sorted(distribution.keys())
y = [distribution[i] for i in x]
plt.figure(figsize=(10, 7))
plt.plot(x, y)
plt.yscale('log')
plt.xscale('log')
plt.title('Mutation distribution by gene')
plt.xlabel('$n$')
plt.ylabel('genes with $n$ mutations')
plt.show()
We can see the data resembles a power law but does not quite fit. It looks like it has a bump in the middle, this may be because the genes have wildly varying lengths. In order to correct this we have to normalize the mutations per gene by the length of the gene. This is done as follows:
In [6]:
# In order to find out the length of the
# genes, we will use the Ensembl REST API.
import ensembl_rest
from itertools import islice
def chunks_of(iterable, size=10):
"""A generator that yields chunks of fixed size from the iterable."""
iterator = iter(iterable)
while True:
next_ = list(islice(iterator, size))
if next_:
yield next_
else:
break
# ---
# Instantiate a client for communication with
# the Ensembl REST API.
client = ensembl_rest.EnsemblClient()
normalized_counts = dict()
lengths_distribution = Counter()
for i, gene_batch in enumerate(chunks_of(mutations_per_gene, size=1000)):
# Get information of the genes
gene_data = client.symbol_post('human',
params={'symbols': gene_batch})
gene_lengths = {gene: data['end'] - data['start'] + 1
for gene, data in gene_data.items()}
lengths_distribution.update(gene_lengths.values())
# Get the normalization
normalized_counts.update({
gene: mutations_per_gene[gene] / gene_lengths[gene]
for gene in gene_data
})
print((i+1)*1000)
In [7]:
c = Counter()
c.update(normalized_counts)
c.most_common(10)
Out[7]:
In [8]:
normalized_distribution = Counter(normalized_counts.values())
normalized_distribution.most_common(10)
Out[8]:
In [9]:
x = sorted(normalized_distribution.keys())
y = [normalized_distribution[i] for i in x]
plt.figure(figsize=(10, 7))
plt.plot(x, y)
plt.xscale('log')
plt.title('Mutations per base distribution by gene (normalized)')
plt.xlabel('$x$')
plt.ylabel('genes with $x$ mutations per base pair')
plt.show()
In [10]:
max(lengths_distribution)
Out[10]:
In [11]:
min(lengths_distribution)
Out[11]:
In [12]:
lengths_distribution.most_common(5)
Out[12]:
In [13]:
x = sorted(lengths_distribution.keys())
y = [lengths_distribution[i] for i in x]
plt.figure(figsize=(10, 7))
plt.plot(x, y)
plt.xscale('log')
plt.yscale('log')
plt.title('Gene lengths distribution')
plt.xlabel('$L$')
plt.ylabel('genes with length $L$')
plt.savefig('gene-lengths.png')
plt.show()
In [14]:
lengths_distribution
Out[14]:
In [ ]:
In [ ]: