Not all mutations are the same; here we analyze how the common mutation consequences are distributed.
In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the matplotlib plots in this notebook use its styling
sns.set()
We first count how many times each consequence type occurs across the mutations (read from a random sample of 100,000 mutations).
In [9]:
from collections import Counter
from ICGC_data_parser import SSM_Reader
# Open the sampled mutations file and get a parser for the CONSEQUENCE subfield
mutations = SSM_Reader(filename='data/ssm_sample.vcf')
consequences = mutations.subfield_parser('CONSEQUENCE')

# Tally the consequence type of every parsed CONSEQUENCE entry across all
# records; a single mutation may contribute several entries.
counter = Counter(
    annotation.consequence_type
    for record in mutations
    for annotation in consequences(record)
)

# Print each consequence type with its share of all tallied entries,
# most frequent first.
total = sum(counter.values())
for consequence_type, n in counter.most_common():
    print(f'{n/total :<10.3%} : {consequence_type}')
Out[9]: