In [ ]:
import gzip

# --- Input selection ---------------------------------------------------------
# Exactly one (filename_summary, filename_out, data_type) trio and one
# (h_sample_name, h_mut_type) pair should be active at a time; the alternatives
# are kept commented out so the notebook can be switched between data sets.
#
#   filename_summary : tab-delimited mutation table to read ('.gz' is read via gzip)
#   filename_out     : per-sample frequency table written by a later cell
#   data_type        : label interpolated into the plot title

# Option 1: COSMIC cell-line export.
#filename_summary = '../../data/COSMIC+Cancer/CosmicCLP_MutantExport.summary.gz'
#filename_out = '../../data/COSMIC+Cancer/CosmicCLP_MutantExport.sample_freq'
#data_type = 'Cell Lines'

# Option 2: COSMIC mutant export.
#filename_summary = '../../data/COSMIC+Cancer/CosmicMutantExport.summary.gz'
#filename_out = '../../data/COSMIC+Cancer/CosmicMutantExport.sample_freq'
#data_type = 'Cancer Samples'

# Header names for the options above (presumably the COSMIC exports -- confirm
# against the actual file header before enabling).
#h_sample_name = 'Sample name'
#h_mut_type = 'Mutation Description'

# Option 3 (active): Kandoth 2013 supplementary table 2, plain text.
filename_summary = '../../data/COSMIC+Cancer/Kandoth2013_Nature_STable2.txt'
filename_out = '../../data/COSMIC+Cancer/Kandoth2013_Nature_STable2.sample_freq'
data_type = 'Cancer Samples'

# Header names looked up in the first line of filename_summary to locate the
# sample-name column and the mutation-type column.
h_sample_name = 'Tumor_Sample'
h_mut_type = 'Variant_Classification'

In [ ]:
# Tally mutation calls per sample from the tab-delimited summary table.
# mut_freq maps sample name -> {'subs': n, 'indel': n, 'others': n}.
mut_freq = dict()

# Open the file exactly once (the gzip reader for '.gz' inputs, the builtin
# otherwise) and let the context manager close it even if parsing fails.
open_summary = gzip.open if filename_summary.endswith('.gz') else open
with open_summary(filename_summary, 'rt') as f_summary:
    # Only the line terminator is removed: stripping all whitespace would drop
    # leading/trailing empty columns and shift every column index.
    headers = f_summary.readline().rstrip("\r\n").split("\t")
    print(headers)

    # Raises ValueError if a configured header name is missing from the file.
    idx_sample_name = headers.index(h_sample_name)
    idx_mut_desc = headers.index(h_mut_type)

    for line in f_summary:
        # Drop the newline so a value in the last column is not stored with a
        # trailing "\n"; skip blank lines (e.g. at the end of the file).
        line = line.rstrip("\r\n")
        if not line:
            continue
        tokens = line.split("\t")

        tmp_sample_name = tokens[idx_sample_name]
        if tmp_sample_name not in mut_freq:
            mut_freq[tmp_sample_name] = {'subs':0, 'indel':0, 'others':0}

        tmp_mut_desc = tokens[idx_mut_desc]
        # Any mutation type containing 'Frame' is counted as an indel; everything
        # else goes to 'others'. The 'subs' bucket is intentionally left at 0 here.
        # An alternative classification (presumably for the COSMIC exports --
        # confirm) used prefixes instead:
        #   startswith('Insertion') or startswith('Deletion') -> 'indel'
        #   startswith('Substitution')                        -> 'subs'
        if 'Frame' in tmp_mut_desc:
            mut_freq[tmp_sample_name]['indel'] += 1
        else:
            mut_freq[tmp_sample_name]['others'] += 1

In [ ]:
# Write one row of mutation counts per sample to filename_out and collect the
# per-sample totals / indel counts used for the summary statistics and plot.
indel_count_list = []
mut_count_list = []

# Number of samples skipped because they carry fewer than 10 mutations.
count_small_mut = 0

# The context manager guarantees the output file is flushed and closed even if
# a write fails part-way through.
with open(filename_out,'w') as f_out:
    # Every column is tab-separated (the tab before 'Indels' was missing).
    f_out.write('#SampleName\tTotalMutation\tSubstitutions\tIndels\tOthers\n')
    for tmp_s in sorted(mut_freq):
        count_total = sum(mut_freq[tmp_s].values())
        if count_total < 10:
            # Too few mutations: excluded from the output and the statistics.
            count_small_mut += 1
            continue

        count_subs = mut_freq[tmp_s]['subs']
        count_indel = mut_freq[tmp_s]['indel']
        count_others = mut_freq[tmp_s]['others']
        f_out.write("%s\t%d\t%d\t%d\t%d\n"%(tmp_s, count_total, count_subs, count_indel, count_others))

        mut_count_list.append(count_total)
        indel_count_list.append(count_indel)

# Samples that passed the filter; equals len(indel_count_list).
count_cell_lines = len(mut_freq) - count_small_mut
if count_cell_lines > 0:
    mean_indel_count = sum(indel_count_list)/count_cell_lines
    # Element at index n//2 of the sorted counts: the middle value for odd n,
    # the upper of the two middle values for even n.
    median_indel_count = sorted(indel_count_list)[count_cell_lines//2]
else:
    # No sample passed the filter: avoid ZeroDivisionError / IndexError and
    # report neutral statistics instead.
    mean_indel_count = 0.0
    median_indel_count = 0

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8,4))
ax1 = fig.add_subplot(1,1,1)
ax1.hist([min(x,100) for x in indel_count_list], bins=100)
#ax1.hist([min(x,100) for x in mut_count_list], bins=100)
ax1.set_ylabel('Number of cell lines')
ax1.set_xlabel('Number of Exonic Indels')
ax1.grid()
ax1.set_title('Number of Indels in %d %s (Mean: %.1f, Median:%d)'\
              %(count_cell_lines, data_type, mean_indel_count, median_indel_count))
plt.show()

count_gt10_indels = len([x for x in indel_count_list if x > 10])
pct_gt10_indels = count_gt10_indels/count_cell_lines*100

print("Samples with < 10 mutations: %d"%count_small_mut)
print(">10 indels: %d (%.2f pct)"%(count_gt10_indels, pct_gt10_indels))

In [ ]: