In [1]:
import pandas as pd
import glob as glob
import matplotlib.pyplot as plt
from __future__ import division
from scipy import stats
%matplotlib inline
import numpy as np
import synapseclient
execfile('common_functions.py')
A major concern with releasing patient genomic data is that germline variants may be released alongside the reported somatic mutations, opening a potential opportunity for sample reidentification. We explore how many common variants are observed in our dataset, in aggregate and by center, versus TCGA. A simple two-tailed t-test was implemented by the scipy statistics package to compare the distribution of common variants in GENIE to TCGA.
Our filter to flag germline variants is defined as the following:
Both TCGA and GENIE mafs were annotated with VEP and VCF2MAF (https://github.com/mskcc/vcf2maf)
We hypothesize that GENIE contains significantly less common variants than TCGA when considered in aggregate and by center. This is likely heavily confounded by the considered sites of samples in TCGA versus GENIE as many GENIE centers utilized targeted panels tailored to their genomic coordinates of interest.
(Something about this analysis is address that we are presenting potential germline sites than TCGA on average, we will present similar analyses to reduce the germline variants)
In [2]:
df = pd.read_csv('/Users/brendan/Analysis/genie/Project Genie/data_mutations_extended.txt', sep = '\t',
low_memory = False, comment = '#')
desired_classes = ['.', 'PASS', 'R8', 'SB', 'LOWGQ']
df_keep = df[df['FILTER'].isin(desired_classes)]
df_reject = df[~df['FILTER'].isin(desired_classes)]
print ''
print 'GENIE consists of', str(len(df['Tumor_Sample_Barcode'].unique())), 'tumor samples containing', str(len(df)), 'variants.'
print ''
print 'Our filter suggests that there are', str(len(df_reject)),'common variants within GENIE,'
print 'constituting a germline contamination rate of', str((10405)/(10405+90993)*100) + '%'
In [3]:
plot_title = 'Common variants per sample, GENIE'
variants_per_sample = pd.DataFrame([], columns = ['sample', 'center', '# variants', '# common variants', '# uncommon variants'])
variants_per_sample.ix[:,'sample'] = df['Tumor_Sample_Barcode'].unique()
desired_classes = ['.', 'PASS', 'R8', 'SB', 'LOWGQ']
for sample_ in df['Tumor_Sample_Barcode'].unique():
index_ = variants_per_sample[variants_per_sample['sample'] == sample_].index.tolist()[0]
df_sample_ = df[df['Tumor_Sample_Barcode'] == sample_]
variants_per_sample.ix[index_, 'center'] = df_sample_['Center'].unique().tolist()[0]
variants_per_sample.ix[index_, '# variants'] = len(df_sample_)
idx_common_variants = df_sample_['FILTER'].isin(desired_classes)
variants_per_sample.ix[index_, '# common variants'] = len(df_sample_[~idx_common_variants])
variants_per_sample.ix[index_, '# uncommon variants'] = len(df_sample_[idx_common_variants])
print 'For GENIE samples...'
print '...mean number of variants:', variants_per_sample.ix[:,'# variants'].mean()
print '...mean number of common variants:', variants_per_sample.ix[:,'# common variants'].mean()
print '...std dev of common variants:', variants_per_sample.ix[:,'# common variants'].std()
print '...max number of common variants:', variants_per_sample.ix[:, '# common variants'].max()
print ''
frac = len(variants_per_sample[variants_per_sample['# common variants'] <= variants_per_sample.ix[:,'# common variants'].mean()])
frac = frac/len(variants_per_sample)*100
print str(frac) + '% of samples have less than the mean number of common variants'
In [4]:
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(variants_per_sample['# common variants'], 40, facecolor = tableau20[0], alpha = 0.75)
plt.axis([0, 40, 0, 300])
plt.title('Number of common variants, GENIE', fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
In [5]:
tcga_files = glob.glob('mafs/*')
cols = ['sample id', 'tumor type', '# variants', '# common variants', '# uncommon variants']
df_variants_tcga = pd.DataFrame([], columns = cols)
list_ = []
for file_ in tcga_files:
tumortype = file_.split('/')[1].split('_')[1]
print '--- Processing tumor type', str(tumortype) + ':', file_, '---'
df_tumortype = pd.read_csv(file_, sep = '\t', low_memory = False)
df_ = pd.DataFrame([], columns = cols)
df_.ix[:,'sample id'] = df_tumortype['Tumor_Sample_Barcode'].unique()
df_.ix[:,'tumor type'] = tumortype
print len(df_.ix[:,'sample id'].unique()), 'samples found for this tumor type with', str(len(df_)), 'variants in total'
print 'Extracting number of common variants per sample in this tumor type...'
for sample_ in df_tumortype['Tumor_Sample_Barcode'].unique():
index_ = df_[df_['sample id'] == sample_].index.tolist()[0]
df_sample_ = df_tumortype[df_tumortype['Tumor_Sample_Barcode'] == sample_]
df_.ix[index_, '# variants'] = len(df_sample_)
idx_common_variants = df_sample_['FILTER'].str.contains('common_variant') # Change if we change the filter
df_.ix[index_,'# common variants'] = len(df_sample_[idx_common_variants])
df_.ix[index_,'# uncommon variants'] = len(df_sample_[~idx_common_variants])
# Add piece to record progress along samples in a given tissue type
print '... all samples have now processed for this tumor type (' + tumortype + ')'
print ''
print ''
list_.append(df_)
df_variants_tcga = pd.concat(list_, ignore_index = True)
In [6]:
print 'For TCGA samples...'
print '...mean number of variants:', df_variants_tcga.ix[:,'# variants'].mean()
print '...mean number of common variants:', df_variants_tcga.ix[:,'# common variants'].mean()
print '...std dev of common variants:', df_variants_tcga.ix[:,'# common variants'].std()
print '...max number of common variants:', df_variants_tcga.ix[:, '# common variants'].max()
print ''
frac = len(df_variants_tcga[df_variants_tcga['# common variants'] <= df_variants_tcga.ix[:,'# common variants'].mean()])
frac = frac/len(df_variants_tcga)*100
print str(frac) + '% of samples have less than the mean number of common variants'
In [7]:
print 'On the scale of the GENIE figure'
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(df_variants_tcga['# common variants'], bins = 400, facecolor = tableau20[0], alpha = 0.75)
plt.axis([0, 40, 0, 400])
plt.title('Number of common variants, TCGA', fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
In [8]:
print 'Number of common variants 0 -> 60'
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(df_variants_tcga['# common variants'], bins = 400, facecolor = tableau20[0], alpha = 0.75)
plt.axis([0, 40, 0, 4000])
plt.title('Number of common variants, TCGA', fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
In [9]:
print 'Number of common variants 60 -> 850'
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(df_variants_tcga['# common variants'], bins = 120, facecolor = tableau20[0], alpha = 0.75)
plt.axis([60, 850, 0, 50])
plt.title('Number of common variants, TCGA', fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
In [10]:
variants_genie = variants_per_sample
variants_tcga = df_variants_tcga
common_variants_genie = variants_genie['# common variants']
common_variants_tcga = variants_tcga['# common variants']
print 'We compare the # of common variants to TCGA by center'
print ''
centers = ['VICC', 'MSK', 'GRCC', 'JHH', 'UHN', 'NKI', 'DFCI', 'MDA']
for center_ in centers:
print '----', center_, '----'
df_center = variants_genie[variants_genie['center'] == center_]
print 'mean:', df_center['# common variants'].mean()
print 'std dev:', df_center['# common variants'].std()
print 'Against TCGA:'
statistic, pvalue = stats.ttest_ind(common_variants_tcga, df_center['# common variants'], equal_var = False)
print '...test statistic:', statistic
print '...p-value:', pvalue
print ''
print ''
print 'What about the entire data sets? GENIE v TCGA'
print ''
print 'GENIE mean:', common_variants_genie.mean(), '; GENIE std dev:', common_variants_genie.std()
print 'TCGA mean:', common_variants_tcga.mean(), '; TCGA std dev:', common_variants_tcga.std()
statistic, pvalue = stats.ttest_ind(common_variants_tcga, common_variants_genie, equal_var = False)
print '...test statistic:', statistic
print '...p-value:', pvalue
print ''
In [11]:
variants_genie = variants_per_sample
variants_tcga = df_variants_tcga
common_variants_genie = variants_genie['# common variants']
common_variants_tcga = variants_tcga['# common variants']
centers = ['VICC', 'MSK', 'GRCC', 'JHH', 'UHN', 'NKI', 'DFCI', 'MDA']
for center_ in centers:
print '----', center_, '----'
df_center = variants_genie[variants_genie['center'] == center_]
print 'mean:', df_center['# common variants'].mean()
print 'std dev:', df_center['# common variants'].std()
print 'Against TCGA:'
statistic, pvalue = stats.ttest_ind(common_variants_tcga, df_center['# common variants'], equal_var = False)
print '...test statistic:', statistic
print '...p-value:', pvalue
print ''
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(df_center['# common variants'], facecolor = tableau20[0], alpha = 0.75)
#plt.axis([0, 40, 0, df_center['# common variants'].mean()*2])
plt.title('Number of common variants, ' + center_, fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
In [12]:
print 'How many samples in GENIE contain more common variants than the mean # common variants in TCGA samples?'
print ''
print 'To reiterate, TCGA samples on average have', str(common_variants_tcga.mean()), 'common variants'
top_genie = variants_genie[variants_genie['# common variants'] > common_variants_tcga.mean()]
print len(top_genie['sample'].unique()), '/', str(len(variants_genie['sample'].unique())), \
'(' + str(len(top_genie['sample'].unique()) / len(variants_genie['sample'].unique()) *100) + '%)', \
'samples in GENIE have more common variants than this'
plt.figure(figsize=(14, 4))
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on")
plt.hist(top_genie['# common variants'], facecolor = tableau20[0], alpha = 0.75)
plt.title('Common variants in highest risk GENIE samples (> 12.4 common variants)', fontsize = 14)
plt.xlabel('Number of common variants')
plt.grid(True)
plt.show()
Return to top
We have shown that GENIE contains many less common variants than TCGA, an already publicly available dataset, and believe that the GENIE dataset is at low risk for potential reidentification for any sample.
In [ ]: