In [1]:
import NotebookImport
from TCGA_analysis_PanCancer_import import *
from IPython.display import HTML
In [13]:
# 1) check lengths of segments having copynumber > 4 in known focal amplification genes (Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers", Supp Table2 Known Targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
focal_sizes=list()
nonfocal_sizes = list()
for sample in samples.values():
for gene in beroukhim_genes:
if gene in gene_positions.keys():
log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and copynumber > 4:
focal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
if copynumber != "n/a" and copynumber < 4:
nonfocal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
fig, axs = plt.subplots(1,2,figsize=(12,3))
focal_sizes_series = pandas.Series(focal_sizes)
fig_focal=focal_sizes_series.plot(ax=axs[0],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[0].vlines(x=20000000, ymin=0, ymax=2000, color='r')
axs[0].set_xlabel("Size of target genes at copynumber > 4")
#output=fig_focal.get_figure()
#plt.show()
#output.savefig('BRCA_sizes_copynumber_over4_known.png')
mean_focal = focal_sizes_series.mean()
std_focal = focal_sizes_series.std()
nonfocal_sizes_series = pandas.Series(nonfocal_sizes)
fig_nonfocal=nonfocal_sizes_series.plot(ax=axs[1],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[1].vlines(x=20000000, ymin=0, ymax=10000, color='r')
axs[1].set_xlabel("Size of target genes at copynumber < 4")
#output1=fig_nonfocal.get_figure()
#plt.show()
#output1.savefig('BRCA_sizes_copynumber_lower4_known.png')
mean_nonfocal = nonfocal_sizes_series.mean()
std_nonfocal = nonfocal_sizes_series.std()
print ""
print " 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications"
print " Segments copynumber > 4 and known focal amplifications:"
print " Mean: "+str(mean_focal)
print " Standard Deviation: "+str(std_focal)
print " ---------------------------------"
print " Segments copynumber < 4:"
print " Mean: "+str(mean_nonfocal)
print " Standard Deviation: "+str(std_nonfocal)
print " ---------------------------------"
In [14]:
# 2) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
if sample.getRNASeqFromGene("MYC") != "n/a" and size != None:
MYC_expression.append(sample.getRNASeqFromGene("MYC"))
MYC_size.append(size)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC size': MYC_size}
MYC_expr_size=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_size.plot(kind='scatter', x='MYC expression', y='MYC size')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_size)
print ""
print " 2) correlation of lengths and gene expression of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [15]:
# 3) correlation of lengths and gene expression of CCND1
CCND1_expression = list()
CCND1_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
if sample.getRNASeqFromGene("CCND1") != "n/a" and size != None:
CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
CCND1_size.append(size)
CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 size': CCND1_size}
CCND1_expr_size=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_size.plot(kind='scatter', x='CCND1 expression', y='CCND1 size')
plt.hlines(y=20000000, xmin=-50000, xmax=250000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_size)
print ""
print " 3) correlation of lengths and gene expression of CCND1"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [16]:
# 4) correlation of lengths and gene expression of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_expression = list()
gene_size = list()
for sample in samples.values():
for gene in beroukhim_genes:
size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
if sample.getRNASeqFromGene(gene) != "n/a" and size != None:
gene_expression.append(sample.getRNASeqFromGene(gene))
gene_size.append(size)
gene_expr_tuple={'Gene expression': gene_expression,'Gene size': gene_size}
gene_expr_size=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_size.plot(kind='scatter', x='Gene expression', y='Gene size')
plt.hlines(y=20000000, xmin=-50000, xmax=400000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_size)
print ""
print " 4) correlation of lengths and gene expression of Beroukhim targets"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [17]:
# 5) correlation of lengths and copynumber of MYC
MYC_copynumber = list()
MYC_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
MYC_copynumber.append(copynumber)
MYC_size.append(size)
MYC_copynumber_tuple={'MYC copynumber': MYC_copynumber,'MYC size': MYC_size}
MYC_copynumber_size=pandas.DataFrame(MYC_copynumber_tuple)
fig_correlation= MYC_copynumber_size.plot(kind='scatter', x='MYC copynumber', y='MYC size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_copynumber,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_copynumber,MYC_size)
print ""
print " 5) correlation of lengths and copynumber of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [18]:
# 6) correlation of lengths and copynumber of CCND1
CCND1_copynumber = list()
CCND1_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
CCND1_copynumber.append(copynumber)
CCND1_size.append(size)
CCND1_copynumber_tuple={'CCND1 copynumber': CCND1_copynumber,'CCND1 size': CCND1_size}
CCND1_copynumber_size=pandas.DataFrame(CCND1_copynumber_tuple)
fig_correlation= CCND1_copynumber_size.plot(kind='scatter', x='CCND1 copynumber', y='CCND1 size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_copynumber,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_copynumber,CCND1_size)
print ""
print " 6) correlation of lengths and copynumber of CCND1"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [20]:
# 7) correlation of lengths and copynumber of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_copynumber = list()
gene_size = list()
for sample in samples.values():
for gene in beroukhim_genes:
size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
gene_copynumber.append(copynumber)
gene_size.append(size)
gene_copynumber_tuple={'Gene copynumber': gene_copynumber,'Gene size': gene_size}
gene_copynumber_size=pandas.DataFrame(gene_copynumber_tuple)
fig_correlation= gene_copynumber_size.plot(kind='scatter', x='Gene copynumber', y='Gene size')
plt.hlines(y=20000000, xmin=0, xmax=50, color='r')
plt.vlines(x=4, ymin=0, ymax=250000000, color='g')
output1=fig_correlation.get_figure()
#plt.show()
output1.savefig('PanCancer_copynumber_vs_size_beroukhim_targets.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(gene_copynumber,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_copynumber,gene_size)
print ""
print " 7) correlation of lengths and copynumber of Beroukhim targets"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [4]:
# 8) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_copynumber = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if sample.getRNASeqFromGene("MYC") != "n/a" and copynumber != "n/a":
MYC_expression.append(sample.getRNASeqFromGene("MYC"))
MYC_copynumber.append(size)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC copynumber': MYC_copynumber}
MYC_expr_copynumber=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_copynumber.plot(kind='scatter', x='MYC expression', y='MYC copynumber')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_copynumber)
print ""
print " 8) correlation of copynumber and gene expression of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [20]:
# 7) Check occurrence of focal amplifications of Beroukhim targets
beroukhim_genes={"MYC" : 0,"CCND1" : 0,"ERBB2" : 0,"CDK4" : 0,"NKX2-1" : 0,"MDM2" : 0,"EGFR" : 0,"MCL1" : 0,"FGFR1" : 0,"KRAS" : 0,"CCNE1" : 0,"CRKL" : 0,"HMGA2" : 0,"TERT" : 0,"PRKCI" : 0,"IGF1R" : 0,"MYCL" : 0,"MYCN" : 0,"CDK6" : 0,"BCL2L1" : 0,"MYB" : 0,"MET" : 0,"JUN" : 0,"BIRC2" : 0,"YAP1" : 0,"PDGFRA" : 0,"KIT" : 0,"PIK3CA" : 0,"MDM4" : 0,"AR" : 0}
for sample in samples.values():
for gene in beroukhim_genes.keys():
if sample.checkFocalGeneAmp(gene):
beroukhim_genes[gene] += 1
focal_amp_series=pandas.Series(beroukhim_genes)
focal_amp_series.sort(ascending=False)
fig_correlation= focal_amp_series.plot(kind='bar')
print ""
print " 7) Check occurrence of focal amplifications of Beroukhim targets"
print ""
In [21]:
# 8) Get 50 most frequent genes in focal events
gene_amp_frequency = dict()
for sample in samples.values():
for gene in sample.listFocalAmpGenes():
if gene in gene_amp_frequency.keys():
gene_amp_frequency[gene] += 1
else:
gene_amp_frequency[gene] = 1
gene_del_frequency = dict()
for sample in samples.values():
for gene in sample.listFocalDelGenes():
if gene in gene_del_frequency.keys():
gene_del_frequency[gene] += 1
else:
gene_del_frequency[gene] = 1
print ""
print " 8) Get 20 most frequent genes in focal amplifications/deletions"
print " Amplifications:"
count = 0
for gene in sorted(gene_amp_frequency, key=gene_amp_frequency.get, reverse=True):
count += 1
if (gene in gene_positions.keys()):
print "Gene: "+gene+" Freq: "+str(gene_amp_frequency[gene])+" Pos: "+gene_positions[gene].returnPos()
else:
print "Gene: "+gene+" Freq: "+str(gene_amp_frequency[gene])
if (count > 19):
break
count = 0
print ""
print " Deletions:"
for gene in sorted(gene_del_frequency, key=gene_del_frequency.get, reverse=True):
count += 1
if (gene in gene_positions.keys()):
print "Gene: "+gene+" Freq: "+str(gene_del_frequency[gene])+" Pos: "+gene_positions[gene].returnPos()
else:
print "Gene: "+gene+" Freq: "+str(gene_del_frequency[gene])
if (count > 19):
break
Numbers in diagonal of the table represent all gene amplifications found
In [22]:
# 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
if sample.checkFocalGeneAmp("MYC"):
MYC_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
MYC_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("FGFR1"):
MYC_and_FGFR1_count += 1
if sample.checkFocalGeneAmp("CCND1"):
MYC_and_CCND1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
MYC_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
MYC_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("FGFR1"):
FGFR1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
FGFR1_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCND1"):
FGFR1_and_CCND1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
FGFR1_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
FGFR1_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("CCND1"):
CCND1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
CCND1_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
CCND1_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CCND1_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
ERBB2_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
ERBB2_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
ERBB2_and_CCNE1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CDKN2A_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CCNE1_count += 1
print ""
print " 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print " MYC amplified: "+str(MYC_count)
print " CDKNA2 deleted: "+str(CDKN2A_count)
print " FGFR1 amplified: "+str(FGFR1_count)
print " CCND1 amplified: "+str(CCND1_count)
print " ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
Out[22]:
In [23]:
# 10) Check for cooccurence of amplifications (copynumber > 4) MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC = "n/a"
log2_FGFR1 = sample.getLog2FromGene(gene_positions["FGFR1"].chrom,gene_positions["FGFR1"].start, gene_positions["FGFR1"].end)
if (log2_FGFR1 != None):
copynumber_FGFR1 = 2.0 * 2.0 ** log2_FGFR1
else:
copynumber_FGFR1 = "n/a"
log2_CDKN2A = sample.getLog2FromGene(gene_positions["CDKN2A"].chrom,gene_positions["CDKN2A"].start, gene_positions["CDKN2A"].end)
log2_CCND1 = sample.getLog2FromGene(gene_positions["CCND1"].chrom,gene_positions["CCND1"].start, gene_positions["CCND1"].end)
if (log2_CCND1 != None):
copynumber_CCND1 = 2.0 * 2.0 ** log2_CCND1
else:
copynumber_CCND1 = "n/a"
log2_ERBB2 = sample.getLog2FromGene(gene_positions["ERBB2"].chrom,gene_positions["ERBB2"].start, gene_positions["ERBB2"].end)
if (log2_ERBB2 != None):
copynumber_ERBB2 = 2.0 * 2.0 ** log2_ERBB2
else:
copynumber_ERBB2 = "n/a"
log2_CCNE1 = sample.getLog2FromGene(gene_positions["CCNE1"].chrom,gene_positions["CCNE1"].start, gene_positions["CCNE1"].end)
if (log2_CCNE1 != None):
copynumber_CCNE1 = 2.0 * 2.0 ** log2_CCNE1
else:
copynumber_CCNE1 = "n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4:
MYC_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
MYC_and_CDKN2A_count += 1
if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
MYC_and_FGFR1_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
MYC_and_CCND1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
MYC_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
MYC_and_CCNE1_count += 1
if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
FGFR1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
FGFR1_and_CDKN2A_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
FGFR1_and_CCND1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
FGFR1_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
FGFR1_and_CCNE1_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
CCND1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
CCND1_and_CDKN2A_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
CCND1_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CCND1_and_CCNE1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
ERBB2_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
ERBB2_and_CDKN2A_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
ERBB2_and_CCNE1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
CDKN2A_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CDKN2A_and_CCNE1_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CCNE1_count += 1
print ""
print " 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print " MYC amplified: "+str(MYC_count)
print " CDKNA2 deleted: "+str(CDKN2A_count)
print " FGFR1 amplified: "+str(FGFR1_count)
print " CCND1 amplified: "+str(CCND1_count)
print " ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
Out[23]:
Leiserson: Log2-ratio > 0.9
Amplitude: copynumber > 4 (means: log2-ratio > 1)
Size: smaller than 20Mbp, Log2-ratio > 0.2; log2-ratio > 0.2 higher than surrounding 20Mbp on either side
In [ ]:
size = list()
amplitude = list()
leiserson = list()
for sample in samples.values():
if sample.checkFocalGeneAmp("MYC"):
size.append(sample.id)
log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC = "n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4:
amplitude.append(sample.id)
if log2_MYC > 0.9:
leiserson.append(sample.id)
print "MYC amplification definition"
print "----------------------------"
print "MYC amplified defined by size and local amplitude: "+str(len(size))
print "MYC amplified defined by amplitude (Copynumber > 4): "+str(len(amplitude))
print "MYC amplified defined by Leiserson (Log2-Ratio > 0.9): "+str(len(leiserson))
plt.figure(figsize=(10,10))
matplotlib_venn.venn3([set(size),set(amplitude),set(leiserson)],set_labels=["Size", "Copynumber", "Leiserson"])
In [3]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
if sample.checkFocalGeneAmp("MYC"):
source_sites.append(sample.primarysiteofdesease)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_focal_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[3]:
In [13]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
copynumber_MYC = "n/a"
log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC="n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4:
source_sites.append(sample.primarysiteofdesease)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn4_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[13]:
In [9]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
if not sample.somatic_mutation_data:
continue
copynumber_MYC = "n/a"
log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC="n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
source_sites.append(sample.primarysiteofdesease)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[9]:
In [8]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
if not sample.somatic_mutation_data:
continue
if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
source_sites.append(sample.primarysiteofdesease)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[8]:
In [7]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
copynumber_MYC = "n/a"
log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC="n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 6:
source_sites.append(sample.primarysiteofdesease)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn6_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[7]:
In [6]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
if not sample.somatic_mutation_data:
continue
copynumber_MYC = "n/a"
log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC="n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_histological_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[6]:
In [5]:
source_sites = list()
for sample in samples.values():
if not sample.clinical:
continue
if not sample.somatic_mutation_data:
continue
if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_histological_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
count += 1
s += "</table>"
h = HTML(s);h
Out[5]:
In [ ]: