Get statistics for focal amplifications of Pan cancer samples



In [1]:

    
import NotebookImport
from TCGA_analysis_PanCancer_import import *
from IPython.display import HTML









    



importing IPython notebook from TCGA_analysis_PanCancer_import.ipynb
Loading Focal Amplification data
Loading RNASeq data
Loading RNASeq data of controls
Loading CNV Data
Loading Somatic Mutation Data
116 samples have somatic but no CNV data
Loading Clinical Data
Samples: 5737
  --Focal Data: 5737
  --CNV:        5737
  --RNASeq:     4196
  --Somatic:    3297
  --Clinical:   5312

Check lengths of segments having copynumber > 4 at significant targets found in Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers"



In [13]:

    
# 1) check lengths of segments having copynumber > 4 in known focal amplification genes (Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers", Supp Table2 Known Targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]

focal_sizes=list()
nonfocal_sizes = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      if gene in gene_positions.keys():
         log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
         if (log2 != None):
            copynumber = 2.0 * 2.0 ** log2
         else:
            copynumber = "n/a"
         if copynumber != "n/a" and copynumber > 4:
            focal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
         if copynumber != "n/a" and copynumber < 4:
            nonfocal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))

fig, axs = plt.subplots(1,2,figsize=(12,3))
focal_sizes_series = pandas.Series(focal_sizes)
fig_focal=focal_sizes_series.plot(ax=axs[0],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[0].vlines(x=20000000, ymin=0, ymax=2000, color='r')
axs[0].set_xlabel("Size of target genes at copynumber > 4")
#output=fig_focal.get_figure()
#plt.show()
#output.savefig('BRCA_sizes_copynumber_over4_known.png')
mean_focal = focal_sizes_series.mean()
std_focal = focal_sizes_series.std()

nonfocal_sizes_series = pandas.Series(nonfocal_sizes)
fig_nonfocal=nonfocal_sizes_series.plot(ax=axs[1],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[1].vlines(x=20000000, ymin=0, ymax=10000, color='r')
axs[1].set_xlabel("Size of target genes at copynumber < 4")

#output1=fig_nonfocal.get_figure()
#plt.show()
#output1.savefig('BRCA_sizes_copynumber_lower4_known.png')
mean_nonfocal = nonfocal_sizes_series.mean()
std_nonfocal = nonfocal_sizes_series.std()

print ""
print " 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications"
print "   Segments copynumber > 4 and known focal amplifications:"
print "   Mean: "+str(mean_focal)
print "   Standard Deviation: "+str(std_focal)
print "   ---------------------------------"
print "   Segments copynumber < 4:"
print "   Mean: "+str(mean_nonfocal)
print "   Standard Deviation: "+str(std_nonfocal)
print "   ---------------------------------"









    



 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications
   Segments copynumber > 4 and known focal amplifications:
   Mean: 8149906.74884
   Standard Deviation: 14843540.2503
   ---------------------------------
   Segments copynumber < 4:
   Mean: 81214286.6467
   Standard Deviation: 58351966.4021
   ---------------------------------

Correlation of lengths and gene expression of MYC



In [14]:

    
# 2) correlation of lengths and gene expression of MYC

MYC_expression = list()
MYC_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 if sample.getRNASeqFromGene("MYC") != "n/a" and size != None:
    MYC_expression.append(sample.getRNASeqFromGene("MYC"))
    MYC_size.append(size)

MYC_expr_tuple={'MYC expression': MYC_expression,'MYC size': MYC_size}
MYC_expr_size=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_size.plot(kind='scatter', x='MYC expression', y='MYC size')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_size)


print ""
print " 2) correlation of lengths and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 2) correlation of lengths and gene expression of MYC
   Pearson Correlation coefficient: -0.149072168965
   P-value: 2.95643539727e-22
   -----------------------
   Spearman Correlation coefficient: -0.144418122635
   P-value: 5.67654647849e-21

Correlation of lengths and gene expression of CCND1



In [15]:

    
# 3) correlation of lengths and gene expression of CCND1

CCND1_expression = list()
CCND1_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if sample.getRNASeqFromGene("CCND1") != "n/a" and size != None:
    CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
    CCND1_size.append(size)

CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 size': CCND1_size}
CCND1_expr_size=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_size.plot(kind='scatter', x='CCND1 expression', y='CCND1 size')
plt.hlines(y=20000000, xmin=-50000, xmax=250000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_size)


print ""
print " 3) correlation of lengths and gene expression of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 3) correlation of lengths and gene expression of CCND1
   Pearson Correlation coefficient: -0.149131685767
   P-value: 3.08056668406e-22
   -----------------------
   Spearman Correlation coefficient: -0.0756853726216
   P-value: 9.52583731327e-07

Correlation of lengths and gene expression of Beroukhim targets



In [16]:

    
# 4) correlation of lengths and gene expression of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_expression = list()
gene_size = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      if sample.getRNASeqFromGene(gene) != "n/a" and size != None:
         gene_expression.append(sample.getRNASeqFromGene(gene))
         gene_size.append(size)

gene_expr_tuple={'Gene expression': gene_expression,'Gene size': gene_size}
gene_expr_size=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_size.plot(kind='scatter', x='Gene expression', y='Gene size')
plt.hlines(y=20000000, xmin=-50000, xmax=400000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_size)


print ""
print " 4) correlation of lengths and gene expression of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 4) correlation of lengths and gene expression of Beroukhim targets
   Pearson Correlation coefficient: -0.0408145421637
   P-value: 1.30095602624e-45
   -----------------------
   Spearman Correlation coefficient: -0.062715299126
   P-value: 2.63416930919e-105

Correlation of lengths and copynumber of MYC



In [17]:

    
# 5) correlation of lengths and copynumber of MYC

MYC_copynumber = list()
MYC_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    MYC_copynumber.append(copynumber)
    MYC_size.append(size)

MYC_copynumber_tuple={'MYC copynumber': MYC_copynumber,'MYC size': MYC_size}
MYC_copynumber_size=pandas.DataFrame(MYC_copynumber_tuple)
fig_correlation= MYC_copynumber_size.plot(kind='scatter', x='MYC copynumber', y='MYC size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_copynumber,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_copynumber,MYC_size)

print ""
print " 5) correlation of lengths and copynumber of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 5) correlation of lengths and copynumber of MYC
   Pearson Correlation coefficient: -0.379700649899
   P-value: 6.26746645327e-196
   -----------------------
   Spearman Correlation coefficient: -0.489832928527
   P-value: 0.0

Correlation of lengths and copynumber of CCND1



In [18]:

    
# 6) correlation of lengths and copynumber of CCND1

CCND1_copynumber = list()
CCND1_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    CCND1_copynumber.append(copynumber)
    CCND1_size.append(size)

CCND1_copynumber_tuple={'CCND1 copynumber': CCND1_copynumber,'CCND1 size': CCND1_size}
CCND1_copynumber_size=pandas.DataFrame(CCND1_copynumber_tuple)
fig_correlation= CCND1_copynumber_size.plot(kind='scatter', x='CCND1 copynumber', y='CCND1 size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_copynumber,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_copynumber,CCND1_size)

print ""
print " 6) correlation of lengths and copynumber of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 6) correlation of lengths and copynumber of CCND1
   Pearson Correlation coefficient: -0.376256824721
   P-value: 6.98901466695e-192
   -----------------------
   Spearman Correlation coefficient: -0.397137348017
   P-value: 1.70716829422e-215

Correlation of lengths and copynumber of Beroukhim targets



In [20]:

    
# 7) correlation of lengths and copynumber of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_copynumber = list()
gene_size = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
      if (log2 != None):
         copynumber = 2.0 * 2.0 ** log2
      else:
         copynumber = "n/a"
      if copynumber != "n/a" and size != None:
         gene_copynumber.append(copynumber)
         gene_size.append(size)

gene_copynumber_tuple={'Gene copynumber': gene_copynumber,'Gene size': gene_size}
gene_copynumber_size=pandas.DataFrame(gene_copynumber_tuple)
fig_correlation= gene_copynumber_size.plot(kind='scatter', x='Gene copynumber', y='Gene size')
plt.hlines(y=20000000, xmin=0, xmax=50, color='r')
plt.vlines(x=4, ymin=0, ymax=250000000, color='g')

output1=fig_correlation.get_figure()
#plt.show()
output1.savefig('PanCancer_copynumber_vs_size_beroukhim_targets.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(gene_copynumber,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_copynumber,gene_size)


print ""
print " 7) correlation of lengths and copynumber of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 7) correlation of lengths and copynumber of Beroukhim targets
   Pearson Correlation coefficient: -0.167305791851
   P-value: 0.0
   -----------------------
   Spearman Correlation coefficient: -0.172442408185
   P-value: 0.0

Correlation of copynumber and gene expression of MYC



In [4]:

    
# 8) correlation of lengths and gene expression of MYC

MYC_expression = list()
MYC_copynumber = list()

for sample in samples.values():
   size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
   log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
   if (log2 != None):
      copynumber = 2.0 * 2.0 ** log2
   else:
      copynumber = "n/a"
   if sample.getRNASeqFromGene("MYC") != "n/a" and copynumber != "n/a":
      MYC_expression.append(sample.getRNASeqFromGene("MYC"))
      MYC_copynumber.append(size)

MYC_expr_tuple={'MYC expression': MYC_expression,'MYC copynumber': MYC_copynumber}
MYC_expr_copynumber=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_copynumber.plot(kind='scatter', x='MYC expression', y='MYC copynumber')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_copynumber)


print ""
print " 8) correlation of copynumber and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)









    



 8) correlation of copynumber and gene expression of MYC
   Pearson Correlation coefficient: -0.149072168965
   P-value: 2.95643539727e-22
   -----------------------
   Spearman Correlation coefficient: -0.144418122635
   P-value: 5.67654647849e-21

Check occurrence of focal amplifications of Beroukhim targets



In [20]:

    
# 7) Check occurrence of focal amplifications of Beroukhim targets
beroukhim_genes={"MYC" : 0,"CCND1" : 0,"ERBB2" : 0,"CDK4" : 0,"NKX2-1" : 0,"MDM2" : 0,"EGFR" : 0,"MCL1" : 0,"FGFR1" : 0,"KRAS" : 0,"CCNE1" : 0,"CRKL" : 0,"HMGA2" : 0,"TERT" : 0,"PRKCI" : 0,"IGF1R" : 0,"MYCL" : 0,"MYCN" : 0,"CDK6" : 0,"BCL2L1" : 0,"MYB" : 0,"MET" : 0,"JUN" : 0,"BIRC2" : 0,"YAP1" : 0,"PDGFRA" : 0,"KIT" : 0,"PIK3CA" : 0,"MDM4" : 0,"AR" : 0}

for sample in samples.values():
   for gene in beroukhim_genes.keys():
      if sample.checkFocalGeneAmp(gene):
         beroukhim_genes[gene] += 1   

focal_amp_series=pandas.Series(beroukhim_genes)
focal_amp_series.sort(ascending=False)
fig_correlation= focal_amp_series.plot(kind='bar')

print ""
print " 7) Check occurrence of focal amplifications of Beroukhim targets"
print ""









    



 7) Check occurrence of focal amplifications of Beroukhim targets

Get 20 most frequent genes in focal events



In [21]:

    
# 8) Get 50 most frequent genes in focal events
gene_amp_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalAmpGenes():
       if gene in gene_amp_frequency.keys():
          gene_amp_frequency[gene] += 1
       else:
          gene_amp_frequency[gene] = 1

gene_del_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalDelGenes():
      if gene in gene_del_frequency.keys():
         gene_del_frequency[gene] += 1
      else:
         gene_del_frequency[gene] = 1

print ""
print " 8) Get 20 most frequent genes in focal amplifications/deletions"
print "   Amplifications:"
count = 0
for gene in sorted(gene_amp_frequency, key=gene_amp_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])
 if (count > 19):
   break
count = 0
print ""
print "   Deletions:"
for gene in sorted(gene_del_frequency, key=gene_del_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])
 if (count > 19):
   break









    



 8) Get 20 most frequent genes in focal amplifications/deletions
   Amplifications:
Gene: PPFIA1  Freq: 523  Pos: chr11:70116805-70224598
Gene: ANO1  Freq: 507  Pos: chr11:69931515-70035652
Gene: EGFR  Freq: 493  Pos: chr7:55086724-55275031
Gene: FADD  Freq: 478  Pos: chr11:70049268-70053508
Gene: CTTN  Freq: 476  Pos: chr11:70244611-70282690
Gene: CCND1  Freq: 466  Pos: chr11:69455872-69469242
Gene: ORAOV1  Freq: 464  Pos: chr11:69480331-69490165
Gene: WHSC1L1  Freq: 458  Pos: chr8:38173934-38239790
Gene: DDHD2  Freq: 441  Pos: chr8:38089470-38120287
Gene: FGFR1  Freq: 438  Pos: chr8:38268655-38326352
Gene: PPAPDC1B  Freq: 415  Pos: chr8:38120649-38126738
Gene: ASH2L  Freq: 411  Pos: chr8:37963310-37997598
Gene: ADAM9  Freq: 400  Pos: chr8:38854504-38962779
Gene: RAB11FIP1  Freq: 397  Pos: chr8:37716464-37757015
Gene: EIF4EBP1  Freq: 397  Pos: chr8:37888019-37917883
Gene: BAG4  Freq: 397  Pos: chr8:38034105-38070819
Gene: LSM1  Freq: 395  Pos: chr8:38020838-38034248
Gene: BRF2  Freq: 388  Pos: chr8:37701397-37707431
Gene: TM2D2  Freq: 387  Pos: chr8:38846326-38854041
Gene: KAT6A  Freq: 383  Pos: chr8:41786996-41909505

   Deletions:
Gene: CDKN2A  Freq: 703  Pos: chr9:21967750-21975132
Gene: STK11  Freq: 138  Pos: chr19:1205797-1228434
Gene: NF1  Freq: 128  Pos: chr17:29421944-29704695
Gene: RB1  Freq: 122  Pos: chr13:48877882-49056026
Gene: PTEN  Freq: 105  Pos: chr10:89623194-89728532
Gene: MAP2K4  Freq: 103  Pos: chr17:11924134-12047148
Gene: SMAD4  Freq: 94  Pos: chr18:48556582-48611411
Gene: CYLD  Freq: 56  Pos: chr16:50775960-50835846
Gene: TP53  Freq: 34  Pos: chr17:7571719-7590868
Gene: CDH1  Freq: 24  Pos: chr16:68771194-68869444
Gene: NF2  Freq: 22  Pos: chr22:29999544-30079904
Gene: SOCS1  Freq: 21  Pos: chr16:11348273-11350039
Gene: MSH2  Freq: 20  Pos: chr2:47630205-47710367
Gene: MSH6  Freq: 15  Pos: chr2:48010220-48034092
Gene: MLH1  Freq: 12  Pos: chr3:37035267-37092337

Check for cooccurence of focal amplifications in MYC,CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

Numbers in diagonal of the table represent all gene amplifications found



In [22]:

    
# 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0

for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        MYC_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            MYC_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("FGFR1"):
            MYC_and_FGFR1_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            MYC_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            MYC_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            MYC_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("FGFR1"):
        FGFR1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            FGFR1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            FGFR1_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            FGFR1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            FGFR1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCND1"):
        CCND1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            CCND1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            CCND1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CCND1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("ERBB2"):
        ERBB2_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            ERBB2_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            ERBB2_and_CCNE1_count += 1
    if sample.checkFocalGeneDel("CDKN2A"):
        CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CDKN2A_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCNE1"):
        CCNE1_count += 1

print ""
print " 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h









    



 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

  MYC amplified: 253
  CDKNA2 deleted: 703
  FGFR1 amplified: 416
  CCND1 amplified: 464
  ERBB2 amplified: 238







    Out[22]:




MYC CDKN2A FGFR1 CCND1 ERBB2 CCNE1
MYC 253 23 27 26 24 59
CDKN2A 23 703 55 72 21 30
FGFR1 27 55 416 77 26 46
CCND1 26 72 77 464 42 19
ERBB2 24 21 26 42 238 37
CCNE1 59 30 46 19 37 355

Check for cooccurence of amplifications (copynumber > 4) in MYC,CDKN2A, FGFR1, CCND1, ERBB2, CCNE1



In [23]:

    
# 10) Check for cooccurence of amplifications (copynumber > 4) MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0

for sample in samples.values():
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"    
    log2_FGFR1 = sample.getLog2FromGene(gene_positions["FGFR1"].chrom,gene_positions["FGFR1"].start, gene_positions["FGFR1"].end)
    if (log2_FGFR1 != None):
       copynumber_FGFR1 = 2.0 * 2.0 ** log2_FGFR1
    else:
       copynumber_FGFR1 = "n/a"
    log2_CDKN2A = sample.getLog2FromGene(gene_positions["CDKN2A"].chrom,gene_positions["CDKN2A"].start, gene_positions["CDKN2A"].end)
    log2_CCND1 = sample.getLog2FromGene(gene_positions["CCND1"].chrom,gene_positions["CCND1"].start, gene_positions["CCND1"].end)
    if (log2_CCND1 != None):
       copynumber_CCND1 = 2.0 * 2.0 ** log2_CCND1
    else:
       copynumber_CCND1 = "n/a"
    log2_ERBB2 = sample.getLog2FromGene(gene_positions["ERBB2"].chrom,gene_positions["ERBB2"].start, gene_positions["ERBB2"].end)
    if (log2_ERBB2 != None):
       copynumber_ERBB2 = 2.0 * 2.0 ** log2_ERBB2
    else:
       copynumber_ERBB2 = "n/a"
    log2_CCNE1 = sample.getLog2FromGene(gene_positions["CCNE1"].chrom,gene_positions["CCNE1"].start, gene_positions["CCNE1"].end)
    if (log2_CCNE1 != None):
       copynumber_CCNE1 = 2.0 * 2.0 ** log2_CCNE1
    else:
       copynumber_CCNE1 = "n/a"
        
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        MYC_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            MYC_and_CDKN2A_count += 1
        if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
            MYC_and_FGFR1_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            MYC_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            MYC_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            MYC_and_CCNE1_count += 1
    if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
        FGFR1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            FGFR1_and_CDKN2A_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            FGFR1_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            FGFR1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            FGFR1_and_CCNE1_count += 1
    if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
        CCND1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            CCND1_and_CDKN2A_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            CCND1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CCND1_and_CCNE1_count += 1
    if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
        ERBB2_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            ERBB2_and_CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            ERBB2_and_CCNE1_count += 1
    if log2_CDKN2A != None and log2_CDKN2A < -0.3:
        CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CDKN2A_and_CCNE1_count += 1
    if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
        CCNE1_count += 1

print ""
print " 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h









    



 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

  MYC amplified: 342
  CDKNA2 deleted: 1690
  FGFR1 amplified: 171
  CCND1 amplified: 333
  ERBB2 amplified: 161







    Out[23]:




MYC CDKN2A FGFR1 CCND1 ERBB2 CCNE1
MYC 342 113 21 32 20 18
CDKN2A 113 1690 78 156 54 55
FGFR1 21 78 171 30 8 9
CCND1 32 156 30 333 29 6
ERBB2 20 54 8 29 161 9
CCNE1 18 55 9 6 9 156

Check which samples would be called MYC amplified by which definition

Leiserson: Log2-ratio > 0.9
Amplitude: copynumber > 4 (means: log2-ratio > 1)
Size: smaller than 20Mbp, Log2-ratio > 0.2; log2-ratio > 0.2 higher than surrounding 20Mbp on either side



In [ ]:

    
size = list()
amplitude = list()
leiserson = list()

for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        size.append(sample.id)
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"  
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        amplitude.append(sample.id)
    if log2_MYC > 0.9:
        leiserson.append(sample.id)

print "MYC amplification definition"
print "----------------------------"
print "MYC amplified defined by size and local amplitude: "+str(len(size))
print "MYC amplified defined by amplitude (Copynumber > 4): "+str(len(amplitude))
print "MYC amplified defined by Leiserson (Log2-Ratio > 0.9): "+str(len(leiserson))
plt.figure(figsize=(10,10))
matplotlib_venn.venn3([set(size),set(amplitude),set(leiserson)],set_labels=["Size", "Copynumber", "Leiserson"])

Which tissues do MYC focally amplified samples come from



In [3]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if sample.checkFocalGeneAmp("MYC"):
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_focal_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[3]:




Site Count Frequency
bladder 9 0.0381355932203
brain 4 0.0169491525424
breast 58 0.245762711864
colon 13 0.0550847457627
endometrial 27 0.114406779661
head and neck 9 0.0381355932203
lung 49 0.207627118644
nan 1 0.00423728813559
ovary 64 0.271186440678
rectum 2 0.00847457627119

Which tissues do MYC copynumber > 4 samples come from



In [13]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn4_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[13]:




Site Count Frequency
bladder 7 0.0211480362538
brain 5 0.0151057401813
breast 111 0.335347432024
colon 12 0.036253776435
endometrial 27 0.0815709969789
head and neck 5 0.0151057401813
lung 32 0.0966767371601
nan 2 0.00604229607251
ovary 124 0.374622356495
rectum 6 0.0181268882175

Which tissues do MYC copynumber > 4 and TP53 mutations samples come from



In [9]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[9]:




Site Count Frequency
brain 2 0.0136054421769
breast 60 0.408163265306
colon 4 0.0272108843537
endometrial 7 0.047619047619
head and neck 2 0.0136054421769
lung 10 0.0680272108844
ovary 58 0.394557823129
rectum 4 0.0272108843537

Which tissues do focal MYC and TP53 mutations samples come from



In [8]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[8]:




Site Count Frequency
brain 1 0.010752688172
breast 33 0.354838709677
colon 2 0.0215053763441
endometrial 5 0.0537634408602
head and neck 6 0.0645161290323
lung 15 0.161290322581
ovary 29 0.311827956989
rectum 2 0.0215053763441

Which tissues do MYC copynumber > 6 samples come from



In [7]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 6:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn6_tissue_sites.png', dpi=200)


count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[7]:




Site Count Frequency
bladder 3 0.0394736842105
brain 2 0.0263157894737
breast 18 0.236842105263
colon 1 0.0131578947368
endometrial 12 0.157894736842
head and neck 2 0.0263157894737
lung 14 0.184210526316
nan 1 0.0131578947368
ovary 22 0.289473684211
rectum 1 0.0131578947368

Which histological subtypes do MYC copynumber > 4 and TP53 mutations samples come from



In [6]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_histological_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[6]:




Site Count Frequency
brain untreated primary (de novo) gbm 2 0.0136054421769
breast infiltrating carcinoma nos 1 0.00680272108844
breast infiltrating ductal carcinoma 56 0.380952380952
breast infiltrating lobular carcinoma 1 0.00680272108844
breast nan 1 0.00680272108844
breast other  specify 1 0.00680272108844
colon colon adenocarcinoma 4 0.0272108843537
endometrial mixed serous and endometrioid 1 0.00680272108844
endometrial serous endometrial adenocarcinoma 6 0.0408163265306
head and neck head and neck squamous cell carcinoma 2 0.0136054421769
lung lung adenocarcinoma- not otherwise specified (nos) 2 0.0136054421769
lung lung papillary adenocarcinoma 1 0.00680272108844
lung lung squamous cell carcinoma- not otherwise specified (nos) 7 0.047619047619
ovary serous cystadenocarcinoma 58 0.394557823129
rectum rectal adenocarcinoma 3 0.0204081632653
rectum rectal mucinous adenocarcinoma 1 0.00680272108844

Which histological subtypes do focal MYC and TP53 mutations samples come from



In [5]:

    
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_histological_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h









    Out[5]:




Site Count Frequency
brain untreated primary (de novo) gbm 1 0.010752688172
breast infiltrating ductal carcinoma 29 0.311827956989
breast infiltrating lobular carcinoma 2 0.0215053763441
breast nan 1 0.010752688172
breast other  specify 1 0.010752688172
colon colon adenocarcinoma 2 0.0215053763441
endometrial mixed serous and endometrioid 1 0.010752688172
endometrial serous endometrial adenocarcinoma 4 0.0430107526882
head and neck head and neck squamous cell carcinoma 6 0.0645161290323
lung lung adenocarcinoma- not otherwise specified (nos) 4 0.0430107526882
lung lung squamous cell carcinoma- not otherwise specified (nos) 11 0.118279569892
ovary serous cystadenocarcinoma 29 0.311827956989
rectum rectal adenocarcinoma 2 0.0215053763441



In [ ]:

	MYC	CDKN2A	FGFR1	CCND1	ERBB2	CCNE1
MYC	253	23	27	26	24	59
CDKN2A	23	703	55	72	21	30
FGFR1	27	55	416	77	26	46
CCND1	26	72	77	464	42	19
ERBB2	24	21	26	42	238	37
CCNE1	59	30	46	19	37	355

	MYC	CDKN2A	FGFR1	CCND1	ERBB2	CCNE1
MYC	342	113	21	32	20	18
CDKN2A	113	1690	78	156	54	55
FGFR1	21	78	171	30	8	9
CCND1	32	156	30	333	29	6
ERBB2	20	54	8	29	161	9
CCNE1	18	55	9	6	9	156

Site	Count	Frequency
bladder	9	0.0381355932203
brain	4	0.0169491525424
breast	58	0.245762711864
colon	13	0.0550847457627
endometrial	27	0.114406779661
head and neck	9	0.0381355932203
lung	49	0.207627118644
nan	1	0.00423728813559
ovary	64	0.271186440678
rectum	2	0.00847457627119

Site	Count	Frequency
bladder	7	0.0211480362538
brain	5	0.0151057401813
breast	111	0.335347432024
colon	12	0.036253776435
endometrial	27	0.0815709969789
head and neck	5	0.0151057401813
lung	32	0.0966767371601
nan	2	0.00604229607251
ovary	124	0.374622356495
rectum	6	0.0181268882175

Site	Count	Frequency
brain	2	0.0136054421769
breast	60	0.408163265306
colon	4	0.0272108843537
endometrial	7	0.047619047619
head and neck	2	0.0136054421769
lung	10	0.0680272108844
ovary	58	0.394557823129
rectum	4	0.0272108843537

Site	Count	Frequency
brain	1	0.010752688172
breast	33	0.354838709677
colon	2	0.0215053763441
endometrial	5	0.0537634408602
head and neck	6	0.0645161290323
lung	15	0.161290322581
ovary	29	0.311827956989
rectum	2	0.0215053763441

Site	Count	Frequency
bladder	3	0.0394736842105
brain	2	0.0263157894737
breast	18	0.236842105263
colon	1	0.0131578947368
endometrial	12	0.157894736842
head and neck	2	0.0263157894737
lung	14	0.184210526316
nan	1	0.0131578947368
ovary	22	0.289473684211
rectum	1	0.0131578947368

Site	Count	Frequency
brain untreated primary (de novo) gbm	2	0.0136054421769
breast infiltrating carcinoma nos	1	0.00680272108844
breast infiltrating ductal carcinoma	56	0.380952380952
breast infiltrating lobular carcinoma	1	0.00680272108844
breast nan	1	0.00680272108844
breast other specify	1	0.00680272108844
colon colon adenocarcinoma	4	0.0272108843537
endometrial mixed serous and endometrioid	1	0.00680272108844
endometrial serous endometrial adenocarcinoma	6	0.0408163265306
head and neck head and neck squamous cell carcinoma	2	0.0136054421769
lung lung adenocarcinoma- not otherwise specified (nos)	2	0.0136054421769
lung lung papillary adenocarcinoma	1	0.00680272108844
lung lung squamous cell carcinoma- not otherwise specified (nos)	7	0.047619047619
ovary serous cystadenocarcinoma	58	0.394557823129
rectum rectal adenocarcinoma	3	0.0204081632653
rectum rectal mucinous adenocarcinoma	1	0.00680272108844