Get statistics for focal amplifications of Pan cancer samples


In [1]:
import NotebookImport
from TCGA_analysis_PanCancer_import import *
from IPython.display import HTML


importing IPython notebook from TCGA_analysis_PanCancer_import.ipynb
Loading Focal Amplification data
Loading RNASeq data
Loading RNASeq data of controls
Loading CNV Data
Loading Somatic Mutation Data
116 samples have somatic but no CNV data
Loading Clinical Data
Samples: 5737
  --Focal Data: 5737
  --CNV:        5737
  --RNASeq:     4196
  --Somatic:    3297
  --Clinical:   5312

Check lengths of segments having copynumber > 4 at significant targets found in Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers"


In [13]:
# 1) check lengths of segments having copynumber > 4 in known focal amplification genes (Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers", Supp Table2 Known Targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]

focal_sizes=list()
nonfocal_sizes = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      if gene in gene_positions.keys():
         log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
         if (log2 != None):
            copynumber = 2.0 * 2.0 ** log2
         else:
            copynumber = "n/a"
         if copynumber != "n/a" and copynumber > 4:
            focal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
         if copynumber != "n/a" and copynumber < 4:
            nonfocal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))

fig, axs = plt.subplots(1,2,figsize=(12,3))
focal_sizes_series = pandas.Series(focal_sizes)
fig_focal=focal_sizes_series.plot(ax=axs[0],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[0].vlines(x=20000000, ymin=0, ymax=2000, color='r')
axs[0].set_xlabel("Size of target genes at copynumber > 4")
#output=fig_focal.get_figure()
#plt.show()
#output.savefig('BRCA_sizes_copynumber_over4_known.png')
mean_focal = focal_sizes_series.mean()
std_focal = focal_sizes_series.std()

nonfocal_sizes_series = pandas.Series(nonfocal_sizes)
fig_nonfocal=nonfocal_sizes_series.plot(ax=axs[1],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[1].vlines(x=20000000, ymin=0, ymax=10000, color='r')
axs[1].set_xlabel("Size of target genes at copynumber < 4")

#output1=fig_nonfocal.get_figure()
#plt.show()
#output1.savefig('BRCA_sizes_copynumber_lower4_known.png')
mean_nonfocal = nonfocal_sizes_series.mean()
std_nonfocal = nonfocal_sizes_series.std()

print ""
print " 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications"
print "   Segments copynumber > 4 and known focal amplifications:"
print "   Mean: "+str(mean_focal)
print "   Standard Deviation: "+str(std_focal)
print "   ---------------------------------"
print "   Segments copynumber < 4:"
print "   Mean: "+str(mean_nonfocal)
print "   Standard Deviation: "+str(std_nonfocal)
print "   ---------------------------------"


 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications
   Segments copynumber > 4 and known focal amplifications:
   Mean: 8149906.74884
   Standard Deviation: 14843540.2503
   ---------------------------------
   Segments copynumber < 4:
   Mean: 81214286.6467
   Standard Deviation: 58351966.4021
   ---------------------------------

Correlation of lengths and gene expression of MYC


In [14]:
# 2) correlation of lengths and gene expression of MYC

MYC_expression = list()
MYC_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 if sample.getRNASeqFromGene("MYC") != "n/a" and size != None:
    MYC_expression.append(sample.getRNASeqFromGene("MYC"))
    MYC_size.append(size)

MYC_expr_tuple={'MYC expression': MYC_expression,'MYC size': MYC_size}
MYC_expr_size=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_size.plot(kind='scatter', x='MYC expression', y='MYC size')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_size)


print ""
print " 2) correlation of lengths and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 2) correlation of lengths and gene expression of MYC
   Pearson Correlation coefficient: -0.149072168965
   P-value: 2.95643539727e-22
   -----------------------
   Spearman Correlation coefficient: -0.144418122635
   P-value: 5.67654647849e-21

Correlation of lengths and gene expression of CCND1


In [15]:
# 3) correlation of lengths and gene expression of CCND1

CCND1_expression = list()
CCND1_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if sample.getRNASeqFromGene("CCND1") != "n/a" and size != None:
    CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
    CCND1_size.append(size)

CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 size': CCND1_size}
CCND1_expr_size=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_size.plot(kind='scatter', x='CCND1 expression', y='CCND1 size')
plt.hlines(y=20000000, xmin=-50000, xmax=250000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_size)


print ""
print " 3) correlation of lengths and gene expression of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 3) correlation of lengths and gene expression of CCND1
   Pearson Correlation coefficient: -0.149131685767
   P-value: 3.08056668406e-22
   -----------------------
   Spearman Correlation coefficient: -0.0756853726216
   P-value: 9.52583731327e-07

Correlation of lengths and gene expression of Beroukhim targets


In [16]:
# 4) correlation of lengths and gene expression of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_expression = list()
gene_size = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      if sample.getRNASeqFromGene(gene) != "n/a" and size != None:
         gene_expression.append(sample.getRNASeqFromGene(gene))
         gene_size.append(size)

gene_expr_tuple={'Gene expression': gene_expression,'Gene size': gene_size}
gene_expr_size=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_size.plot(kind='scatter', x='Gene expression', y='Gene size')
plt.hlines(y=20000000, xmin=-50000, xmax=400000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_size)


print ""
print " 4) correlation of lengths and gene expression of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 4) correlation of lengths and gene expression of Beroukhim targets
   Pearson Correlation coefficient: -0.0408145421637
   P-value: 1.30095602624e-45
   -----------------------
   Spearman Correlation coefficient: -0.062715299126
   P-value: 2.63416930919e-105

Correlation of lengths and copynumber of MYC


In [17]:
# 5) correlation of lengths and copynumber of MYC

MYC_copynumber = list()
MYC_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    MYC_copynumber.append(copynumber)
    MYC_size.append(size)

MYC_copynumber_tuple={'MYC copynumber': MYC_copynumber,'MYC size': MYC_size}
MYC_copynumber_size=pandas.DataFrame(MYC_copynumber_tuple)
fig_correlation= MYC_copynumber_size.plot(kind='scatter', x='MYC copynumber', y='MYC size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_copynumber,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_copynumber,MYC_size)

print ""
print " 5) correlation of lengths and copynumber of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 5) correlation of lengths and copynumber of MYC
   Pearson Correlation coefficient: -0.379700649899
   P-value: 6.26746645327e-196
   -----------------------
   Spearman Correlation coefficient: -0.489832928527
   P-value: 0.0

Correlation of lengths and copynumber of CCND1


In [18]:
# 6) correlation of lengths and copynumber of CCND1

CCND1_copynumber = list()
CCND1_size = list()

for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    CCND1_copynumber.append(copynumber)
    CCND1_size.append(size)

CCND1_copynumber_tuple={'CCND1 copynumber': CCND1_copynumber,'CCND1 size': CCND1_size}
CCND1_copynumber_size=pandas.DataFrame(CCND1_copynumber_tuple)
fig_correlation= CCND1_copynumber_size.plot(kind='scatter', x='CCND1 copynumber', y='CCND1 size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_copynumber,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_copynumber,CCND1_size)

print ""
print " 6) correlation of lengths and copynumber of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 6) correlation of lengths and copynumber of CCND1
   Pearson Correlation coefficient: -0.376256824721
   P-value: 6.98901466695e-192
   -----------------------
   Spearman Correlation coefficient: -0.397137348017
   P-value: 1.70716829422e-215

Correlation of lengths and copynumber of Beroukhim targets


In [20]:
# 7) correlation of lengths and copynumber of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_copynumber = list()
gene_size = list()

for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
      if (log2 != None):
         copynumber = 2.0 * 2.0 ** log2
      else:
         copynumber = "n/a"
      if copynumber != "n/a" and size != None:
         gene_copynumber.append(copynumber)
         gene_size.append(size)

gene_copynumber_tuple={'Gene copynumber': gene_copynumber,'Gene size': gene_size}
gene_copynumber_size=pandas.DataFrame(gene_copynumber_tuple)
fig_correlation= gene_copynumber_size.plot(kind='scatter', x='Gene copynumber', y='Gene size')
plt.hlines(y=20000000, xmin=0, xmax=50, color='r')
plt.vlines(x=4, ymin=0, ymax=250000000, color='g')

output1=fig_correlation.get_figure()
#plt.show()
output1.savefig('PanCancer_copynumber_vs_size_beroukhim_targets.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(gene_copynumber,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_copynumber,gene_size)


print ""
print " 7) correlation of lengths and copynumber of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 7) correlation of lengths and copynumber of Beroukhim targets
   Pearson Correlation coefficient: -0.167305791851
   P-value: 0.0
   -----------------------
   Spearman Correlation coefficient: -0.172442408185
   P-value: 0.0

Correlation of copynumber and gene expression of MYC


In [4]:
# 8) correlation of lengths and gene expression of MYC

MYC_expression = list()
MYC_copynumber = list()

for sample in samples.values():
   size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
   log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
   if (log2 != None):
      copynumber = 2.0 * 2.0 ** log2
   else:
      copynumber = "n/a"
   if sample.getRNASeqFromGene("MYC") != "n/a" and copynumber != "n/a":
      MYC_expression.append(sample.getRNASeqFromGene("MYC"))
      MYC_copynumber.append(size)

MYC_expr_tuple={'MYC expression': MYC_expression,'MYC copynumber': MYC_copynumber}
MYC_expr_copynumber=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_copynumber.plot(kind='scatter', x='MYC expression', y='MYC copynumber')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_copynumber)


print ""
print " 8) correlation of copynumber and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)


 8) correlation of copynumber and gene expression of MYC
   Pearson Correlation coefficient: -0.149072168965
   P-value: 2.95643539727e-22
   -----------------------
   Spearman Correlation coefficient: -0.144418122635
   P-value: 5.67654647849e-21

Check occurrence of focal amplifications of Beroukhim targets


In [20]:
# 7) Check occurrence of focal amplifications of Beroukhim targets
beroukhim_genes={"MYC" : 0,"CCND1" : 0,"ERBB2" : 0,"CDK4" : 0,"NKX2-1" : 0,"MDM2" : 0,"EGFR" : 0,"MCL1" : 0,"FGFR1" : 0,"KRAS" : 0,"CCNE1" : 0,"CRKL" : 0,"HMGA2" : 0,"TERT" : 0,"PRKCI" : 0,"IGF1R" : 0,"MYCL" : 0,"MYCN" : 0,"CDK6" : 0,"BCL2L1" : 0,"MYB" : 0,"MET" : 0,"JUN" : 0,"BIRC2" : 0,"YAP1" : 0,"PDGFRA" : 0,"KIT" : 0,"PIK3CA" : 0,"MDM4" : 0,"AR" : 0}

for sample in samples.values():
   for gene in beroukhim_genes.keys():
      if sample.checkFocalGeneAmp(gene):
         beroukhim_genes[gene] += 1   

focal_amp_series=pandas.Series(beroukhim_genes)
focal_amp_series.sort(ascending=False)
fig_correlation= focal_amp_series.plot(kind='bar')

print ""
print " 7) Check occurrence of focal amplifications of Beroukhim targets"
print ""


 7) Check occurrence of focal amplifications of Beroukhim targets

Get 20 most frequent genes in focal events


In [21]:
# 8) Get 50 most frequent genes in focal events
gene_amp_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalAmpGenes():
       if gene in gene_amp_frequency.keys():
          gene_amp_frequency[gene] += 1
       else:
          gene_amp_frequency[gene] = 1

gene_del_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalDelGenes():
      if gene in gene_del_frequency.keys():
         gene_del_frequency[gene] += 1
      else:
         gene_del_frequency[gene] = 1

print ""
print " 8) Get 20 most frequent genes in focal amplifications/deletions"
print "   Amplifications:"
count = 0
for gene in sorted(gene_amp_frequency, key=gene_amp_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])
 if (count > 19):
   break
count = 0
print ""
print "   Deletions:"
for gene in sorted(gene_del_frequency, key=gene_del_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])
 if (count > 19):
   break


 8) Get 20 most frequent genes in focal amplifications/deletions
   Amplifications:
Gene: PPFIA1  Freq: 523  Pos: chr11:70116805-70224598
Gene: ANO1  Freq: 507  Pos: chr11:69931515-70035652
Gene: EGFR  Freq: 493  Pos: chr7:55086724-55275031
Gene: FADD  Freq: 478  Pos: chr11:70049268-70053508
Gene: CTTN  Freq: 476  Pos: chr11:70244611-70282690
Gene: CCND1  Freq: 466  Pos: chr11:69455872-69469242
Gene: ORAOV1  Freq: 464  Pos: chr11:69480331-69490165
Gene: WHSC1L1  Freq: 458  Pos: chr8:38173934-38239790
Gene: DDHD2  Freq: 441  Pos: chr8:38089470-38120287
Gene: FGFR1  Freq: 438  Pos: chr8:38268655-38326352
Gene: PPAPDC1B  Freq: 415  Pos: chr8:38120649-38126738
Gene: ASH2L  Freq: 411  Pos: chr8:37963310-37997598
Gene: ADAM9  Freq: 400  Pos: chr8:38854504-38962779
Gene: RAB11FIP1  Freq: 397  Pos: chr8:37716464-37757015
Gene: EIF4EBP1  Freq: 397  Pos: chr8:37888019-37917883
Gene: BAG4  Freq: 397  Pos: chr8:38034105-38070819
Gene: LSM1  Freq: 395  Pos: chr8:38020838-38034248
Gene: BRF2  Freq: 388  Pos: chr8:37701397-37707431
Gene: TM2D2  Freq: 387  Pos: chr8:38846326-38854041
Gene: KAT6A  Freq: 383  Pos: chr8:41786996-41909505

   Deletions:
Gene: CDKN2A  Freq: 703  Pos: chr9:21967750-21975132
Gene: STK11  Freq: 138  Pos: chr19:1205797-1228434
Gene: NF1  Freq: 128  Pos: chr17:29421944-29704695
Gene: RB1  Freq: 122  Pos: chr13:48877882-49056026
Gene: PTEN  Freq: 105  Pos: chr10:89623194-89728532
Gene: MAP2K4  Freq: 103  Pos: chr17:11924134-12047148
Gene: SMAD4  Freq: 94  Pos: chr18:48556582-48611411
Gene: CYLD  Freq: 56  Pos: chr16:50775960-50835846
Gene: TP53  Freq: 34  Pos: chr17:7571719-7590868
Gene: CDH1  Freq: 24  Pos: chr16:68771194-68869444
Gene: NF2  Freq: 22  Pos: chr22:29999544-30079904
Gene: SOCS1  Freq: 21  Pos: chr16:11348273-11350039
Gene: MSH2  Freq: 20  Pos: chr2:47630205-47710367
Gene: MSH6  Freq: 15  Pos: chr2:48010220-48034092
Gene: MLH1  Freq: 12  Pos: chr3:37035267-37092337

Check for cooccurence of focal amplifications in MYC,CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

Numbers in diagonal of the table represent all gene amplifications found


In [22]:
# 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0

for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        MYC_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            MYC_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("FGFR1"):
            MYC_and_FGFR1_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            MYC_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            MYC_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            MYC_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("FGFR1"):
        FGFR1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            FGFR1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            FGFR1_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            FGFR1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            FGFR1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCND1"):
        CCND1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            CCND1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            CCND1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CCND1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("ERBB2"):
        ERBB2_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            ERBB2_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            ERBB2_and_CCNE1_count += 1
    if sample.checkFocalGeneDel("CDKN2A"):
        CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CDKN2A_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCNE1"):
        CCNE1_count += 1

print ""
print " 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h


 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

  MYC amplified: 253
  CDKNA2 deleted: 703
  FGFR1 amplified: 416
  CCND1 amplified: 464
  ERBB2 amplified: 238

Out[22]:
MYCCDKN2AFGFR1CCND1ERBB2CCNE1
MYC2532327262459
CDKN2A2370355722130
FGFR12755416772646
CCND12672774644219
ERBB22421264223837
CCNE15930461937355

Check for cooccurence of amplifications (copynumber > 4) in MYC,CDKN2A, FGFR1, CCND1, ERBB2, CCNE1


In [23]:
# 10) Check for cooccurence of amplifications (copynumber > 4) MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0

for sample in samples.values():
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"    
    log2_FGFR1 = sample.getLog2FromGene(gene_positions["FGFR1"].chrom,gene_positions["FGFR1"].start, gene_positions["FGFR1"].end)
    if (log2_FGFR1 != None):
       copynumber_FGFR1 = 2.0 * 2.0 ** log2_FGFR1
    else:
       copynumber_FGFR1 = "n/a"
    log2_CDKN2A = sample.getLog2FromGene(gene_positions["CDKN2A"].chrom,gene_positions["CDKN2A"].start, gene_positions["CDKN2A"].end)
    log2_CCND1 = sample.getLog2FromGene(gene_positions["CCND1"].chrom,gene_positions["CCND1"].start, gene_positions["CCND1"].end)
    if (log2_CCND1 != None):
       copynumber_CCND1 = 2.0 * 2.0 ** log2_CCND1
    else:
       copynumber_CCND1 = "n/a"
    log2_ERBB2 = sample.getLog2FromGene(gene_positions["ERBB2"].chrom,gene_positions["ERBB2"].start, gene_positions["ERBB2"].end)
    if (log2_ERBB2 != None):
       copynumber_ERBB2 = 2.0 * 2.0 ** log2_ERBB2
    else:
       copynumber_ERBB2 = "n/a"
    log2_CCNE1 = sample.getLog2FromGene(gene_positions["CCNE1"].chrom,gene_positions["CCNE1"].start, gene_positions["CCNE1"].end)
    if (log2_CCNE1 != None):
       copynumber_CCNE1 = 2.0 * 2.0 ** log2_CCNE1
    else:
       copynumber_CCNE1 = "n/a"
        
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        MYC_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            MYC_and_CDKN2A_count += 1
        if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
            MYC_and_FGFR1_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            MYC_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            MYC_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            MYC_and_CCNE1_count += 1
    if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
        FGFR1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            FGFR1_and_CDKN2A_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            FGFR1_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            FGFR1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            FGFR1_and_CCNE1_count += 1
    if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
        CCND1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            CCND1_and_CDKN2A_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            CCND1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CCND1_and_CCNE1_count += 1
    if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
        ERBB2_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            ERBB2_and_CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            ERBB2_and_CCNE1_count += 1
    if log2_CDKN2A != None and log2_CDKN2A < -0.3:
        CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CDKN2A_and_CCNE1_count += 1
    if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
        CCNE1_count += 1

print ""
print " 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h


 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1

  MYC amplified: 342
  CDKNA2 deleted: 1690
  FGFR1 amplified: 171
  CCND1 amplified: 333
  ERBB2 amplified: 161

Out[23]:
MYCCDKN2AFGFR1CCND1ERBB2CCNE1
MYC34211321322018
CDKN2A1131690781565455
FGFR121781713089
CCND13215630333296
ERBB220548291619
CCNE11855969156

Check which samples would be called MYC amplified by which definition

Leiserson: Log2-ratio > 0.9
Amplitude: copynumber > 4 (means: log2-ratio > 1)
Size: smaller than 20Mbp, Log2-ratio > 0.2; log2-ratio > 0.2 higher than surrounding 20Mbp on either side


In [ ]:
size = list()
amplitude = list()
leiserson = list()

for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        size.append(sample.id)
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"  
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        amplitude.append(sample.id)
    if log2_MYC > 0.9:
        leiserson.append(sample.id)

print "MYC amplification definition"
print "----------------------------"
print "MYC amplified defined by size and local amplitude: "+str(len(size))
print "MYC amplified defined by amplitude (Copynumber > 4): "+str(len(amplitude))
print "MYC amplified defined by Leiserson (Log2-Ratio > 0.9): "+str(len(leiserson))
plt.figure(figsize=(10,10))
matplotlib_venn.venn3([set(size),set(amplitude),set(leiserson)],set_labels=["Size", "Copynumber", "Leiserson"])

Which tissues do MYC focally amplified samples come from


In [3]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if sample.checkFocalGeneAmp("MYC"):
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_focal_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[3]:
SiteCountFrequency
bladder90.0381355932203
brain40.0169491525424
breast580.245762711864
colon130.0550847457627
endometrial270.114406779661
head and neck90.0381355932203
lung490.207627118644
nan10.00423728813559
ovary640.271186440678
rectum20.00847457627119

Which tissues do MYC copynumber > 4 samples come from


In [13]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn4_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[13]:
SiteCountFrequency
bladder70.0211480362538
brain50.0151057401813
breast1110.335347432024
colon120.036253776435
endometrial270.0815709969789
head and neck50.0151057401813
lung320.0966767371601
nan20.00604229607251
ovary1240.374622356495
rectum60.0181268882175

Which tissues do MYC copynumber > 4 and TP53 mutations samples come from


In [9]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[9]:
SiteCountFrequency
brain20.0136054421769
breast600.408163265306
colon40.0272108843537
endometrial70.047619047619
head and neck20.0136054421769
lung100.0680272108844
ovary580.394557823129
rectum40.0272108843537

Which tissues do focal MYC and TP53 mutations samples come from


In [8]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_tissue_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[8]:
SiteCountFrequency
brain10.010752688172
breast330.354838709677
colon20.0215053763441
endometrial50.0537634408602
head and neck60.0645161290323
lung150.161290322581
ovary290.311827956989
rectum20.0215053763441

Which tissues do MYC copynumber > 6 samples come from


In [7]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 6:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn6_tissue_sites.png', dpi=200)


count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[7]:
SiteCountFrequency
bladder30.0394736842105
brain20.0263157894737
breast180.236842105263
colon10.0131578947368
endometrial120.157894736842
head and neck20.0263157894737
lung140.184210526316
nan10.0131578947368
ovary220.289473684211
rectum10.0131578947368

Which histological subtypes do MYC copynumber > 4 and TP53 mutations samples come from


In [6]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_histological_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[6]:
SiteCountFrequency
brain untreated primary (de novo) gbm20.0136054421769
breast infiltrating carcinoma nos10.00680272108844
breast infiltrating ductal carcinoma560.380952380952
breast infiltrating lobular carcinoma10.00680272108844
breast nan10.00680272108844
breast other specify10.00680272108844
colon colon adenocarcinoma40.0272108843537
endometrial mixed serous and endometrioid10.00680272108844
endometrial serous endometrial adenocarcinoma60.0408163265306
head and neck head and neck squamous cell carcinoma20.0136054421769
lung lung adenocarcinoma- not otherwise specified (nos)20.0136054421769
lung lung papillary adenocarcinoma10.00680272108844
lung lung squamous cell carcinoma- not otherwise specified (nos)70.047619047619
ovary serous cystadenocarcinoma580.394557823129
rectum rectal adenocarcinoma30.0204081632653
rectum rectal mucinous adenocarcinoma10.00680272108844

Which histological subtypes do focal MYC and TP53 mutations samples come from


In [5]:
source_sites = list()

for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')

output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_histological_sites.png', dpi=200)

count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h


Out[5]:
SiteCountFrequency
brain untreated primary (de novo) gbm10.010752688172
breast infiltrating ductal carcinoma290.311827956989
breast infiltrating lobular carcinoma20.0215053763441
breast nan10.010752688172
breast other specify10.010752688172
colon colon adenocarcinoma20.0215053763441
endometrial mixed serous and endometrioid10.010752688172
endometrial serous endometrial adenocarcinoma40.0430107526882
head and neck head and neck squamous cell carcinoma60.0645161290323
lung lung adenocarcinoma- not otherwise specified (nos)40.0430107526882
lung lung squamous cell carcinoma- not otherwise specified (nos)110.118279569892
ovary serous cystadenocarcinoma290.311827956989
rectum rectal adenocarcinoma20.0215053763441

In [ ]: