In [1]:
    
import NotebookImport
from TCGA_analysis_PanCancer_import import *
from IPython.display import HTML
    
    
In [13]:
    
# 1) check lengths of segments having copynumber > 4 in known focal amplification genes (Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers", Supp Table2 Known Targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
focal_sizes=list()
nonfocal_sizes = list()
for sample in samples.values():
   for gene in beroukhim_genes:
      if gene in gene_positions.keys():
         log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
         if (log2 != None):
            copynumber = 2.0 * 2.0 ** log2
         else:
            copynumber = "n/a"
         if copynumber != "n/a" and copynumber > 4:
            focal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
         if copynumber != "n/a" and copynumber < 4:
            nonfocal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
fig, axs = plt.subplots(1,2,figsize=(12,3))
focal_sizes_series = pandas.Series(focal_sizes)
fig_focal=focal_sizes_series.plot(ax=axs[0],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[0].vlines(x=20000000, ymin=0, ymax=2000, color='r')
axs[0].set_xlabel("Size of target genes at copynumber > 4")
#output=fig_focal.get_figure()
#plt.show()
#output.savefig('BRCA_sizes_copynumber_over4_known.png')
mean_focal = focal_sizes_series.mean()
std_focal = focal_sizes_series.std()
nonfocal_sizes_series = pandas.Series(nonfocal_sizes)
fig_nonfocal=nonfocal_sizes_series.plot(ax=axs[1],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[1].vlines(x=20000000, ymin=0, ymax=10000, color='r')
axs[1].set_xlabel("Size of target genes at copynumber < 4")
#output1=fig_nonfocal.get_figure()
#plt.show()
#output1.savefig('BRCA_sizes_copynumber_lower4_known.png')
mean_nonfocal = nonfocal_sizes_series.mean()
std_nonfocal = nonfocal_sizes_series.std()
print ""
print " 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications"
print "   Segments copynumber > 4 and known focal amplifications:"
print "   Mean: "+str(mean_focal)
print "   Standard Deviation: "+str(std_focal)
print "   ---------------------------------"
print "   Segments copynumber < 4:"
print "   Mean: "+str(mean_nonfocal)
print "   Standard Deviation: "+str(std_nonfocal)
print "   ---------------------------------"
    
    
    
In [14]:
    
# 2) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_size = list()
for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 if sample.getRNASeqFromGene("MYC") != "n/a" and size != None:
    MYC_expression.append(sample.getRNASeqFromGene("MYC"))
    MYC_size.append(size)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC size': MYC_size}
MYC_expr_size=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_size.plot(kind='scatter', x='MYC expression', y='MYC size')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_size)
print ""
print " 2) correlation of lengths and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [15]:
    
# 3) correlation of lengths and gene expression of CCND1
CCND1_expression = list()
CCND1_size = list()
for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if sample.getRNASeqFromGene("CCND1") != "n/a" and size != None:
    CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
    CCND1_size.append(size)
CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 size': CCND1_size}
CCND1_expr_size=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_size.plot(kind='scatter', x='CCND1 expression', y='CCND1 size')
plt.hlines(y=20000000, xmin=-50000, xmax=250000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_size)
print ""
print " 3) correlation of lengths and gene expression of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [16]:
    
# 4) correlation of lengths and gene expression of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_expression = list()
gene_size = list()
for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      if sample.getRNASeqFromGene(gene) != "n/a" and size != None:
         gene_expression.append(sample.getRNASeqFromGene(gene))
         gene_size.append(size)
gene_expr_tuple={'Gene expression': gene_expression,'Gene size': gene_size}
gene_expr_size=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_size.plot(kind='scatter', x='Gene expression', y='Gene size')
plt.hlines(y=20000000, xmin=-50000, xmax=400000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_size)
print ""
print " 4) correlation of lengths and gene expression of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [17]:
    
# 5) correlation of lengths and copynumber of MYC
MYC_copynumber = list()
MYC_size = list()
for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
 log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    MYC_copynumber.append(copynumber)
    MYC_size.append(size)
MYC_copynumber_tuple={'MYC copynumber': MYC_copynumber,'MYC size': MYC_size}
MYC_copynumber_size=pandas.DataFrame(MYC_copynumber_tuple)
fig_correlation= MYC_copynumber_size.plot(kind='scatter', x='MYC copynumber', y='MYC size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_copynumber,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_copynumber,MYC_size)
print ""
print " 5) correlation of lengths and copynumber of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [18]:
    
# 6) correlation of lengths and copynumber of CCND1
CCND1_copynumber = list()
CCND1_size = list()
for sample in samples.values():
 size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
 if (log2 != None):
    copynumber = 2.0 * 2.0 ** log2
 else:
    copynumber = "n/a"
 if copynumber != "n/a" and size != None:
    CCND1_copynumber.append(copynumber)
    CCND1_size.append(size)
CCND1_copynumber_tuple={'CCND1 copynumber': CCND1_copynumber,'CCND1 size': CCND1_size}
CCND1_copynumber_size=pandas.DataFrame(CCND1_copynumber_tuple)
fig_correlation= CCND1_copynumber_size.plot(kind='scatter', x='CCND1 copynumber', y='CCND1 size')
plt.hlines(y=20000000, xmin=1, xmax=16, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_copynumber,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_copynumber,CCND1_size)
print ""
print " 6) correlation of lengths and copynumber of CCND1"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [20]:
    
# 7) correlation of lengths and copynumber of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_copynumber = list()
gene_size = list()
for sample in samples.values():
   for gene in beroukhim_genes:
      size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
      log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
      if (log2 != None):
         copynumber = 2.0 * 2.0 ** log2
      else:
         copynumber = "n/a"
      if copynumber != "n/a" and size != None:
         gene_copynumber.append(copynumber)
         gene_size.append(size)
gene_copynumber_tuple={'Gene copynumber': gene_copynumber,'Gene size': gene_size}
gene_copynumber_size=pandas.DataFrame(gene_copynumber_tuple)
fig_correlation= gene_copynumber_size.plot(kind='scatter', x='Gene copynumber', y='Gene size')
plt.hlines(y=20000000, xmin=0, xmax=50, color='r')
plt.vlines(x=4, ymin=0, ymax=250000000, color='g')
output1=fig_correlation.get_figure()
#plt.show()
output1.savefig('PanCancer_copynumber_vs_size_beroukhim_targets.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(gene_copynumber,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_copynumber,gene_size)
print ""
print " 7) correlation of lengths and copynumber of Beroukhim targets"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [4]:
    
# 8) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_copynumber = list()
for sample in samples.values():
   size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
   log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
   if (log2 != None):
      copynumber = 2.0 * 2.0 ** log2
   else:
      copynumber = "n/a"
   if sample.getRNASeqFromGene("MYC") != "n/a" and copynumber != "n/a":
      MYC_expression.append(sample.getRNASeqFromGene("MYC"))
      MYC_copynumber.append(size)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC copynumber': MYC_copynumber}
MYC_expr_copynumber=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_copynumber.plot(kind='scatter', x='MYC expression', y='MYC copynumber')
plt.hlines(y=20000000, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_copynumber)
print ""
print " 8) correlation of copynumber and gene expression of MYC"
print "   Pearson Correlation coefficient: "+str(corr_coeff)
print "   P-value: "+str(p_value)
print "   -----------------------"
print "   Spearman Correlation coefficient: "+str(corr_coeff_spear)
print "   P-value: "+str(p_value_spear)
    
    
    
In [20]:
    
# 7) Check occurrence of focal amplifications of Beroukhim targets
beroukhim_genes={"MYC" : 0,"CCND1" : 0,"ERBB2" : 0,"CDK4" : 0,"NKX2-1" : 0,"MDM2" : 0,"EGFR" : 0,"MCL1" : 0,"FGFR1" : 0,"KRAS" : 0,"CCNE1" : 0,"CRKL" : 0,"HMGA2" : 0,"TERT" : 0,"PRKCI" : 0,"IGF1R" : 0,"MYCL" : 0,"MYCN" : 0,"CDK6" : 0,"BCL2L1" : 0,"MYB" : 0,"MET" : 0,"JUN" : 0,"BIRC2" : 0,"YAP1" : 0,"PDGFRA" : 0,"KIT" : 0,"PIK3CA" : 0,"MDM4" : 0,"AR" : 0}
for sample in samples.values():
   for gene in beroukhim_genes.keys():
      if sample.checkFocalGeneAmp(gene):
         beroukhim_genes[gene] += 1   
focal_amp_series=pandas.Series(beroukhim_genes)
focal_amp_series.sort(ascending=False)
fig_correlation= focal_amp_series.plot(kind='bar')
print ""
print " 7) Check occurrence of focal amplifications of Beroukhim targets"
print ""
    
    
    
In [21]:
    
# 8) Get 50 most frequent genes in focal events
gene_amp_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalAmpGenes():
       if gene in gene_amp_frequency.keys():
          gene_amp_frequency[gene] += 1
       else:
          gene_amp_frequency[gene] = 1
gene_del_frequency = dict()
for sample in samples.values():
   for gene in sample.listFocalDelGenes():
      if gene in gene_del_frequency.keys():
         gene_del_frequency[gene] += 1
      else:
         gene_del_frequency[gene] = 1
print ""
print " 8) Get 20 most frequent genes in focal amplifications/deletions"
print "   Amplifications:"
count = 0
for gene in sorted(gene_amp_frequency, key=gene_amp_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_amp_frequency[gene])
 if (count > 19):
   break
count = 0
print ""
print "   Deletions:"
for gene in sorted(gene_del_frequency, key=gene_del_frequency.get, reverse=True):
 count += 1
 if (gene in gene_positions.keys()):
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])+"  Pos: "+gene_positions[gene].returnPos()
 else:
    print "Gene: "+gene+"  Freq: "+str(gene_del_frequency[gene])
 if (count > 19):
   break
    
    
Numbers in diagonal of the table represent all gene amplifications found
In [22]:
    
# 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        MYC_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            MYC_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("FGFR1"):
            MYC_and_FGFR1_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            MYC_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            MYC_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            MYC_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("FGFR1"):
        FGFR1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            FGFR1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCND1"):
            FGFR1_and_CCND1_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            FGFR1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            FGFR1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCND1"):
        CCND1_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            CCND1_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("ERBB2"):
            CCND1_and_ERBB2_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CCND1_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("ERBB2"):
        ERBB2_count += 1
        if sample.checkFocalGeneDel("CDKN2A"):
            ERBB2_and_CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            ERBB2_and_CCNE1_count += 1
    if sample.checkFocalGeneDel("CDKN2A"):
        CDKN2A_count += 1
        if sample.checkFocalGeneAmp("CCNE1"):
            CDKN2A_and_CCNE1_count += 1
    if sample.checkFocalGeneAmp("CCNE1"):
        CCNE1_count += 1
print ""
print " 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
    
    
    Out[22]:
In [23]:
    
# 10) Check for cooccurence of amplifications (copynumber > 4) MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"    
    log2_FGFR1 = sample.getLog2FromGene(gene_positions["FGFR1"].chrom,gene_positions["FGFR1"].start, gene_positions["FGFR1"].end)
    if (log2_FGFR1 != None):
       copynumber_FGFR1 = 2.0 * 2.0 ** log2_FGFR1
    else:
       copynumber_FGFR1 = "n/a"
    log2_CDKN2A = sample.getLog2FromGene(gene_positions["CDKN2A"].chrom,gene_positions["CDKN2A"].start, gene_positions["CDKN2A"].end)
    log2_CCND1 = sample.getLog2FromGene(gene_positions["CCND1"].chrom,gene_positions["CCND1"].start, gene_positions["CCND1"].end)
    if (log2_CCND1 != None):
       copynumber_CCND1 = 2.0 * 2.0 ** log2_CCND1
    else:
       copynumber_CCND1 = "n/a"
    log2_ERBB2 = sample.getLog2FromGene(gene_positions["ERBB2"].chrom,gene_positions["ERBB2"].start, gene_positions["ERBB2"].end)
    if (log2_ERBB2 != None):
       copynumber_ERBB2 = 2.0 * 2.0 ** log2_ERBB2
    else:
       copynumber_ERBB2 = "n/a"
    log2_CCNE1 = sample.getLog2FromGene(gene_positions["CCNE1"].chrom,gene_positions["CCNE1"].start, gene_positions["CCNE1"].end)
    if (log2_CCNE1 != None):
       copynumber_CCNE1 = 2.0 * 2.0 ** log2_CCNE1
    else:
       copynumber_CCNE1 = "n/a"
        
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        MYC_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            MYC_and_CDKN2A_count += 1
        if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
            MYC_and_FGFR1_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            MYC_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            MYC_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            MYC_and_CCNE1_count += 1
    if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
        FGFR1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            FGFR1_and_CDKN2A_count += 1
        if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
            FGFR1_and_CCND1_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            FGFR1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            FGFR1_and_CCNE1_count += 1
    if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
        CCND1_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            CCND1_and_CDKN2A_count += 1
        if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
            CCND1_and_ERBB2_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CCND1_and_CCNE1_count += 1
    if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
        ERBB2_count += 1
        if log2_CDKN2A != None and log2_CDKN2A < -0.3:
            ERBB2_and_CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            ERBB2_and_CCNE1_count += 1
    if log2_CDKN2A != None and log2_CDKN2A < -0.3:
        CDKN2A_count += 1
        if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
            CDKN2A_and_CCNE1_count += 1
    if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
        CCNE1_count += 1
print ""
print " 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print "  MYC amplified: "+str(MYC_count)
print "  CDKNA2 deleted: "+str(CDKN2A_count)
print "  FGFR1 amplified: "+str(FGFR1_count)
print "  CCND1 amplified: "+str(CCND1_count)
print "  ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
    
    
    Out[23]:
Leiserson: Log2-ratio > 0.9
Amplitude: copynumber > 4 (means: log2-ratio > 1)
Size:      smaller than 20Mbp, Log2-ratio > 0.2; log2-ratio > 0.2 higher than surrounding 20Mbp on either side
In [ ]:
    
size = list()
amplitude = list()
leiserson = list()
for sample in samples.values():
    if sample.checkFocalGeneAmp("MYC"):
        size.append(sample.id)
    log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
    if (log2_MYC != None):
       copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
       copynumber_MYC = "n/a"  
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        amplitude.append(sample.id)
    if log2_MYC > 0.9:
        leiserson.append(sample.id)
print "MYC amplification definition"
print "----------------------------"
print "MYC amplified defined by size and local amplitude: "+str(len(size))
print "MYC amplified defined by amplitude (Copynumber > 4): "+str(len(amplitude))
print "MYC amplified defined by Leiserson (Log2-Ratio > 0.9): "+str(len(leiserson))
plt.figure(figsize=(10,10))
matplotlib_venn.venn3([set(size),set(amplitude),set(leiserson)],set_labels=["Size", "Copynumber", "Leiserson"])
    
In [3]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    if sample.checkFocalGeneAmp("MYC"):
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_focal_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[3]:
    
In [13]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn4_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[13]:
    
In [9]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[9]:
    
In [8]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[8]:
    
In [7]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 6:
        source_sites.append(sample.primarysiteofdesease)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['blue','red','yellow','green','purple', 'navy', 'gray', 'black', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_cn6_tissue_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[7]:
    
In [6]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    copynumber_MYC = "n/a"
    log2_MYC = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
    if (log2_MYC != None):
        copynumber_MYC = 2.0 * 2.0 ** log2_MYC
    else:
        copynumber_MYC="n/a"
    if copynumber_MYC != "n/a" and copynumber_MYC > 4 and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_MYC_TP53_cn4_histological_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[6]:
    
In [5]:
    
source_sites = list()
for sample in samples.values():
    if not sample.clinical:
        continue
    if not sample.somatic_mutation_data:
        continue
    if sample.checkFocalGeneAmp("MYC") and "TP53" in sample.genes_affected:
        source_sites.append(sample.primarysiteofdesease+" "+sample.histologicaltype)
        
source_sites_series = pandas.Categorical(sorted(source_sites))
fig_sourcesites=source_sites_series.describe().counts.plot(kind='pie', figsize=(6, 6),colors=['red','yellow','green','purple', 'navy', 'gray', 'orange', 'yellowgreen'])
fig_sourcesites.set_ylabel('')
output1=fig_sourcesites.get_figure()
output1.savefig('PanCancer_focalMYC_TP53_histological_sites.png', dpi=200)
count = 0
s = "<table><tr><th>Site</th><th>Count</th><th>Frequency</th></tr>"
for i in source_sites_series.categories:
   s += "<tr><td>"+i+"</td><td>"+str(source_sites_series.describe()['counts'][count])+"</td><td>"+str(source_sites_series.describe()['freqs'][count])+"</td></tr>"
   count += 1
s += "</table>"
h = HTML(s);h
    
    Out[5]:
    
In [ ]: