In [1]:
import NotebookImport
from TCGA_analysis_BRCA_import import *
from IPython.display import HTML
In [2]:
# 1) check lengths of segments having copynumber > 4 in known focal amplification genes (Beroukhim et al, Nature 2010 "The landscape of somatic copy-number alteration across human cancers", Supp Table2 Known Targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
focal_sizes=list()
nonfocal_sizes = list()
for sample in samples.values():
for gene in beroukhim_genes:
if gene in gene_positions.keys():
log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and copynumber > 4:
focal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
if copynumber != "n/a" and copynumber < 4:
nonfocal_sizes.append(sample.getSizeFromPosition(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end))
fig, axs = plt.subplots(1,2,figsize=(12,3))
focal_sizes_series = pandas.Series(focal_sizes)
fig_focal=focal_sizes_series.plot(ax=axs[0],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[0].vlines(x=20000000, ymin=0, ymax=500, color='r')
axs[0].set_xlabel("Size of target genes at copynumber > 4")
axs[0].set_ylabel("Frequency")
#output=fig_focal.get_figure()
#plt.show()
#output.savefig('BRCA_sizes_copynumber_over4_known.png')
mean_focal = focal_sizes_series.mean()
std_focal = focal_sizes_series.std()
nonfocal_sizes_series = pandas.Series(nonfocal_sizes)
fig_nonfocal=nonfocal_sizes_series.plot(ax=axs[1],kind='hist', alpha=0.5, range=(0,50000000), y='Frequency', x='Size in 10Mbp')
axs[1].vlines(x=20000000, ymin=0, ymax=1800, color='r')
axs[1].set_xlabel("Size of target genes at copynumber < 4")
axs[1].set_ylabel("Frequency")
output1=fig_nonfocal.get_figure()
#plt.show()
output1.savefig('BRCA_sizes_copynumber_lower4_known.png', dpi=200)
mean_nonfocal = nonfocal_sizes_series.mean()
std_nonfocal = nonfocal_sizes_series.std()
print ""
print " 1) check focal size distribution for samples having copynumber > 4 in genes with known focal amplifications"
print " Segments copynumber > 4 and known focal amplifications:"
print " Mean: "+str(mean_focal)
print " Standard Deviation: "+str(std_focal)
print " ---------------------------------"
print " Segments copynumber < 4:"
print " Mean: "+str(mean_nonfocal)
print " Standard Deviation: "+str(std_nonfocal)
print " ---------------------------------"
In [3]:
# 2) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
if sample.getRNASeqFromGene("MYC") != "n/a" and size != None:
MYC_expression.append(sample.getRNASeqFromGene("MYC"))
MYC_size.append(size)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC size': MYC_size}
MYC_expr_size=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_size.plot(kind='scatter', x='MYC expression', y='MYC size')
plt.hlines(y=20000000, xmin=-1000, xmax=15000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_size)
print ""
print " 2) correlation of lengths and gene expression of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [4]:
# 3) correlation of lengths and gene expression of CCND1
CCND1_expression = list()
CCND1_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
if sample.getRNASeqFromGene("CCND1") != "n/a" and size != None:
CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
CCND1_size.append(size)
CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 size': CCND1_size}
CCND1_expr_size=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_size.plot(kind='scatter', x='CCND1 expression', y='CCND1 size')
plt.hlines(y=20000000, xmin=-50000, xmax=250000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_size)
print ""
print " 3) correlation of lengths and gene expression of CCND1"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [5]:
# 4) correlation of lengths and gene expression of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_expression = list()
gene_size = list()
for sample in samples.values():
for gene in beroukhim_genes:
size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
if sample.getRNASeqFromGene(gene) != "n/a" and size != None:
gene_expression.append(sample.getRNASeqFromGene(gene))
gene_size.append(size)
gene_expr_tuple={'Gene expression': gene_expression,'Gene size': gene_size}
gene_expr_size=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_size.plot(kind='scatter', x='Gene expression', y='Gene size')
plt.hlines(y=20000000, xmin=-50000, xmax=400000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_size)
print ""
print " 4) correlation of lengths and gene expression of Beroukhim targets"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [6]:
# 5) correlation of lengths and copynumber of MYC
MYC_copynumber = list()
MYC_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['MYC'].chrom, gene_positions['MYC'].start, gene_positions['MYC'].end)
log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
MYC_copynumber.append(copynumber)
MYC_size.append(size)
MYC_copynumber_tuple={'MYC copynumber': MYC_copynumber,'MYC size': MYC_size}
MYC_copynumber_size=pandas.DataFrame(MYC_copynumber_tuple)
fig_correlation= MYC_copynumber_size.plot(kind='scatter', x='MYC copynumber', y='MYC size')
ax = plt.gca()
ax.set_xlim(0,20)
ax.set_ylim(-10000000,150000000)
plt.hlines(y=20000000, xmin=0, xmax=40, color='r')
plt.vlines(x=4, ymin=0, ymax=150000000, color='g')
output1=fig_correlation.get_figure()
output1.savefig('BRCA_copynumber_vs_size_MYC.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(MYC_copynumber,MYC_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_copynumber,MYC_size)
print ""
print " 5) correlation of lengths and copynumber of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [7]:
# 6) correlation of lengths and copynumber of CCND1
CCND1_copynumber = list()
CCND1_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['CCND1'].chrom, gene_positions['CCND1'].start, gene_positions['CCND1'].end)
log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
CCND1_copynumber.append(copynumber)
CCND1_size.append(size)
CCND1_copynumber_tuple={'CCND1 copynumber': CCND1_copynumber,'CCND1 size': CCND1_size}
CCND1_copynumber_size=pandas.DataFrame(CCND1_copynumber_tuple)
fig_correlation= CCND1_copynumber_size.plot(kind='scatter', x='CCND1 copynumber', y='CCND1 size')
ax = plt.gca()
ax.set_xlim(0,20)
ax.set_ylim(-10000000,150000000)
plt.hlines(y=20000000, xmin=0, xmax=40, color='r')
plt.vlines(x=4, ymin=0, ymax=150000000, color='g')
output1=fig_correlation.get_figure()
output1.savefig('BRCA_copynumber_vs_size_CCND1.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_copynumber,CCND1_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_copynumber,CCND1_size)
print ""
print " 6) correlation of lengths and copynumber of CCND1"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [8]:
# 6) correlation of lengths and copynumber of EGFR
EGFR_copynumber = list()
EGFR_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['EGFR'].chrom, gene_positions['EGFR'].start, gene_positions['EGFR'].end)
log2 = sample.getLog2FromGene(gene_positions['EGFR'].chrom,gene_positions['EGFR'].start, gene_positions['EGFR'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
EGFR_copynumber.append(copynumber)
EGFR_size.append(size)
EGFR_copynumber_tuple={'EGFR copynumber': EGFR_copynumber,'EGFR size': EGFR_size}
EGFR_copynumber_size=pandas.DataFrame(EGFR_copynumber_tuple)
fig_correlation= EGFR_copynumber_size.plot(kind='scatter', x='EGFR copynumber', y='EGFR size')
ax = plt.gca()
ax.set_xlim(0,20)
ax.set_ylim(-10000000,150000000)
plt.hlines(y=20000000, xmin=0, xmax=40, color='r')
plt.vlines(x=4, ymin=0, ymax=150000000, color='g')
output1=fig_correlation.get_figure()
output1.savefig('BRCA_copynumber_vs_size_EGFR.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(EGFR_copynumber,EGFR_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(EGFR_copynumber,EGFR_size)
print ""
print " 6) correlation of lengths and copynumber of EGFR"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [9]:
# 6) correlation of lengths and copynumber of ERBB2
ERBB2_copynumber = list()
ERBB2_size = list()
for sample in samples.values():
size = sample.getSizeFromPosition(gene_positions['ERBB2'].chrom, gene_positions['ERBB2'].start, gene_positions['ERBB2'].end)
log2 = sample.getLog2FromGene(gene_positions['ERBB2'].chrom,gene_positions['ERBB2'].start, gene_positions['ERBB2'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
ERBB2_copynumber.append(copynumber)
ERBB2_size.append(size)
ERBB2_copynumber_tuple={'ERBB2 copynumber': ERBB2_copynumber,'ERBB2 size': ERBB2_size}
ERBB2_copynumber_size=pandas.DataFrame(ERBB2_copynumber_tuple)
fig_correlation= ERBB2_copynumber_size.plot(kind='scatter', x='ERBB2 copynumber', y='ERBB2 size')
ax = plt.gca()
ax.set_xlim(0,20)
ax.set_ylim(-10000000,150000000)
plt.hlines(y=20000000, xmin=0, xmax=40, color='r')
plt.vlines(x=4, ymin=0, ymax=150000000, color='g')
output1=fig_correlation.get_figure()
output1.savefig('BRCA_copynumber_vs_size_ERBB2.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(ERBB2_copynumber,ERBB2_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(ERBB2_copynumber,ERBB2_size)
print ""
print " 6) correlation of lengths and copynumber of ERBB2"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [10]:
# 7) correlation of lengths and copynumber of Beroukhim targets
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
gene_copynumber = list()
gene_size = list()
for sample in samples.values():
for gene in beroukhim_genes:
size = sample.getSizeFromPosition(gene_positions[gene].chrom, gene_positions[gene].start, gene_positions[gene].end)
log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if copynumber != "n/a" and size != None:
gene_copynumber.append(copynumber)
gene_size.append(size)
gene_copynumber_tuple={'Gene copynumber': gene_copynumber,'Gene size': gene_size}
gene_copynumber_size=pandas.DataFrame(gene_copynumber_tuple)
fig_correlation= gene_copynumber_size.plot(kind='scatter', x='Gene copynumber', y='Gene size')
ax = plt.gca()
ax.set_xlim(0,20)
ax.set_ylim(-10000000,150000000)
plt.hlines(y=20000000, xmin=0, xmax=40, color='r')
plt.vlines(x=4, ymin=0, ymax=150000000, color='g')#output1=fig_correlation.get_figure()
#plt.show()
output1.savefig('BRCA_copynumber_vs_size_beroukhim_targets.png', dpi=200)
corr_coeff, p_value = scipy.stats.pearsonr(gene_copynumber,gene_size)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_copynumber,gene_size)
print ""
print " 7) correlation of lengths and copynumber of Beroukhim targets"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [11]:
# 8) correlation of lengths and gene expression of MYC
MYC_expression = list()
MYC_copynumber = list()
for sample in samples.values():
log2 = sample.getLog2FromGene(gene_positions['MYC'].chrom,gene_positions['MYC'].start, gene_positions['MYC'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if sample.getRNASeqFromGene("MYC") != "n/a" and copynumber != "n/a":
MYC_expression.append(sample.getRNASeqFromGene("MYC"))
MYC_copynumber.append(copynumber)
MYC_expr_tuple={'MYC expression': MYC_expression,'MYC copynumber': MYC_copynumber}
MYC_expr_copynumber=pandas.DataFrame(MYC_expr_tuple)
fig_correlation= MYC_expr_copynumber.plot(kind='scatter', x='MYC expression', y='MYC copynumber')
plt.hlines(y=2, xmin=-1000, xmax=30000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(MYC_expression,MYC_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(MYC_expression,MYC_copynumber)
print ""
print " 8) correlation of copynumber and gene expression of MYC"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [12]:
# 8) correlation of lengths and gene expression of CCND1
CCND1_expression = list()
CCND1_copynumber = list()
for sample in samples.values():
log2 = sample.getLog2FromGene(gene_positions['CCND1'].chrom,gene_positions['CCND1'].start, gene_positions['CCND1'].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if sample.getRNASeqFromGene("CCND1") != "n/a" and copynumber != "n/a":
CCND1_expression.append(sample.getRNASeqFromGene("CCND1"))
CCND1_copynumber.append(copynumber)
CCND1_expr_tuple={'CCND1 expression': CCND1_expression,'CCND1 copynumber': CCND1_copynumber}
CCND1_expr_copynumber=pandas.DataFrame(CCND1_expr_tuple)
fig_correlation= CCND1_expr_copynumber.plot(kind='scatter', x='CCND1 expression', y='CCND1 copynumber')
plt.hlines(y=2, xmin=-1000, xmax=200000, color='r')
#output1=fig_correlation.get_figure()
#plt.show()
#output1.savefig('BRCA_expression_vs_size_MYC.png')
corr_coeff, p_value = scipy.stats.pearsonr(CCND1_expression,CCND1_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(CCND1_expression,CCND1_copynumber)
print ""
print " 8) correlation of copynumber and gene expression of CCND1"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [13]:
# 8) correlation of lengths and gene expression of Beroukhim targets
gene_expression = list()
gene_copynumber = list()
beroukhim_genes=["MYC","CCND1","ERBB2","CDK4","NKX2-1","MDM2","EGFR","MCL1","FGFR1","KRAS","CCNE1","CRKL","HMGA2","TERT","PRKCI","IGF1R","MYCL","MYCN","CDK6","BCL2L1","MYB","MET","JUN","BIRC2","YAP1","PDGFRA","KIT","PIK3CA","MDM4","AR"]
for sample in samples.values():
for gene in beroukhim_genes:
log2 = sample.getLog2FromGene(gene_positions[gene].chrom,gene_positions[gene].start, gene_positions[gene].end)
if (log2 != None):
copynumber = 2.0 * 2.0 ** log2
else:
copynumber = "n/a"
if sample.getRNASeqFromGene(gene) != "n/a" and copynumber != "n/a":
gene_expression.append(sample.getRNASeqFromGene(gene))
gene_copynumber.append(copynumber)
gene_expr_tuple={'expression': gene_expression,'copynumber': gene_copynumber}
gene_expr_copynumber=pandas.DataFrame(gene_expr_tuple)
fig_correlation= gene_expr_copynumber.plot(kind='scatter', x='expression', y='copynumber')
plt.hlines(y=2, xmin=-1000, xmax=300000, color='r')
corr_coeff, p_value = scipy.stats.pearsonr(gene_expression,gene_copynumber)
corr_coeff_spear, p_value_spear = scipy.stats.spearmanr(gene_expression,gene_copynumber)
print ""
print " 8) correlation of copynumber and gene expression of Beroukhim targets"
print " Pearson Correlation coefficient: "+str(corr_coeff)
print " P-value: "+str(p_value)
print " -----------------------"
print " Spearman Correlation coefficient: "+str(corr_coeff_spear)
print " P-value: "+str(p_value_spear)
In [14]:
# 7) Check occurrence of focal amplifications of Beroukhim targets
beroukhim_genes={"MYC" : 0,"CCND1" : 0,"ERBB2" : 0,"CDK4" : 0,"NKX2-1" : 0,"MDM2" : 0,"EGFR" : 0,"MCL1" : 0,"FGFR1" : 0,"KRAS" : 0,"CCNE1" : 0,"CRKL" : 0,"HMGA2" : 0,"TERT" : 0,"PRKCI" : 0,"IGF1R" : 0,"MYCL" : 0,"MYCN" : 0,"CDK6" : 0,"BCL2L1" : 0,"MYB" : 0,"MET" : 0,"JUN" : 0,"BIRC2" : 0,"YAP1" : 0,"PDGFRA" : 0,"KIT" : 0,"PIK3CA" : 0,"MDM4" : 0,"AR" : 0}
for sample in samples.values():
for gene in beroukhim_genes.keys():
if sample.checkFocalGeneAmp(gene):
beroukhim_genes[gene] += 1
focal_amp_series=pandas.Series(beroukhim_genes)
focal_amp_series.sort(ascending=False)
fig_correlation= focal_amp_series.plot(kind='bar')
print ""
print " 7) Check occurrence of focal amplifications of Beroukhim targets"
print ""
In [15]:
# 8) Get 50 most frequent genes in focal events
gene_amp_frequency = dict()
for sample in samples.values():
for gene in sample.listFocalAmpGenes():
if gene in gene_amp_frequency.keys():
gene_amp_frequency[gene] += 1
else:
gene_amp_frequency[gene] = 1
gene_del_frequency = dict()
for sample in samples.values():
for gene in sample.listFocalDelGenes():
if gene in gene_del_frequency.keys():
gene_del_frequency[gene] += 1
else:
gene_del_frequency[gene] = 1
print ""
print " 8) Get 20 most frequent genes in focal amplifications/deletions"
print " Amplifications:"
count = 0
for gene in sorted(gene_amp_frequency, key=gene_amp_frequency.get, reverse=True):
count += 1
if (gene in gene_positions.keys()):
print "Gene: "+gene+" Freq: "+str(gene_amp_frequency[gene])+" Pos: "+gene_positions[gene].returnPos()
else:
print "Gene: "+gene+" Freq: "+str(gene_amp_frequency[gene])
if (count > 19):
break
count = 0
print ""
print " Deletions:"
for gene in sorted(gene_del_frequency, key=gene_del_frequency.get, reverse=True):
count += 1
if (gene in gene_positions.keys()):
print "Gene: "+gene+" Freq: "+str(gene_del_frequency[gene])+" Pos: "+gene_positions[gene].returnPos()
else:
print "Gene: "+gene+" Freq: "+str(gene_del_frequency[gene])
if (count > 19):
break
Numbers in diagonal of the table represent all gene amplifications found
In [16]:
# 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
if sample.checkFocalGeneAmp("MYC"):
MYC_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
MYC_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("FGFR1"):
MYC_and_FGFR1_count += 1
if sample.checkFocalGeneAmp("CCND1"):
MYC_and_CCND1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
MYC_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
MYC_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("FGFR1"):
FGFR1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
FGFR1_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCND1"):
FGFR1_and_CCND1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
FGFR1_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
FGFR1_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("CCND1"):
CCND1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
CCND1_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
CCND1_and_ERBB2_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CCND1_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("ERBB2"):
ERBB2_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
ERBB2_and_CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
ERBB2_and_CCNE1_count += 1
if sample.checkFocalGeneDel("CDKN2A"):
CDKN2A_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CDKN2A_and_CCNE1_count += 1
if sample.checkFocalGeneAmp("CCNE1"):
CCNE1_count += 1
print ""
print " 9) Check for cooccurence of focal amplifications in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print " MYC amplified: "+str(MYC_count)
print " CDKNA2 deleted: "+str(CDKN2A_count)
print " FGFR1 amplified: "+str(FGFR1_count)
print " CCND1 amplified: "+str(CCND1_count)
print " ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
Out[16]:
In [17]:
# 10) Check for cooccurence of amplifications (copynumber > 4) MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1
MYC_count = 0
MYC_and_CDKN2A_count = 0
MYC_and_FGFR1_count = 0
MYC_and_CCND1_count = 0
MYC_and_ERBB2_count = 0
MYC_and_CCNE1_count = 0
FGFR1_count = 0
FGFR1_and_CDKN2A_count = 0
FGFR1_and_CCND1_count = 0
FGFR1_and_ERBB2_count = 0
FGFR1_and_CCNE1_count = 0
CCND1_count = 0
CCND1_and_CDKN2A_count = 0
CCND1_and_ERBB2_count = 0
CCND1_and_CCNE1_count = 0
ERBB2_count = 0
ERBB2_and_CDKN2A_count = 0
ERBB2_and_CCNE1_count = 0
CDKN2A_count = 0
CDKN2A_and_CCNE1_count = 0
CCNE1_count = 0
for sample in samples.values():
log2_MYC = sample.getLog2FromGene(gene_positions["MYC"].chrom,gene_positions["MYC"].start, gene_positions["MYC"].end)
if (log2_MYC != None):
copynumber_MYC = 2.0 * 2.0 ** log2_MYC
else:
copynumber_MYC = "n/a"
log2_FGFR1 = sample.getLog2FromGene(gene_positions["FGFR1"].chrom,gene_positions["FGFR1"].start, gene_positions["FGFR1"].end)
if (log2_FGFR1 != None):
copynumber_FGFR1 = 2.0 * 2.0 ** log2_FGFR1
else:
copynumber_FGFR1 = "n/a"
log2_CDKN2A = sample.getLog2FromGene(gene_positions["CDKN2A"].chrom,gene_positions["CDKN2A"].start, gene_positions["CDKN2A"].end)
log2_CCND1 = sample.getLog2FromGene(gene_positions["CCND1"].chrom,gene_positions["CCND1"].start, gene_positions["CCND1"].end)
if (log2_CCND1 != None):
copynumber_CCND1 = 2.0 * 2.0 ** log2_CCND1
else:
copynumber_CCND1 = "n/a"
log2_ERBB2 = sample.getLog2FromGene(gene_positions["ERBB2"].chrom,gene_positions["ERBB2"].start, gene_positions["ERBB2"].end)
if (log2_ERBB2 != None):
copynumber_ERBB2 = 2.0 * 2.0 ** log2_ERBB2
else:
copynumber_ERBB2 = "n/a"
log2_CCNE1 = sample.getLog2FromGene(gene_positions["CCNE1"].chrom,gene_positions["CCNE1"].start, gene_positions["CCNE1"].end)
if (log2_CCNE1 != None):
copynumber_CCNE1 = 2.0 * 2.0 ** log2_CCNE1
else:
copynumber_CCNE1 = "n/a"
if copynumber_MYC != "n/a" and copynumber_MYC > 4:
MYC_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
MYC_and_CDKN2A_count += 1
if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
MYC_and_FGFR1_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
MYC_and_CCND1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
MYC_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
MYC_and_CCNE1_count += 1
if copynumber_FGFR1 != "n/a" and copynumber_FGFR1 > 4:
FGFR1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
FGFR1_and_CDKN2A_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
FGFR1_and_CCND1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
FGFR1_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
FGFR1_and_CCNE1_count += 1
if copynumber_CCND1 != "n/a" and copynumber_CCND1 > 4:
CCND1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
CCND1_and_CDKN2A_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
CCND1_and_ERBB2_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CCND1_and_CCNE1_count += 1
if copynumber_ERBB2 != "n/a" and copynumber_ERBB2 > 4:
ERBB2_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
ERBB2_and_CDKN2A_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
ERBB2_and_CCNE1_count += 1
if log2_CDKN2A != None and log2_CDKN2A < -0.3:
CDKN2A_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CDKN2A_and_CCNE1_count += 1
if copynumber_CCNE1 != "n/a" and copynumber_CCNE1 > 4:
CCNE1_count += 1
print ""
print " 9) Check for cooccurence of amplifications (copynumber > 4) in MYC, CDKN2A, FGFR1, CCND1, ERBB2, CCNE1"
print ""
print " MYC amplified: "+str(MYC_count)
print " CDKNA2 deleted: "+str(CDKN2A_count)
print " FGFR1 amplified: "+str(FGFR1_count)
print " CCND1 amplified: "+str(CCND1_count)
print " ERBB2 amplified: "+str(ERBB2_count)
print ""
s = "<table><tr><th></th><th>MYC</th><th>CDKN2A</th><th>FGFR1</th><th>CCND1</th><th>ERBB2</th><th>CCNE1</th></tr>"
s += "<tr><th>MYC</th><td><b>"+str(MYC_count)+"</b></td><td>"+str(MYC_and_CDKN2A_count)+"</td><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(MYC_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CDKN2A</th><td>"+str(MYC_and_CDKN2A_count)+"</td><td><b>"+str(CDKN2A_count)+"</b></td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>FGFR1</th><td>"+str(MYC_and_FGFR1_count)+"</td><td>"+str(FGFR1_and_CDKN2A_count)+"</td><td><b>"+str(FGFR1_count)+"</b></td><td>"+str(FGFR1_and_CCND1_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCND1</th><td>"+str(MYC_and_CCND1_count)+"</td><td>"+str(CCND1_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_CCND1_count)+"</td><td><b>"+str(CCND1_count)+"</b></td><td>"+str(CCND1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>ERBB2</th><td>"+str(MYC_and_ERBB2_count)+"</td><td>"+str(ERBB2_and_CDKN2A_count)+"</td><td>"+str(FGFR1_and_ERBB2_count)+"</td><td>"+str(CCND1_and_ERBB2_count)+"</td><td><b>"+str(ERBB2_count)+"</b></td><td>"+str(ERBB2_and_CCNE1_count)+"</td></tr>"
s += "<tr><th>CCNE1</th><td>"+str(MYC_and_CCNE1_count)+"</td><td>"+str(CDKN2A_and_CCNE1_count)+"</td><td>"+str(FGFR1_and_CCNE1_count)+"</td><td>"+str(CCND1_and_CCNE1_count)+"</td><td>"+str(ERBB2_and_CCNE1_count)+"</td><td><b>"+str(CCNE1_count)+"</b></td></tr>"
s += "</table>"
h = HTML(s);h
Out[17]:
In [17]: