Check for correlation between MYC-amplified tumor samples and TP53 mutations. MYC amplification is determined in two different ways:
In [1]:
import glob
import sys
from TCGA_sample import TCGA_sample
from Focal_amplification import Focal_amplification
from CNV_segment import CNV_segment
from Gene import Gene
import pandas
import numpy
import os
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Input locations used by the loading cells below.
# Directory paths keep their trailing slash because file paths and glob
# patterns are built from them by plain string concatenation.

# Tab-separated gene position reference file.
gene_positions_file = "./Ref/genes_unique.txt"

# Per-data-type input directories.
clinical_directory = "./PANCANCER/Clinical/"
cnv_directory = "./PANCANCER/CNV/"
focal_directory = "./PANCANCER/FocalOutput/"
rnaseq_directory = "./PANCANCER/RNASeq/"
somatic_directory = "./PANCANCER/SomaticMutations/"
In [3]:
# Build a lookup from the value in column 4 of the gene position file to a
# Gene object constructed from that row.
gene_positions = dict()
# The context manager closes the file handle even if reading fails; the
# original left it open for the lifetime of the kernel.
with open(gene_positions_file, "r") as GENES:
    # The first line is read separately as the header and is not parsed.
    header = GENES.readline()
    body = GENES.readlines()
for line in body:
    # Drop the line terminator before splitting so the last column (and
    # therefore the dictionary key, if column 4 is last) carries no "\n".
    info = line.rstrip("\r\n").split("\t")
    tmp_gene = Gene(info[4], info[0], info[1], info[2], info[3])
    gene_positions[info[4]] = tmp_gene
We can only use samples for which Focal Amplification data (and hence CNV data) are available.
In [4]:
# Create one TCGA_sample per focal-amplification CSV file and index the
# samples by the first 16 characters of the file name.
print("Loading Focal Amplification data")
samples = dict()
files = glob.glob(focal_directory + "*.csv")
for focal in files:
    sample_id = os.path.basename(focal)[:16]
    # TCGA_sample.sample_ids is consulted to avoid creating the same sample
    # twice (presumably it tracks IDs of existing instances -- confirm in
    # the TCGA_sample class).
    if sample_id not in TCGA_sample.sample_ids:
        sample = TCGA_sample(sample_id)
        sample.loadFocalOutput(focal)
        samples[sample_id] = sample
    else:
        print("Sample ID already exists")
In [5]:
# Attach RNASeq data to every already-known sample that has a matching file.
# Files whose ID (first 16 characters of the file name) is not in `samples`
# are ignored.
print("Loading RNASeq data")
files = glob.glob(rnaseq_directory + "TCGA*.txt")
for rna_file in files:
    sample_id = os.path.basename(rna_file)[:16]
    # Membership is tested on the dict itself (hash lookup); `.keys()` builds
    # a list and scans it linearly for every file on Python 2.
    if sample_id in samples:
        samples[sample_id].loadRNASeq(rna_file)
In [ ]:
# Load RNASeq data of control samples into a separate `controls` dictionary,
# keyed by the first 16 characters of the RNASeq file name.
print("Loading RNASeq data of controls")
controls = dict()
files = glob.glob(rnaseq_directory + "TCGA*.txt")
for rna_file in files:
    # The ID must come from the RNASeq file being processed. The previous
    # code read the stale `focal` variable left over from the focal-output
    # loop, so every iteration saw the same ID.
    sample_id = os.path.basename(rna_file)[:16]
    # Only IDs with '1' at index 13 are treated as controls (presumably the
    # first digit of the TCGA sample-type code, 1x = normal -- confirm).
    # Slicing instead of indexing skips file names shorter than 14
    # characters rather than raising IndexError.
    if sample_id[13:14] == '1':
        control = TCGA_sample(sample_id)
        control.loadRNASeq(rna_file)
        controls[sample_id] = control
In [6]:
# Attach CNV data to every already-known sample that has a matching file.
# Files whose ID (first 16 characters of the file name) is not in `samples`
# are ignored.
print("Loading CNV Data")
files = glob.glob(cnv_directory + "TCGA*.txt")
for cnv_file in files:
    sample_id = os.path.basename(cnv_file)[:16]
    # Membership is tested on the dict itself (hash lookup); `.keys()` builds
    # a list and scans it linearly for every file on Python 2.
    if sample_id in samples:
        samples[sample_id].loadCNVData(cnv_file)
In [7]:
# Attach somatic mutation (MAF) data to the known samples and count the MAF
# files that could not be matched to any sample.
print("Loading Somatic Mutation Data")
files = glob.glob(somatic_directory + "TCGA*.maf.txt")
somatic_not_found = 0
for maf_file in files:
    # Only the first 15 characters of the MAF file name are used; the 16th
    # character of the sample ID is tried as "A" first and then as "B".
    sample_id = os.path.basename(maf_file)[:15] + "A"
    # Membership is tested on the dict itself (hash lookup); `.keys()` builds
    # a list and scans it linearly for every file on Python 2.
    if sample_id in samples:
        samples[sample_id].loadSomaticMutation(maf_file)
    elif sample_id[:-1] + "B" in samples:
        samples[sample_id[:-1] + "B"].loadSomaticMutation(maf_file)
    else:
        somatic_not_found += 1
# Summary is reported once, after all MAF files have been processed.
print(str(somatic_not_found) + " samples have somatic but no CNV data")
In [8]:
# Attach clinical data to each sample whose clinical file exists. The file
# is looked up by the first 12 characters of the sample ID.
print("Loading Clinical Data")
for sample_id in samples:
    clinical_file = clinical_directory + sample_id[:12] + ".txt"
    # Samples without a clinical file are left untouched.
    if not os.path.isfile(clinical_file):
        continue
    samples[sample_id].loadClinicalData(clinical_file)
In [9]:
# Count, per data type, how many loaded samples have that data attached
# (judged by the truthiness of the corresponding attribute), then print a
# short availability report.
focal_samples = cnv_samples = rnaseq_samples = 0
somatic_mutation_samples = clinical_samples = 0
for sample in samples.values():
    focal_samples += 1 if sample.focal_amplification_data else 0
    cnv_samples += 1 if sample.CNV_data else 0
    rnaseq_samples += 1 if sample.rnaseq_data else 0
    somatic_mutation_samples += 1 if sample.somatic_mutation_data else 0
    clinical_samples += 1 if sample.clinical else 0

sample_count = len(samples)
print("Samples: " + str(sample_count))
print(" --Focal Data: " + str(focal_samples))
print(" --CNV: " + str(cnv_samples))
print(" --RNASeq: " + str(rnaseq_samples))
print(" --Somatic: " + str(somatic_mutation_samples))
print(" --Clinical: " + str(clinical_samples))
In [ ]: