Check for a correlation between MYC-amplified tumor samples and TP53 mutations. MYC amplification is determined in two different ways:
In [1]:
    
import glob
import sys
from TCGA_sample import TCGA_sample
from Focal_amplification import Focal_amplification
from CNV_segment import CNV_segment
from Gene import Gene
import pandas
import numpy
import os
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
# Input locations. Directory paths keep their trailing slash because the
# file paths built from them use plain string concatenation.

# Tab-separated reference table of gene positions.
gene_positions_file = "./Ref/genes_unique.txt"

# Per-data-type directories of the PANCANCER data set.
cnv_directory = "./PANCANCER/CNV/"
focal_directory = "./PANCANCER/FocalOutput/"
rnaseq_directory = "./PANCANCER/RNASeq/"
somatic_directory = "./PANCANCER/SomaticMutations/"
clinical_directory = "./PANCANCER/Clinical/"
    
In [3]:
    
# Build a lookup table of Gene objects keyed by the fifth tab-separated
# column of the gene reference file.
gene_positions = dict()
# The `with` block closes the reference file even if a row fails to parse.
with open(gene_positions_file, "r") as GENES:
    # The first line is read separately and is not parsed as a gene record.
    header = GENES.readline()
    body = GENES.readlines()
for line in body:
    # Drop the line terminator so the last column does not carry a "\n".
    line = line.rstrip("\r\n")
    if not line:
        # Blank lines (e.g. a trailing empty line) have no columns to index.
        continue
    info = line.split("\t")
    tmp_gene = Gene(info[4], info[0], info[1], info[2], info[3])
    gene_positions[info[4]] = tmp_gene
    
We can only use samples for which Focal Amplification data (and hence CNV data) are available.
In [4]:
    
# Create one TCGA_sample per focal-amplification CSV file; the resulting
# `samples` dict (sample ID -> TCGA_sample) is what the other loading
# steps attach their data to.
print("Loading Focal Amplification data")
samples = {}
files = glob.glob(focal_directory + "*.csv")
for focal in files:
    # The first 16 characters of the file name serve as the sample ID.
    sample_id = os.path.basename(focal)[:16]
    if sample_id not in TCGA_sample.sample_ids:
        sample = TCGA_sample(sample_id)
        sample.loadFocalOutput(focal)
        samples[sample_id] = sample
    else:
        # IDs already registered on TCGA_sample are not loaded a second time.
        print("Sample ID already exists")
    
    
In [5]:
    
# Attach RNASeq data to the samples that were created from focal
# amplification output; RNASeq files without a matching sample are ignored.
print("Loading RNASeq data")
files = glob.glob(rnaseq_directory+"TCGA*.txt")
for rna_file in files:
    # The first 16 characters of the file name serve as the sample ID.
    sample_id = os.path.basename(rna_file)[:16]
    # Test membership on the dict itself: `samples.keys()` would build and
    # linearly scan a list for every file under Python 2.
    if sample_id in samples:
        samples[sample_id].loadRNASeq(rna_file)
    
    
In [ ]:
    
# Collect RNASeq data of control samples into `controls`
# (sample ID -> TCGA_sample), independently of the `samples` table.
print("Loading RNASeq data of controls")
controls = dict()
files = glob.glob(rnaseq_directory+"TCGA*.txt")
for rna_file in files:
    # The ID must come from the RNASeq file being processed. Using the
    # leftover `focal` variable of the focal-amplification loop would give
    # every file the same ID.
    sample_id = os.path.basename(rna_file)[:16]
    # A '1' at index 13 marks the file as a control.
    # NOTE(review): presumably the first digit of the TCGA barcode sample-type
    # code (10-19 = normal) -- confirm against the barcode specification.
    # Slicing instead of indexing keeps short file names from raising IndexError.
    if sample_id[13:14] == '1':
        control = TCGA_sample(sample_id)
        control.loadRNASeq(rna_file)
        controls[sample_id] = control
    
In [6]:
    
# Attach CNV data to the already-created samples; CNV files without a
# matching sample are ignored.
print("Loading CNV Data")
files = glob.glob(cnv_directory+"TCGA*.txt")
for cnv_file in files:
    # The first 16 characters of the file name serve as the sample ID.
    sample_id = os.path.basename(cnv_file)[:16]
    # Test membership on the dict itself: `samples.keys()` would build and
    # linearly scan a list for every file under Python 2.
    if sample_id in samples:
        samples[sample_id].loadCNVData(cnv_file)
    
    
In [7]:
    
# Attach somatic mutation (MAF) data to the already-created samples and
# count the MAF files for which no sample exists.
print("Loading Somatic Mutation Data")
files = glob.glob(somatic_directory+"TCGA*.maf.txt")
somatic_not_found = 0
for maf_file in files:
    # Only the first 15 characters of the MAF file name are used; the 16th
    # character of the sample ID is tried as "A" first, then as "B".
    base_id = os.path.basename(maf_file)[:15]
    for suffix in ("A", "B"):
        sample_id = base_id + suffix
        # Direct dict membership avoids building a key list per lookup.
        if sample_id in samples:
            samples[sample_id].loadSomaticMutation(maf_file)
            break
    else:
        # Neither candidate ID is present in `samples`.
        somatic_not_found += 1
print(str(somatic_not_found)+" samples have somatic but no CNV data")
    
    
In [8]:
    
# Attach clinical data to every sample for which a clinical file exists.
print("Loading Clinical Data")
for sample_id, sample in samples.items():
    # Clinical files are named after the first 12 characters of the sample
    # ID, so samples sharing that prefix read the same file.
    clinical_file = clinical_directory + sample_id[:12] + ".txt"
    if not os.path.isfile(clinical_file):
        continue
    sample.loadClinicalData(clinical_file)
    
    
In [9]:
    
# Summarise, per data type, how many samples have that data attached.
# A data type counts as present when the sample attribute is truthy.
loaded_samples = list(samples.values())
focal_samples = sum(1 for s in loaded_samples if s.focal_amplification_data)
cnv_samples = sum(1 for s in loaded_samples if s.CNV_data)
rnaseq_samples = sum(1 for s in loaded_samples if s.rnaseq_data)
somatic_mutation_samples = sum(1 for s in loaded_samples if s.somatic_mutation_data)
clinical_samples = sum(1 for s in loaded_samples if s.clinical)
sample_count = len(samples)

print("Samples: " + str(sample_count))
# Labels are padded to a common width so the counts line up in the output.
report_rows = (
    ("  --Focal Data: ", focal_samples),
    ("  --CNV:        ", cnv_samples),
    ("  --RNASeq:     ", rnaseq_samples),
    ("  --Somatic:    ", somatic_mutation_samples),
    ("  --Clinical:   ", clinical_samples),
)
for label, count in report_rows:
    print(label + str(count))
    
    
In [ ]: