TCGA Data Analysis – Breast Cancer: Import Data

Check for a correlation between MYC-amplified tumor samples and TP53 mutations. MYC amplification is determined in two different ways:

  1. MYC is in a segment having a copynumber > 4 (this likely excludes chromosome-arm level events)
  2. "Our" definition of a focal amplification
    • segment is small (smaller than 20Mbp)
    • segment covering gene is increased (log2-ratio > 0.2)
    • segment is focal (log2-ratio of segment is at least 0.2 above weighted mean of neighbouring 20Mbp)
    • segment does not contain segmental duplications (less than 50%)
    • segment doesn't overlap with a known CNV from the Database of Genomic Variants (i.e. the segment's start and end points are each within 100 Kbp of the variant's start and end points)

Import Packages, Modules and Classes


In [1]:
import glob
import sys
from TCGA_sample import TCGA_sample
from Focal_amplification import Focal_amplification
from CNV_segment import CNV_segment
from Gene import Gene
import pandas
import numpy
import os
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline

Define File names for BRCA samples and genes


In [2]:
# Input locations, all relative to the notebook's working directory.

# Tab-separated gene reference table (read by the gene-loading cell).
gene_positions_file = "./Ref/genes_unique.txt"

# BRCA cohort inputs, one directory per data type. Each value keeps its
# trailing slash because the loading cells build paths by string concatenation.
clinical_directory = "./BRCA/Clinical/"
cnv_directory = "./BRCA/CNV/"
focal_directory = "./BRCA/FocalOutput/"
rnaseq_directory = "./BRCA/RNASeq/"
somatic_directory = "./BRCA/SomaticMutations/"

Load Gene positions


In [3]:
# Build a lookup of gene name -> Gene object from the tab-separated reference
# table. Column 4 is the gene name (used as the first Gene argument and as the
# dictionary key); columns 0-3 are handed to Gene unchanged, in that order.
gene_positions = dict()

# `with` closes the handle even if a line fails to parse (the previous
# open()/readlines() pair never closed it).
with open(gene_positions_file, "r") as GENES:
    header = GENES.readline()  # first line is skipped as the column header
    body = GENES.readlines()

for line in body:
    # Strip the line terminator before splitting; otherwise the last column
    # keeps its "\n", which would end up inside the gene name / dictionary key
    # whenever the name is the final column of the table.
    line = line.rstrip("\r\n")
    if not line:
        # tolerate blank lines (e.g. a trailing empty line at end of file)
        continue
    info = line.split("\t")
    # Lines with fewer than five columns still raise IndexError on purpose:
    # a truncated record indicates a malformed reference file.
    tmp_gene = Gene(info[4], info[0], info[1], info[2], info[3])
    gene_positions[info[4]] = tmp_gene

Define samples and load output of focal amplification calling (our definition)

We can only use samples where Focal Amplification data (hence CNV data) are available


In [4]:
# Create one TCGA_sample per focal-amplification CSV; these samples define the
# cohort that every later loading cell attaches data to.
print("Loading Focal Amplification data")
samples = {}
files = glob.glob(focal_directory + "*.csv")
for focal_csv in files:
    # The first 16 characters of the file name are used as the sample ID
    # (presumably the TCGA barcode up to the vial letter -- confirm).
    sample_id = os.path.basename(focal_csv)[:16]
    if sample_id not in TCGA_sample.sample_ids:
        sample = TCGA_sample(sample_id)
        sample.loadFocalOutput(focal_csv)
        samples[sample_id] = sample
    else:
        # ID is already present in the class-level registry: skip this file
        print("Sample ID already exists")


Loading Focal Amplification data

Load RNASeq Data


In [5]:
# Attach RNASeq files to the samples created from the focal-amplification
# output; files whose sample ID is not in `samples` are ignored.
print("Loading RNASeq data")
files = glob.glob(rnaseq_directory + "TCGA*.txt")
for rna_path in files:
    sample_id = os.path.basename(rna_path)[:16]
    if sample_id not in samples:
        continue
    samples[sample_id].loadRNASeq(rna_path)


Loading RNASeq data

Load RNASeq Data of controls


In [ ]:
# Collect control samples from the same RNASeq directory into their own dict.
print("Loading RNASeq data of controls")
controls = {}
files = glob.glob(rnaseq_directory + "TCGA*.txt")
for rna_path in files:
    sample_id = os.path.basename(rna_path)[:16]
    # Only IDs whose 14th character is '1' are treated as controls
    # (presumably the first digit of the TCGA sample-type code, where
    # 10-19 denote normal tissue -- confirm against the barcode spec).
    if sample_id[13] != '1':
        continue
    control = TCGA_sample(sample_id)
    control.loadRNASeq(rna_path)
    controls[sample_id] = control

Load raw CNV data


In [6]:
# Attach raw CNV files to the already-known samples; files for unknown
# sample IDs are silently ignored.
print("Loading CNV Data")
files = glob.glob(cnv_directory + "TCGA*.txt")
for cnv_path in files:
    sample_id = os.path.basename(cnv_path)[:16]
    if sample_id not in samples:
        continue
    samples[sample_id].loadCNVData(cnv_path)


Loading CNV Data

Load Somatic Mutation calls


In [20]:
# Attach somatic-mutation MAF files to the known samples. Only the first 15
# characters of the MAF file name are used, so the final ID character is
# tried as "A" first and then as "B".
print("Loading Somatic Mutation Data")
files = glob.glob(somatic_directory + "TCGA*.maf.txt")
for maf_path in files:
    id_stem = os.path.basename(maf_path)[:15]
    sample_id = id_stem + "A"
    alternate_id = id_stem + "B"
    if sample_id in samples:
        samples[sample_id].loadSomaticMutation(maf_path)
    elif alternate_id in samples:
        samples[alternate_id].loadSomaticMutation(maf_path)
    else:
        # neither variant is part of the cohort; report the "A" form
        print(sample_id + " not found")


Loading Somatic Mutation Data
TCGA-A8-A07C-01A not found
TCGA-BH-A0HN-01A not found
TCGA-BH-A0B8-01A not found
TCGA-BH-A0HF-01A not found
TCGA-A7-A4SC-01A not found
TCGA-AR-A0TU-01A not found
TCGA-AR-A1AT-01A not found
TCGA-E2-A1LS-01A not found
TCGA-A2-A0CZ-01A not found
TCGA-BH-A0HL-01A not found
TCGA-AN-A0G0-01A not found
TCGA-BH-A0B1-01A not found
TCGA-B6-A0I8-01A not found
TCGA-B6-A0I6-01A not found

Load Clinical Data from Biotab File


In [5]:
# Attach clinical data where a matching file exists. Clinical files are named
# after the first 12 characters of the sample ID; samples without such a file
# are left untouched.
print("Loading Clinical Data")
for sample_id, tcga_sample in samples.items():
    clinical_file = clinical_directory + sample_id[:12] + ".txt"
    if not os.path.isfile(clinical_file):
        continue
    tcga_sample.loadClinicalData(clinical_file)


Loading Clinical Data

Check if loading worked and get statistics


In [6]:
# Sanity check: count how many samples carry each kind of data. A sample
# counts towards a category when the corresponding attribute is truthy.
focal_samples = cnv_samples = rnaseq_samples = 0
somatic_mutation_samples = clinical_samples = 0

for sample in samples.values():
    focal_samples += 1 if sample.focal_amplification_data else 0
    cnv_samples += 1 if sample.CNV_data else 0
    rnaseq_samples += 1 if sample.rnaseq_data else 0
    somatic_mutation_samples += 1 if sample.somatic_mutation_data else 0
    clinical_samples += 1 if sample.clinical else 0

sample_count = len(samples)

# Report: total first, then one line per data type.
report_rows = [
    ("Samples: ", sample_count),
    ("  --Focal Data: ", focal_samples),
    ("  --CNV:        ", cnv_samples),
    ("  --RNASeq:     ", rnaseq_samples),
    ("  --Somatic:    ", somatic_mutation_samples),
    ("  --Clinical:   ", clinical_samples),
]
for row_label, row_count in report_rows:
    print(row_label + str(row_count))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-6-749e859ce8f6> in <module>()
     16     if sample.clinical:
     17         clinical_samples += 1
---> 18         if sample.daystodeath != "nan":
     19             daystodeath += 1
     20 sample_count = len(samples)

AttributeError: TCGA_sample instance has no attribute 'daystodeath'

In [ ]: