Download data using firehose_get
In [1]:
import os
import glob
import subprocess
import pandas
Use run data of 2015_02_04
For this step, the firehose_get binary from the Broad Institute must be located in the same directory as the notebook files. firehose_get can be downloaded here: https://confluence.broadinstitute.org/display/GDAC/Download
In [2]:
# Download the BRCA SNP6 segmented copy-number archive (germline CNVs removed, hg19)
# from stddata run 2015_02_04. Stdout goes to BRCA.download.log; '>' starts the log
# fresh, the later download cells append to it with '>>'.
!./firehose_get -b -o BRCA.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3 stddata 2015_02_04 BRCA > BRCA.download.log
# -p keeps the cell re-runnable: a plain mkdir reports an error once ./BRCA exists.
!mkdir -p ./BRCA
# Unpack the archive into ./BRCA/.
!tar xzf stddata__2015_02_04/BRCA/20150204/gdac.broadinstitute.org_BRCA.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015020400.0.0.tar.gz -C ./BRCA/
# Rename the extracted directory to the short name BRCA/CNV used by the following cells.
# NOTE: if BRCA/CNV already exists (e.g. on a re-run), mv moves the directory into it
# instead of renaming it.
!mv BRCA/gdac.broadinstitute.org_BRCA.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015020400.0.0 BRCA/CNV
In [3]:
# Split the merged segment file into one tab-separated file per sample.
# The first column of every row holds the sample name; it is dropped from the
# per-sample files, which are written to BRCA/CNV/<sample>.txt.
cnv_file = "BRCA/CNV/BRCA.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt"

actual_sample = ""
SAMPLE = None
# Samples already written in this run; a sample that shows up again later in the
# input is appended to instead of being truncated.
written_samples = set()

with open(cnv_file, "r") as CNV_INPUT:
    header = CNV_INPUT.readline()
    # Header for the per-sample files: the original header without the sample column.
    sample_header = '\t'.join(header.split()[1:]) + "\n"
    try:
        for line in CNV_INPUT:
            info = line.split()
            if not info:
                # Blank line: nothing to write, and info[0] would raise IndexError.
                continue
            if info[0] != actual_sample:
                # A new sample starts: close the previous output and open the next one.
                actual_sample = info[0]
                if SAMPLE is not None:
                    SAMPLE.close()
                if actual_sample in written_samples:
                    SAMPLE = open("BRCA/CNV/" + actual_sample + ".txt", "a")
                else:
                    SAMPLE = open("BRCA/CNV/" + actual_sample + ".txt", "w")
                    SAMPLE.write(sample_header)
                    written_samples.add(actual_sample)
            SAMPLE.write('\t'.join(info[1:]) + "\n")
    finally:
        # The last sample file must be closed too, otherwise its buffered rows
        # may never be flushed to disk.
        if SAMPLE is not None:
            SAMPLE.close()
Samples where CNV subtraction removed an entire chromosome will throw an error here.
In [4]:
# Run the focal amplification R script on every per-sample tumor CNV file.
# -p keeps the cell re-runnable: a plain mkdir reports an error once the directory exists.
!mkdir -p BRCA/FocalOutput
file_list = glob.glob("BRCA/CNV/TCGA*.txt")
for input_file in file_list:
    filename = os.path.basename(input_file)
    # only use tumor files specified in the Barcode by TCGA-xx-xxxx-0xx-xxx-xxxxx-xx
    # (character 13 is the first digit of the sample-type field); the length check
    # skips names too short to carry that field instead of raising IndexError.
    if len(filename) > 13 and filename[13] == '0':
        # The script is fed to R on stdin; the arguments after --args are the input file,
        # the output path, and two further values (Breast, 100) consumed by the script.
        # R's stdout is redirected to the file tmp, which is overwritten on every iteration.
        !cat FocalAmplifications_fromSNPArray_noChrY.R | R --slave --args $input_file BRCA/FocalOutput/$filename Breast 100 > tmp
In [5]:
# Run firehose_get for BRCA, stddata run 2015_02_04, passing RSEM_genes_normalized to -o
# (presumably a filter that restricts the download to the matching archives; confirm
# against the firehose_get usage text). Stdout is appended to BRCA.download.log.
!./firehose_get -b -o RSEM_genes_normalized stddata 2015_02_04 BRCA >> BRCA.download.log
In [6]:
# Unpack the downloaded RSEM_genes_normalized archive into ./BRCA/.
!tar xzf stddata__2015_02_04/BRCA/20150204/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015020400.0.0.tar.gz -C ./BRCA/
# Rename the extracted directory to BRCA/RNASeq, where the expression table is read from.
# NOTE: if BRCA/RNASeq already exists (e.g. on a re-run), mv moves the directory into it
# instead of renaming it.
!mv BRCA/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015020400.0.0 BRCA/RNASeq
In [7]:
# Export the merged expression table as one two-column file per sample.
# header=0 takes the column names from the first line; skiprows=[1] drops the second
# line of the file (presumably a per-column sub-header; confirm against the file).
expression_data = pandas.read_csv("BRCA/RNASeq/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt", header=0, skiprows=[1], sep="\t")

# The gene identifiers are the same for every sample, so fetch them once instead of
# doing a label lookup per row.
gene_ids = expression_data['Hybridization REF'].values

for column in expression_data.columns:
    if column == "Hybridization REF":
        # Identifier column, not a sample.
        continue
    sample_values = expression_data[column].values
    # 'with' guarantees the file is closed even if a write fails part-way through.
    with open("BRCA/RNASeq/"+column+".txt", "w") as SAMPLE:
        SAMPLE.write("Gene\tRSEM normalized\n")
        for gene, value in zip(gene_ids, sample_values):
            SAMPLE.write(str(gene)+"\t"+str(value)+"\n")
In [8]:
# Run firehose_get for BRCA, stddata run 2015_02_04, passing Mutation_Packager_Calls to -o
# (presumably a filter that restricts the download to the matching archives; confirm
# against the firehose_get usage text). Stdout is appended to BRCA.download.log.
!./firehose_get -b -o Mutation_Packager_Calls stddata 2015_02_04 BRCA >> BRCA.download.log
In [9]:
# Unpack the downloaded Mutation_Packager_Calls archive into ./BRCA/.
!tar xzf stddata__2015_02_04/BRCA/20150204/gdac.broadinstitute.org_BRCA.Mutation_Packager_Calls.Level_3.2015020400.0.0.tar.gz -C ./BRCA/
# Rename the extracted directory to BRCA/SomaticMutations.
# NOTE: if BRCA/SomaticMutations already exists (e.g. on a re-run), mv moves the
# directory into it instead of renaming it.
!mv BRCA/gdac.broadinstitute.org_BRCA.Mutation_Packager_Calls.Level_3.2015020400.0.0 BRCA/SomaticMutations
In [10]:
# Run firehose_get for BRCA, stddata run 2015_02_04, passing Clinical to -o
# (presumably a filter that restricts the download to the matching archives; confirm
# against the firehose_get usage text). Stdout is appended to BRCA.download.log.
!./firehose_get -b -o Clinical stddata 2015_02_04 BRCA >> BRCA.download.log
In [11]:
# Unpack both clinical archives (Merge_Clinical Level_1 and Clinical_Pick_Tier1 Level_4)
# into ./BRCA/.
!tar xzf stddata__2015_02_04/BRCA/20150204/gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2015020400.0.0.tar.gz -C ./BRCA/
!tar xzf stddata__2015_02_04/BRCA/20150204/gdac.broadinstitute.org_BRCA.Clinical_Pick_Tier1.Level_4.2015020400.0.0.tar.gz -C ./BRCA/
# Order matters here: the first mv renames the Merge_Clinical directory to BRCA/Clinical.
# Because BRCA/Clinical then exists, the second mv moves the Clinical_Pick_Tier1
# directory into it rather than renaming it.
!mv BRCA/gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2015020400.0.0 BRCA/Clinical
!mv BRCA/gdac.broadinstitute.org_BRCA.Clinical_Pick_Tier1.Level_4.2015020400.0.0 BRCA/Clinical
# Lift the picked clinical table out of its sub-directory so it sits directly in
# BRCA/Clinical/, where the export cell reads it.
!mv BRCA/Clinical/gdac.broadinstitute.org_BRCA.Clinical_Pick_Tier1.Level_4.2015020400.0.0/BRCA.clin.merged.picked.txt BRCA/Clinical/BRCA.clin.merged.picked.txt
In [12]:
# Export the clinical tables as one file per column of the merged table
# (BRCA/Clinical/<COLUMN NAME IN UPPER CASE>.txt). Each file lists "<row label>\t<value>"
# lines, first for every row of the picked table, then for every row of the merged table.
# header=21 uses line 21 (0-based) of the merged file as the column header.
# NOTE(review): 21 is tied to the layout of this particular file; confirm when
# switching to a different run.
clinical_expand_data = pandas.read_csv("BRCA/Clinical/BRCA.clin.merged.txt", header=21, index_col=0, sep="\t")
clinical_picked_data = pandas.read_csv("BRCA/Clinical/BRCA.clin.merged.picked.txt", header=0, index_col=0, sep="\t")

for column in clinical_expand_data.columns:
    # 'with' guarantees the file is closed even if a lookup below raises.
    with open("BRCA/Clinical/"+str(column).upper()+".txt","w") as SAMPLE:
        # Picked rows are written before the merged rows.
        for clinical_table in (clinical_picked_data, clinical_expand_data):
            for index in clinical_table.index:
                # List-based .loc always yields a 2-D frame; [0,0] takes the first
                # match when a row label occurs more than once.
                value = clinical_table.loc[[index],[column]].values[0,0]
                SAMPLE.write(index+"\t"+str(value)+"\n")
In [12]: