Download data using firehose_get
In [12]:
import os
import glob
import subprocess
import pandas
Use run data of 2014_02_15
In [7]:
# Fetch the PANCAN12 segmented copy-number archive (germline CNVs removed, hg19)
# from the 2014_02_15 stddata run. stdout is redirected with `>`, so this call
# creates/overwrites PANCANCER.download.log.
# NOTE(review): -b / -o are presumably firehose_get's batch and "only matching
# tasks" switches -- confirm against the tool's help output.
!./firehose_get -b -o segmented_scna_minus_germline_cnv_hg19__seg.Level_3 stddata 2014_02_15 PANCAN12 > PANCANCER.download.log
In [9]:
# Create the working directory, unpack the downloaded CNV archive into it and
# rename the extracted folder to the short name PANCANCER/CNV that the
# following cells read from.
# NOTE(review): plain `mkdir` reports an error if ./PANCANCER already exists,
# and `mv` would nest the folder inside PANCANCER/CNV on a second run.
!mkdir ./PANCANCER
!tar xzf stddata__2014_02_15/PANCAN12/20140215/gdac.broadinstitute.org_PANCAN12.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2014021500.0.0.tar.gz -C ./PANCANCER/
!mv PANCANCER/gdac.broadinstitute.org_PANCAN12.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2014021500.0.0 PANCANCER/CNV
In [10]:
# Split the merged segment file into one tab-separated file per sample.
# The first whitespace-separated field of every row is used as the sample
# name; it is dropped from the per-sample files, which are written to
# PANCANCER/CNV/<sample>.txt with the remaining header fields on top.
cnv_file = "PANCANCER/CNV/PANCAN12.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt"
actual_sample = ""
SAMPLE = None
with open(cnv_file, "r") as CNV_INPUT:
    header = CNV_INPUT.readline()
    # Header without its first (sample) column, reused for every output file.
    sample_header = '\t'.join(header.split()[1:]) + "\n"
    try:
        # Stream the file instead of loading every line into memory.
        for line in CNV_INPUT:
            info = line.split()
            if not info:
                # Blank line: nothing to write (would otherwise fail on info[0]).
                continue
            if info[0] != actual_sample:
                # A new sample starts: finish the previous file, open the next.
                # Rows are assumed to be grouped by sample; a sample that shows
                # up again later reopens its file in "w" mode and overwrites it.
                actual_sample = info[0]
                if SAMPLE is not None:
                    SAMPLE.close()
                SAMPLE = open("PANCANCER/CNV/" + actual_sample + ".txt", "w")
                SAMPLE.write(sample_header)
            SAMPLE.write('\t'.join(info[1:]) + "\n")
    finally:
        # Close the last sample file so its buffered rows reach the disk.
        if SAMPLE is not None:
            SAMPLE.close()
Samples in which the CNV subtraction removed an entire chromosome will throw an error in the next cell.
In [13]:
!mkdir PANCANCER/FocalOutput
file_list = glob.glob("PANCANCER/CNV/TCGA*.txt")
for input_file in file_list:
filename = os.path.basename(input_file)
#only use tumor files specified in the Barcode by TCGA-xx-xxxx-0xx-xxx-xxxxx-xx
if filename[13] == '0':
!cat FocalAmplifications_fromSNPArray_noChrY.R | R --slave --args $input_file PANCANCER/FocalOutput/$filename Breast 100 > tmp
In [14]:
# Fetch the PANCAN12 RSEM-normalized gene expression archive from the
# 2014_02_15 stddata run; `>>` appends the output to PANCANCER.download.log.
!./firehose_get -b -o RSEM_genes_normalized stddata 2014_02_15 PANCAN12 >> PANCANCER.download.log
In [17]:
# Unpack the RNASeq archive into ./PANCANCER/ and rename the extracted folder
# to the short name PANCANCER/RNASeq used by the next cell.
!tar xzf stddata__2014_02_15/PANCAN12/20140215/gdac.broadinstitute.org_PANCAN12.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2014021500.0.0.tar.gz -C ./PANCANCER/
!mv PANCANCER/gdac.broadinstitute.org_PANCAN12.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2014021500.0.0 PANCANCER/RNASeq
In [18]:
# Split the merged expression matrix into one two-column file per sample.
# Row 0 of the file supplies the column names; the line after it (file row 1)
# is skipped on load. Every column other than "Hybridization REF" is written
# to PANCANCER/RNASeq/<column>.txt as "<Hybridization REF>\t<value>" rows.
expression_data = pandas.read_csv("PANCANCER/RNASeq/PANCAN12.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt", header=0, skiprows=[1], sep="\t")
# The identifier column is the same for every sample, so fetch it once.
gene_ids = expression_data['Hybridization REF']
for column in expression_data.columns:
    if column == "Hybridization REF":
        continue
    # `with` guarantees the file is closed even if a write fails.
    with open("PANCANCER/RNASeq/" + column + ".txt", "w") as SAMPLE:
        SAMPLE.write("Gene\tRSEM normalized\n")
        # Walk both columns in parallel instead of one label lookup per cell.
        for gene, value in zip(gene_ids, expression_data[column]):
            SAMPLE.write(str(gene) + "\t" + str(value) + "\n")
In [19]:
# Fetch the PANCAN12 Mutation_Packager_Calls archive from the 2014_02_15
# stddata run; `>>` appends the output to PANCANCER.download.log.
!./firehose_get -b -o Mutation_Packager_Calls stddata 2014_02_15 PANCAN12 >> PANCANCER.download.log
In [20]:
# Unpack the mutation archive into ./PANCANCER/ and rename the extracted
# folder to PANCANCER/SomaticMutations.
!tar xzf stddata__2014_02_15/PANCAN12/20140215/gdac.broadinstitute.org_PANCAN12.Mutation_Packager_Calls.Level_3.2014021500.0.0.tar.gz -C ./PANCANCER/
!mv PANCANCER/gdac.broadinstitute.org_PANCAN12.Mutation_Packager_Calls.Level_3.2014021500.0.0 PANCANCER/SomaticMutations
In [21]:
# Fetch the PANCAN12 clinical archives from the 2014_02_15 stddata run;
# `>>` appends the output to PANCANCER.download.log.
!./firehose_get -b -o Clinical stddata 2014_02_15 PANCAN12 >> PANCANCER.download.log
In [22]:
# Unpack both clinical archives (Merge_Clinical and Clinical_Pick_Tier1).
!tar xzf stddata__2014_02_15/PANCAN12/20140215/gdac.broadinstitute.org_PANCAN12.Merge_Clinical.Level_1.2014021500.0.0.tar.gz -C ./PANCANCER/
!tar xzf stddata__2014_02_15/PANCAN12/20140215/gdac.broadinstitute.org_PANCAN12.Clinical_Pick_Tier1.Level_4.2014021500.0.0.tar.gz -C ./PANCANCER/
# The first mv renames the Merge_Clinical folder to PANCANCER/Clinical. That
# directory then exists, so the second mv places the Clinical_Pick_Tier1
# folder *inside* it (the path used by the third mv relies on this).
!mv PANCANCER/gdac.broadinstitute.org_PANCAN12.Merge_Clinical.Level_1.2014021500.0.0 PANCANCER/Clinical
!mv PANCANCER/gdac.broadinstitute.org_PANCAN12.Clinical_Pick_Tier1.Level_4.2014021500.0.0 PANCANCER/Clinical
# Lift the picked table up next to the merged one; note that it is renamed
# from PANCAN12.* to PANCANCER.* in the process.
!mv PANCANCER/Clinical/gdac.broadinstitute.org_PANCAN12.Clinical_Pick_Tier1.Level_4.2014021500.0.0/PANCAN12.clin.merged.picked.txt PANCANCER/Clinical/PANCANCER.clin.merged.picked.txt
In [26]:
# Write one clinical file per column of the merged clinical table.
# Both tables are indexed by their first column. For every column of the
# merged ("expand") table, PANCANCER/Clinical/<COLUMN UPPERCASED>.txt receives
# "<row label>\t<value>" lines: first all rows of the picked table, then all
# rows of the merged table.
# NOTE(review): header=37 hard-codes which row of the merged file holds the
# column names -- confirm it still matches if the data release changes.
clinical_expand_data = pandas.read_csv("PANCANCER/Clinical/PANCAN12.clin.merged.txt", header=37, index_col=0, sep="\t")
clinical_picked_data = pandas.read_csv("PANCANCER/Clinical/PANCANCER.clin.merged.picked.txt", header=0, index_col=0, sep="\t")
for column in clinical_expand_data.columns:
    # `with` closes the file even when a lookup below raises, e.g. a KeyError
    # for a column that is missing from the picked table.
    with open("PANCANCER/Clinical/" + str(column).upper() + ".txt", "w") as SAMPLE:
        # Picked values are written before the merged ones, as before.
        for clinical_table in (clinical_picked_data, clinical_expand_data):
            for index in clinical_table.index:
                # List-based .loc always yields a 2-D frame; [0, 0] takes the
                # first match when a row label occurs more than once.
                value = clinical_table.loc[[index], [column]].values[0, 0]
                SAMPLE.write(index + "\t" + str(value) + "\n")
In [ ]: