In [1]:
# IPython automagic for %pwd: shows the kernel's current working directory.
pwd
Out[1]:
Storing expression data in .csv files is great for portability but has the disadvantage of being very slow to read into memory. Here I go through the most recent Firehose run, read in the expression data for each cancer, and save each dataset in HDF5 format. This should make working with this data in pandas much easier.
In [2]:
import os as os
import pandas as pd
In [3]:
import Data.Firehose as FH
In [4]:
#path = '/cellar/users/agross/TCGA_Code/TCGA/Data/Firehose__2014_07_15/'
#path = '/cellar/users/agross/TCGA_Code/CancerData/Data/Firehose__2015_04_02/'
In [5]:
# Open the HDF5 store that the cells below write into.
# NOTE(review): RNA_STORE is not assigned in any visible cell -- confirm where
# it is defined before relying on Restart & Run All.
store = pd.HDFStore(RNA_STORE)
In [6]:
# Read the RNA-seq expression frame for every entry found under
# <FIREHOSE_PATH>/stddata and mirror each one into the open HDF5 store.
# NOTE(review): FIREHOSE_PATH is not assigned in any visible cell -- confirm
# where it is defined.
rna = {}
for c in os.listdir(FIREHOSE_PATH + 'stddata'):
    try:
        rna[c] = FH.read_rnaSeq(FIREHOSE_PATH, c, tissue_code='All')
        # HDFStore.keys() yields absolute paths such as '/BRCA', so comparing
        # the bare name against that list never matches and a re-run would
        # append the same rows again. The store's own membership test accepts
        # the key with or without the leading '/'.
        if c not in store:
            store.append(c, rna[c])
            store.create_table_index(c, optlevel=9, kind='full')
    except Exception as e:
        # Best-effort: entries that cannot be read or stored are skipped, but
        # the reason is reported instead of being silently discarded.
        print('%s: %r' % (c, e))
# Join the per-cohort frames column-wise into one expression matrix.
rna_df = pd.concat(list(rna.values()), axis=1)
cols = list(rna_df.columns)
# Set lookups keep the pairing test linear in the number of columns; scanning
# the list for every column made it quadratic.
col_set = set(cols)
# Keep the '01' and '11' columns of every first-level label that has both.
# Presumably these are TCGA sample-type codes (01 = primary tumor,
# 11 = solid tissue normal) -- confirm against FH.read_rnaSeq.
pts = [c for c in cols
       if c[1] in ('01', '11')
       and (c[0], '01') in col_set
       and (c[0], '11') in col_set]
matched_tn = rna_df[pts]
# Collapse columns that share the same (level 0, level 1) label, keeping the
# first non-null value in each row.
matched_tn = matched_tn.groupby(axis=1, level=[0,1]).first()
In [7]:
# Build a lookup from the first level of each frame's own columns (presumably
# the patient barcode) to the key of the `rna` entry it was loaded under.
# Concatenating the dict itself prepends its keys as the outermost column level.
keyed_columns = pd.concat(rna, axis=1).columns
cohort_labels = keyed_columns.get_level_values(0)
inner_labels = keyed_columns.get_level_values(1)
cohort_by_label = pd.Series(cohort_labels, index=inner_labels)
# Drop the 'KIPAN' and 'GBMLGG' entries (presumably merged cohorts whose
# samples also appear under their individual codes -- confirm).
is_excluded = cohort_by_label.isin(['KIPAN', 'GBMLGG'])
# One entry per index label: the first remaining cohort value.
codes = cohort_by_label[~is_excluded].groupby(level=0).first()
codes.name = 'codes'
In [8]:
# Count how many matched tumor/normal labels fall under each cohort code.
# `.ix` is deprecated and was removed in pandas 1.0; `reindex` performs the
# same label lookup (labels absent from `codes` become NaN, which
# value_counts ignores).
matched_labels = matched_tn.columns.get_level_values(0).unique()
codes.reindex(matched_labels).value_counts()
Out[8]:
In [12]:
# Save the `codes` Series into the open HDF5 store under the key 'codes'.
store['codes'] = codes
In [13]:
# Write the matched frame into the same open store under the key 'matched_tn'.
matched_tn.to_hdf(store, 'matched_tn')
In [14]:
# Close the HDF5 store opened earlier in the notebook.
store.close()