Initialize mRNA Storage Structures


In [1]:
# Show the directory the notebook kernel is running from.
pwd


Out[1]:
u'/cellar/users/agross/TCGA_Code/DX/Notebooks/Preprocessing'

Storing expression data in .csv files is great for portability, but such files are very slow to read into memory. Here I go through the most recent Firehose run, read in the expression data for each cancer type, and save each table in HDF5 format. This should make it much easier to work with the data in pandas.


In [2]:
import os as os
import pandas as pd

In [3]:
import Data.Firehose as FH

In [4]:
# NOTE(review): FIREHOSE_PATH and RNA_STORE are used by the cells below but are
# not assigned in any cell shown in this notebook, so a fresh kernel needs them
# supplied from elsewhere. The commented-out paths here are presumably earlier
# Firehose data roots -- confirm where the live values are defined.
#path = '/cellar/users/agross/TCGA_Code/TCGA/Data/Firehose__2014_07_15/'
#path = '/cellar/users/agross/TCGA_Code/CancerData/Data/Firehose__2015_04_02/'

In [5]:
# Open the HDF5 store that the following cells write to. It stays open until
# the final store.close() cell.
# NOTE(review): RNA_STORE is not assigned in any visible cell -- confirm where
# it is defined before running on a fresh kernel.
store = pd.HDFStore(RNA_STORE)

In [6]:
# Read the RNA-seq expression table for every entry under <FIREHOSE_PATH>/stddata
# and cache each one in the HDF5 store under the entry's name.
rna = {}
for cohort in os.listdir(FIREHOSE_PATH + 'stddata'):
    try:
        rna[cohort] = FH.read_rnaSeq(FIREHOSE_PATH, cohort, tissue_code='All')
        # Test membership on the store itself: store.keys() returns
        # '/'-prefixed paths (e.g. '/BRCA'), so comparing the bare name against
        # store.keys() never matches and every re-run would append the same
        # rows again. HDFStore.__contains__ accepts the name with or without
        # the leading '/'.
        if cohort not in store:
            store.append(cohort, rna[cohort])
            store.create_table_index(cohort, optlevel=9, kind='full')
    except Exception:
        # Best effort: entries that cannot be read or stored are reported by
        # name and skipped. Catching Exception instead of using a bare except
        # lets KeyboardInterrupt / SystemExit still stop the loop.
        # print(x) with a single argument prints the same text on Python 2 and 3.
        print(cohort)

# Join all successfully loaded tables side by side (samples are columns).
rna_df = pd.concat(list(rna.values()), axis=1)

# Select the columns of patients that have BOTH a '01' and an '11' sample
# (presumably the TCGA sample-type codes for primary tumor and solid-tissue
# normal -- confirm), keeping only those two sample types.
cols = list(rna_df.columns)
# Set lookup keeps each membership test O(1); testing against the list made
# the comprehension quadratic in the number of columns.
col_lookup = set(cols)
pts = [col for col in cols
       if (col[0], '01') in col_lookup
       and (col[0], '11') in col_lookup
       and col[1] in ['01', '11']]

# Columns that share the same (level-0, level-1) label are collapsed to a
# single column holding the first non-null value per row.
matched_tn = rna_df[pts].groupby(axis=1, level=[0,1]).first()


ESCA
STES
FPPP
STAD

In [7]:
# Build a lookup Series: index = level 1 of the concatenated columns
# (presumably the patient barcode -- confirm), value = the key of `rna` that
# the column came from (the cohort name).
keyed_columns = pd.concat(rna, axis=1).columns
cohort_by_patient = pd.Series(keyed_columns.get_level_values(0),
                              keyed_columns.get_level_values(1))

# Drop entries labelled KIPAN / GBMLGG, then keep one cohort label per index
# value (the first one encountered).
in_excluded_cohort = cohort_by_patient.isin(['KIPAN','GBMLGG'])
codes = cohort_by_patient[~in_excluded_cohort].groupby(level=0).first()
codes.name = 'codes'

In [8]:
# Count matched patients per cohort. `.ix` is deprecated and removed in
# pandas >= 1.0; `.reindex` performs the same label lookup (labels missing
# from `codes` become NaN, which value_counts() ignores).
matched_patients = matched_tn.columns.get_level_values(0).unique()
codes.reindex(matched_patients).value_counts()


Out[8]:
BRCA    112
KIRC     72
THCA     59
LUAD     58
PRAD     52
LUSC     51
LIHC     50
HNSC     43
COAD     41
KIRP     32
KICH     25
UCEC     23
BLCA     19
CHOL      9
READ      9
PAAD      4
CESC      3
PCPG      3
THYM      2
SARC      2
dtype: int64

In [12]:
# Persist the patient -> cohort lookup in the HDF5 store under the key 'codes'.
store.put('codes', codes)

In [13]:
# Persist the matched-sample expression table in the same open store under
# the key 'matched_tn'.
store.put('matched_tn', matched_tn)

In [14]:
# Close the HDF5 store opened earlier in the notebook so the file handle is released.
store.close()