Initialize mRNA Storage Structures



In [1]:

    
# Show the notebook's current working directory (IPython automagic for %pwd).
pwd









    Out[1]:





u'/cellar/users/agross/TCGA_Code/DX/Notebooks/Preprocessing'

Storing expression data in .csv files is great for portability, but these files are very slow to read into memory. Here I go through the most recent Firehose run, read in the expression data for each cancer, and save each dataset to an HDF5 store. This should make it much easier to work with the data in pandas.



In [2]:

    
import os as os
import pandas as pd



In [3]:

    
import Data.Firehose as FH



In [4]:

    
# NOTE(review): FIREHOSE_PATH and RNA_STORE are used by the cells below but are
# not assigned anywhere in this notebook as shown -- presumably they were left
# over from an earlier session or a removed cell. Define both here so the
# notebook survives a fresh-kernel "Run All" (TODO confirm intended values).
# The commented-out values below look like earlier Firehose download locations.
#path = '/cellar/users/agross/TCGA_Code/TCGA/Data/Firehose__2014_07_15/'
#path = '/cellar/users/agross/TCGA_Code/CancerData/Data/Firehose__2015_04_02/'



In [5]:

    
# Open the HDF5 store at RNA_STORE; 'a' (append, creating the file if it is
# missing) is the pandas default mode, spelled out here for clarity.
store = pd.HDFStore(RNA_STORE, mode='a')



In [6]:

    
# Read the RNA-seq table of every entry listed under <FIREHOSE_PATH>/stddata
# and cache each one in the HDF5 store under that entry's name.
rna = {}
for c in os.listdir(FIREHOSE_PATH + 'stddata'):
    try:
        rna[c] = FH.read_rnaSeq(FIREHOSE_PATH, c, tissue_code='All')
        # HDFStore.keys() yields '/'-prefixed paths (e.g. '/BRCA'), so testing
        # the bare name against it never matches and every re-run would append
        # the same rows again. Store membership accepts the bare name.
        if c not in store:
            store.append(c, rna[c])
            store.create_table_index(c, optlevel=9, kind='full')
    except Exception as err:
        # Best effort: report which entry failed and why, then keep going.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print('{0}: {1!r}'.format(c, err))
# Place the columns of all successfully read tables side by side.
rna_df = pd.concat(list(rna.values()), axis=1)

# Keep only '01'/'11' columns whose level-0 label has BOTH a '01' and a '11'
# column (presumably TCGA tumor and matched-normal tissue codes -- confirm).
cols = list(rna_df.columns)
# Hashed membership: testing against the list made the scan quadratic in the
# number of columns.
col_set = set(cols)
pts = [c for c in cols
       if c[1] in ('01', '11')
       and (c[0], '01') in col_set
       and (c[0], '11') in col_set]
matched_tn = rna_df[pts]
# Collapse columns that share the same (level 0, level 1) label, keeping the
# first value found for each.
matched_tn = matched_tn.groupby(axis=1, level=[0,1]).first()









    



ESCA
STES
FPPP
STAD



In [7]:

    
# Build a lookup from column level 1 to the key of `rna` it was read under.
# Concatenating the dict puts its keys in the outermost column level.
cohort_columns = pd.concat(rna, axis=1).columns
label_to_cohort = pd.Series(cohort_columns.get_level_values(0),
                            cohort_columns.get_level_values(1))
# Drop KIPAN / GBMLGG entries (presumably aggregate cohorts that repeat
# patients of their constituent cancers -- confirm).
from_excluded = label_to_cohort.isin(['KIPAN','GBMLGG'])
label_to_cohort = label_to_cohort[~from_excluded]
# One entry per index label: keep the first cohort seen for each.
codes = label_to_cohort.groupby(level=0).first()
codes.name = 'codes'



In [8]:

    
# Count, per cohort code, the distinct level-0 labels present in matched_tn.
# `.ix` was deprecated and then removed in pandas 1.0; reindex keeps the same
# label-based lookup and, like `.ix`, yields NaN (ignored by value_counts)
# for any label missing from `codes` instead of raising.
matched_labels = matched_tn.columns.get_level_values(0).unique()
codes.reindex(matched_labels).value_counts()









    Out[8]:





BRCA    112
KIRC     72
THCA     59
LUAD     58
PRAD     52
LUSC     51
LIHC     50
HNSC     43
COAD     41
KIRP     32
KICH     25
UCEC     23
BLCA     19
CHOL      9
READ      9
PAAD      4
CESC      3
PCPG      3
THYM      2
SARC      2
dtype: int64



In [12]:

    
# Save the codes series in the open store under the key 'codes'.
store.put('codes', codes)



In [13]:

    
# Write the matched_tn frame into the open store under the key 'matched_tn'.
matched_tn.to_hdf(store, key='matched_tn')



In [14]:

    
# Close the HDF5 store opened earlier in this notebook now that all writes are done.
store.close()