Dump Methylation Data into an HDF Store

  • Should allow for much faster IO and quick local queries
  • I'm putting this on my computer's local disk, as opposed to the network, to limit network traffic with this big file

In [1]:
cd ..


/cellar/users/agross/TCGA_Code/DX/Notebooks

In [2]:
import glob
import os as os
import pandas as pd

In [3]:
import NotebookImport 
from Global_Parameters import *


importing IPython notebook from Global_Parameters

In [4]:
import Data.Firehose as FH
from Helpers.Pandas import *

In [5]:
# Prefix of every data path: the 'stddata/' directory under FIREHOSE_PATH.
# (FIREHOSE_PATH is not defined in this notebook; presumably it comes from the
# Global_Parameters star import — confirm.)
p = FIREHOSE_PATH + 'stddata/'
# Path suffix of the humanmethylation450 Level_3 data file; it is appended after
# the single directory level that follows `p` when the files are listed.
ext = ('/methylation/humanmethylation450/jhu_usc_edu/Level_3/'
       'within_bioassay_data_set_function/data/data.txt')

In [6]:
# List the methylation data file of every directory directly under `p`.
# glob is used instead of capturing shell `ls` output: paths are not re-parsed by
# a shell, and when nothing matches the result is an empty list rather than
# whatever text `ls` printed. sorted() keeps the alphabetical order `ls` gave.
fs = sorted(glob.glob(p + '*' + ext))

In [7]:
# Sanity check: how many data files were found (one path per entry of `fs`).
len(fs)


Out[7]:
34

In [8]:
# Path of the HDF5 file that will hold the methylation tables.
# (HDFS_DIRECTORY is not defined in this notebook; presumably it comes from the
# Global_Parameters star import — confirm.)
METH_STORE = HDFS_DIRECTORY + '/TCGA_methylation_T_2015_04_02.h5'

In [9]:
# Open the HDF store at METH_STORE; the cells below write to and read from it
# through `store`, and the final cell closes it.
store = pd.HDFStore(METH_STORE)

In [23]:
%%time 
for f in fs:
    cancer = f.split(p)[1].split('/')[0]
    print 'starting ' + cancer
    ann = pd.read_table(f, nrows=1, index_col=0)
    cols = ti(ann.ix['Composite Element REF'] == 'Beta_value')
    cols = pd.Index(['Hybridization REF']).append(cols)
    df = pd.read_table(f, index_col=0, usecols=cols, skiprows=[1])
    df = FH.fix_barcode_columns(df)
    print 'saving ' + cancer
    store.append(cancer, df)
    store.create_table_index(cancer, optlevel=9, kind='full')


starting ACC
saving ACC
starting BLCA
saving BLCA
starting BRCA
saving BRCA
starting CESC
saving CESC
starting CHOL
saving CHOL
starting COAD
saving COAD
starting DLBC
saving DLBC
starting ESCA
saving ESCA
starting GBM
saving GBM
starting HNSC
saving HNSC
starting KICH
saving KICH
starting KIRC
saving KIRC
starting KIRP
saving KIRP
starting LAML
saving LAML
starting LGG
saving LGG
starting LIHC
saving LIHC
starting LUAD
saving LUAD
starting LUSC
saving LUSC
starting MESO
saving MESO
starting OV
saving OV
starting PAAD
saving PAAD
starting PCPG
saving PCPG
starting PRAD
saving PRAD
starting READ
saving READ
starting SARC
saving SARC
starting SKCM
saving SKCM
starting STAD
saving STAD
starting STES
saving STES
starting TGCT
saving TGCT
starting THCA
saving THCA
starting THYM
saving THYM
starting UCEC
saving UCEC
starting UCS
saving UCS
starting UVM
saving UVM
CPU times: user 26min 3s, sys: 2min 17s, total: 28min 21s
Wall time: 51min 22s

TODO: this needs fixing — FFPE samples are being picked up for BRCA and some other cohorts.


In [10]:
# For every cohort table in the store, keep the columns of patients that have
# both an '01' and an '11' sample (presumably the TCGA sample-type codes for
# primary tumor and solid tissue normal — confirm).
# Result: matched_tn maps store key -> DataFrame restricted to those columns.
matched_tn = {}
for f in store.keys():
    # 'matched_tn' is written into this same store by a later cell; skip it so a
    # re-run does not treat the derived table as another cohort.
    if f.lstrip('/') == 'matched_tn':
        continue
    df = pd.read_hdf(store.filename, f)
    cols = list(df.columns)
    # Set lookup keeps the membership tests O(1) instead of scanning the list
    # once per column.
    col_set = set(cols)
    pts = [c for c in cols
           if (c[0], '01') in col_set and (c[0], '11') in col_set
           and c[1] in ['01', '11']]
    if len(pts) > 0:
        matched_tn[f] = df[pts]
        print('{} : {}'.format(f, len(pts)))
    else:
        print('{} fail'.format(f))


/ACC fail
/BLCA : 42
/BRCA : 180
/CESC : 6
/CHOL : 18
/COAD : 76
/DLBC fail
/ESCA : 32
/GBM : 2
/HNSC : 100
/KICH fail
/KIRC : 320
/KIRP : 90
/LAML fail
/LGG fail
/LIHC : 100
/LUAD : 58
/LUSC : 80
/MESO fail
/OV fail
/PAAD : 20
/PCPG : 6
/PRAD : 100
/READ : 14
/SARC : 8
/SKCM fail
/STAD : 4
/STES : 36
/TGCT fail
/THCA : 112
/THYM : 4
/UCEC : 66
/UCS fail
/UVM fail

In [11]:
# Join the per-cohort matched frames side by side into one wide frame.
# Cohorts are concatenated in sorted key order so the column order does not
# depend on dict iteration order.
# NOTE(review): the recorded counts show STES (36) = ESCA (32) + STAD (4); if STES
# is the union of those two cohorts, its samples appear twice in `mtn` — TODO
# confirm and drop the duplicate cohort if so.
mtn = pd.concat([matched_tn[k] for k in sorted(matched_tn)], axis=1)

In [12]:
# Save the combined matched frame into the same open store under the key
# 'matched_tn', using the table format ('t').
mtn.to_hdf(store, 'matched_tn', format='t')

In [13]:
# Close the HDF store now that all tables have been written.
store.close()