In [1]:
cd ..
In [2]:
import glob
import os as os
import pandas as pd
In [3]:
import NotebookImport
from Global_Parameters import *
In [4]:
import Data.Firehose as FH
from Helpers.Pandas import *
In [5]:
# Path of the methylation data table relative to a cohort directory.
ext = ''.join([
    '/methylation/humanmethylation450/jhu_usc_edu/Level_3/',
    'within_bioassay_data_set_function/data/data.txt',
])
# Root directory under which the per-cohort directories are searched.
p = FIREHOSE_PATH + 'stddata/'
In [6]:
# Collect every data file matching <p><one directory name><ext>.
# glob is used instead of shelling out to `ls`: it copes with spaces and
# shell metacharacters in the path, and a pattern that matches nothing gives
# an empty list rather than ls's error text being picked up as a "file name".
# sorted() keeps the processing order stable and alphabetical.
fs = sorted(glob.glob(p + '*' + ext))
In [7]:
# Sanity check: number of data files collected in `fs`.
len(fs)
Out[7]:
In [8]:
# Path of the HDF5 file used as the methylation store in the cells below.
METH_STORE = HDFS_DIRECTORY + '/TCGA_methylation_T_2015_04_02.h5'
In [9]:
# Open the HDF5 store; no mode is passed, so pandas' default open mode applies.
# The handle is shared by the following cells and closed in the last one.
store = pd.HDFStore(METH_STORE)
In [23]:
%%time
for f in fs:
cancer = f.split(p)[1].split('/')[0]
print 'starting ' + cancer
ann = pd.read_table(f, nrows=1, index_col=0)
cols = ti(ann.ix['Composite Element REF'] == 'Beta_value')
cols = pd.Index(['Hybridization REF']).append(cols)
df = pd.read_table(f, index_col=0, usecols=cols, skiprows=[1])
df = FH.fix_barcode_columns(df)
print 'saving ' + cancer
store.append(cancer, df)
store.create_table_index(cancer, optlevel=9, kind='full')
In [10]:
# Store key -> sub-frame restricted to columns whose first element has both
# an '01' and an '11' column in that table.
# NOTE(review): '01'/'11' look like TCGA sample-type codes (primary tumor /
# solid tissue normal) and column keys like (patient, code) pairs - confirm.
matched_tn = {}
for f in store.keys():
    # Skip the combined table this notebook writes at the end; on a re-run
    # against a populated store it would otherwise be treated as a cohort.
    if f.lstrip('/') == 'matched_tn':
        continue

    # Read through the already-open handle instead of re-opening the file by
    # name, which PyTables rejects when the two open modes conflict.
    df = store[f]
    cols = list(df.columns)
    # Set lookup keeps the pairing test linear in the number of columns.
    col_set = set(cols)
    pts = [c for c in cols
           if (c[0], '01') in col_set
           and (c[0], '11') in col_set
           and c[1] in ('01', '11')]

    if len(pts) > 0:
        matched_tn[f] = df[pts]
        print('{} : {}'.format(f, len(pts)))
    else:
        print('{} fail'.format(f))
In [11]:
# Place the per-key matched frames side by side (column-wise) in one frame,
# in the dictionary's own iteration order.
mtn = pd.concat([matched_tn[key] for key in matched_tn], axis=1)
In [12]:
# Write the combined frame into the open store under the key 'matched_tn',
# using the table format ('t'); an existing entry under that key is replaced.
store.put('matched_tn', mtn, format='t')
In [13]:
# Close the HDF5 store that was opened earlier in the notebook.
store.close()