I have saved the detection p-values in .csv files in the MINFI processing pipeline. Here I am just converting those files into HDF5 to make the data a bit easier to read in and manipulate.
For now I am also saving these in a compressed (sparse, long-format) form, as most of the p-values are 0: zeros are replaced with NaN and dropped when each table is stacked.
In [1]:
    
# Root directory of the project. The data/ paths read in the cells below are
# built by appending to this string, so the trailing slash is required.
# NOTE(review): 'Methlation' looks like a misspelling of 'Methylation' --
# confirm against the directory on disk before changing it.
PATH = '/cellar/users/agross/TCGA_Code/Methlation/'
    
In [2]:
    
cd $PATH
    
    
In [3]:
    
import NotebookImport
from Setup.Imports import *
    
    
    
    
Epic Data
In [5]:
    
# Load the EPIC-Italy detection p-value table; the first CSV column becomes
# the row index.
epic = pd.read_csv(PATH + 'data/EPIC_ITALY/detectionP.csv',
                   index_col=0)
# Sample sheet for this cohort, read entirely as strings. It is not used to
# relabel the columns in this cell.
# NOTE(review): kept so the notebook-level name pData is still defined after
# this cell runs -- confirm nothing depends on it before removing the read.
pData = pd.read_csv(PATH + 'data/EPIC_ITALY/pData.csv',
                    dtype='str', index_col=0)
# Strip everything up to and including the first underscore of each column
# label (presumably a prefix in front of the sample identifier -- TODO confirm).
epic.columns = epic.columns.map(lambda s: '_'.join(s.split('_')[1:]))
# Store only the non-zero p-values: zeros become NaN, the table is reshaped
# to a long (row label, column label) -> value Series, and missing entries
# are dropped. dropna() is explicit because whether stack() discards NaN on
# its own depends on the pandas version.
epic = epic.replace(0, nan)
epic = epic.stack().dropna()
    
    
Hannum
In [6]:
    
# Load the Hannum detection p-value table; the first CSV column becomes the
# row index.
hannum = pd.read_csv(PATH + 'data/Hannum/detectionP.csv',
                     index_col=0)
# Sample sheet for this cohort, read entirely as strings and indexed by its
# first column.
pData = pd.read_csv(PATH + 'data/Hannum/pData.csv',
                    dtype='str', index_col=0)
# Relabel each column with the Sample_Name found under that column's label in
# the sample sheet. A label missing from the sheet raises KeyError rather
# than being silently skipped.
hannum.columns = hannum.columns.map(lambda s: pData.Sample_Name[s])
# Store only the non-zero p-values: zeros become NaN, the table is reshaped
# to a long (row label, column label) -> value Series, and missing entries
# are dropped. dropna() is explicit because whether stack() discards NaN on
# its own depends on the pandas version.
hannum = hannum.replace(0, nan)
hannum = hannum.stack().dropna()
    
UCSD
In [7]:
    
# Load the UCSD detection p-value table; the first CSV column becomes the
# row index.
ucsd = pd.read_csv(PATH + 'data/UCSD_Methylation/detectionP.csv',
                   index_col=0)
# Sample sheet for this cohort (dtypes are inferred here, unlike the other
# cohorts, which read their sheets as strings).
p = pd.read_csv(PATH + 'data/UCSD_Methylation/pData.csv',
                index_col=0)
# Relabel the columns with the sheet's Sample_Name values. This assignment is
# positional: the i-th column receives the i-th Sample_Name.
# NOTE(review): assumes the sample sheet rows are in the same order as the
# detectionP columns -- confirm, since nothing here checks it.
ucsd.columns = p.Sample_Name
# Store only the non-zero p-values: zeros become NaN, the table is reshaped
# to a long (row label, column label) -> value Series, and missing entries
# are dropped. dropna() is explicit because whether stack() discards NaN on
# its own depends on the pandas version.
ucsd = ucsd.replace(0, nan)
ucsd = ucsd.stack().dropna()
    
In [8]:
    
# Append the three long-format cohorts end to end: UCSD, then Hannum, then
# EPIC.
detection_p = pd.concat(objs=[ucsd, hannum, epic], axis=0)
    
In [9]:
    
# Move the index levels into ordinary columns so the result is a flat table
# with a fresh integer index.
detection_p = detection_p.reset_index(drop=False)
    
In [10]:
    
# Write the combined table to the HDF5 store under the key 'detection_p'.
# HDFS_DIR is not defined in this notebook -- presumably it comes from the
# Setup.Imports star import (TODO confirm).
# The key is passed by keyword because recent pandas releases deprecate
# positional arguments to to_hdf other than the path.
detection_p.to_hdf(HDFS_DIR + 'dx_methylation.h5', key='detection_p')