The detection p-values were saved to .csv files by the MINFI processing pipeline. Here I am just converting those files into HDF5 to make the data a bit easier to read in and manipulate.
For now I am also saving these in a compact long (stacked) form, with zeros replaced by NaN before stacking, as most of the p-values are 0.
In [1]:
# Root directory of the project; the data file paths read in later cells are
# built by appending to this string, so the trailing slash matters.
PATH = '/cellar/users/agross/TCGA_Code/Methlation/'
In [2]:
cd $PATH
In [3]:
import NotebookImport
from Setup.Imports import *
Epic Data
In [5]:
# EPIC-Italy: detection p-value table, first CSV column used as the row index.
epic = pd.read_csv(
    PATH + 'data/EPIC_ITALY/detectionP.csv',
    index_col=0,
)
# Sample sheet with every field read as a string. It is not referenced again
# in this cell.
pData = pd.read_csv(
    PATH + 'data/EPIC_ITALY/pData.csv',
    dtype='str',
    index_col=0,
)
# Keep only what follows the first underscore in each column label.
epic.columns = epic.columns.map(lambda label: label.partition('_')[2])
# Turn zero p-values into NaN, then stack the columns into the index to get
# the long form. NOTE(review): this presumably relies on stack() omitting NaN
# cells, which depends on the installed pandas version -- confirm.
epic = epic.replace(0, nan).stack()
Hannum
In [6]:
# Hannum: detection p-value table, first CSV column used as the row index.
hannum = pd.read_csv(
    PATH + 'data/Hannum/detectionP.csv',
    index_col=0,
)
# Sample sheet with every field read as a string, indexed by its first column.
pData = pd.read_csv(
    PATH + 'data/Hannum/pData.csv',
    dtype='str',
    index_col=0,
)
# Relabel each column with the Sample_Name found under that label in the
# sample sheet's index.
hannum.columns = hannum.columns.map(lambda col: pData['Sample_Name'][col])
# Turn zero p-values into NaN, then stack the columns into the index to get
# the long form. NOTE(review): this presumably relies on stack() omitting NaN
# cells, which depends on the installed pandas version -- confirm.
hannum = hannum.replace(0, nan).stack()
UCSD
In [7]:
# UCSD: detection p-value table, first CSV column used as the row index.
ucsd = pd.read_csv(
    PATH + 'data/UCSD_Methylation/detectionP.csv',
    index_col=0,
)
# Sample sheet, indexed by its first column (dtypes left to pandas here).
p = pd.read_csv(
    PATH + 'data/UCSD_Methylation/pData.csv',
    index_col=0,
)
# Columns are relabelled directly from the Sample_Name column.
# NOTE(review): this assumes the sample sheet rows are in the same order as
# the detectionP columns -- verify, since no label lookup is done here.
ucsd.columns = p['Sample_Name']
# Turn zero p-values into NaN, then stack the columns into the index to get
# the long form. NOTE(review): this presumably relies on stack() omitting NaN
# cells, which depends on the installed pandas version -- confirm.
ucsd = ucsd.replace(0, nan).stack()
In [8]:
# Append the three stacked cohorts end to end, in UCSD, Hannum, EPIC order.
detection_p = pd.concat([ucsd, hannum, epic], axis=0)
In [9]:
# Move the index levels out into ordinary columns, leaving a flat table.
detection_p = detection_p.reset_index(drop=False)
In [10]:
# Write the flat table into the HDF5 file under the 'detection_p' key.
detection_p.to_hdf(HDFS_DIR + 'dx_methylation.h5', key='detection_p')