One very important step to note is that we are adjusting each patient's beta-values by the expected value given their cell composition. We are including this step to reduce the variablility in methylation levels that is purely resulting from differing cellular compositions. We are preforming this step after quantile normalization, but before the BMIQ step. This could be done in other sections of the pipeline but should not have too big of an impact on downstream results.
In [1]:
cd /cellar/users/agross/TCGA_Code/Methlation/
In [2]:
import NotebookImport
from Setup.Imports import *
Read in quantile-normalized beta values and cell counts from the MINFI pipeline.
In [3]:
betas = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'betas')
betas = betas['s1']
betas = betas.groupby(axis=0, level=0).first()
cell_counts = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'cell_counts')
cell_counts = cell_counts.groupby(level=0).first()
cell_counts = cell_counts.ix[betas.columns]
Read in cell compositions from probe annotations.
In [4]:
flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
flow_sorted_data = flow_sorted_data.groupby(level=0).first()
flow_sorted_data = flow_sorted_data.ix[betas.index]
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')
In [5]:
n2 = flow_sorted_data.groupby(cell_type, axis=1).mean()
avg = n2[cell_counts.columns].dot(cell_counts.T)
Adjust data to account for differences in cellular composition.
In [6]:
cc = avg.mean(1)
adj = (betas - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=1)
Here s1 is the Hannum data, which we used to train our model. We are normalizeing all of our data with the median value of the Hannum cohort as our reference for each probe.
In [7]:
gold_standard_ah = adj.median(1)
In [ ]:
store = pd.HDFStore(HDFS_DIR + 'methylation_norm.h5')
store['Hannum_gold_standard'] = gold_standard_ah
store.append('Hannum_adj', adj)
store.create_table_index('Hannum_adj', optlevel=9, kind='full')
In [14]:
betas = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'betas')
betas = betas['s3']
betas = betas.groupby(axis=0, level=0).first()
cell_counts = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'cell_counts')
cell_counts = cell_counts.groupby(level=0).first()
cell_counts = cell_counts.ix[betas.columns]
In [15]:
flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
flow_sorted_data = flow_sorted_data.groupby(level=0).first()
flow_sorted_data = flow_sorted_data.ix[betas.index]
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')
In [16]:
n2 = flow_sorted_data.groupby(cell_type, axis=1).mean()
avg = n2[cell_counts.columns].dot(cell_counts.T)
In [17]:
cc = avg.mean(1)
adj = (betas - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=1)
In [19]:
store.append('EPIC_adj', adj)
store.create_table_index('EPIC_adj', optlevel=9, kind='full')
In [20]:
betas = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'betas')
betas = betas['s2']
betas = betas.groupby(axis=0, level=0).first()
cell_counts = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'cell_counts')
cell_counts = cell_counts.groupby(level=0).first()
cell_counts = cell_counts.ix[betas.columns]
In [21]:
flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
flow_sorted_data = flow_sorted_data.groupby(level=0).first()
flow_sorted_data = flow_sorted_data.ix[betas.index]
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')
In [22]:
n2 = flow_sorted_data.groupby(cell_type, axis=1).mean()
avg = n2[cell_counts.columns].dot(cell_counts.T)
In [23]:
cc = avg.mean(1)
adj = (betas - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=1)
In [24]:
store.append('HIV_adj', adj)
store.create_table_index('HIV_adj', optlevel=9, kind='full')
In [25]:
del betas, adj, avg
In [33]:
s1 = pd.read_hdf(HDFS_DIR + 'methylation_norm.h5', 'Hannum_adj')
s2 = pd.read_hdf(HDFS_DIR + 'methylation_norm.h5', 'HIV_adj')
s3 = pd.read_hdf(HDFS_DIR + 'methylation_norm.h5', 'EPIC_adj')
In [34]:
sc = pd.concat([s1, s2, s3], axis=1)
In [36]:
del s1, s2, s3
In [40]:
store.append('betas_adj', sc)
store.create_table_index('betas_adj', optlevel=9, kind='full')
In [41]:
store.close()