Here we run the data normalization pipeline following Steve Horvath's method. Briefly he uses a variant of BMIQ which normalizes all of the data to a gold standard. His method only uses a subset of the probes, so we can throw out the majority of the data here.
Unlike the normalization with the full data I do not yet do an adjustment for cellular composition as this was not in Horvath's pipeline. Rather, I do the adjustment after the normalization.
In [1]:
cd ..
In [2]:
import NotebookImport
from Setup.Imports import *
In [3]:
import os as os
import pandas as pd
from pandas.rpy.common import convert_to_r_dataframe, convert_robj
import rpy2.robjects as robjects
from IPython.display import clear_output
Load Horvath normalization source into R namespace.
In [4]:
robjects.r.library('WGCNA');
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R")
clear_output()
Read in Betas
In [5]:
path = '/cellar/users/agross/TCGA_Code/Methlation/data/'
f = path + 'all_betas_raw.csv'
df = pd.read_csv(f, low_memory=True, header=0, index_col=0)
In [6]:
labels = pd.read_csv(path + 'all_betas_raw_pdata.csv',
index_col=0)
Normalization Step
In [7]:
gold_standard = pd.read_csv('/cellar/users/agross/Data/MethylationAge/Horvath/probeAnnotation21kdatMethUsed.csv', index_col=0)
horvath = pd.read_table('/cellar/users/agross/TCGA_Code/Methlation/data/Horvath_Model.csv', index_col=0, skiprows=[0,1])
intercept = horvath.CoefficientTraining['(Intercept)']
horvath = horvath.iloc[1:]
In [8]:
df = df.ix[gold_standard.index]
df = df.T.fillna(gold_standard.goldstandard2).T
In [9]:
df_r = robjects.r.t(convert_to_r_dataframe(df))
gs = list(gold_standard.goldstandard2.ix[df.index])
gs_r = robjects.FloatVector(gs)
In [ ]:
del df
In [ ]:
data_n = robjects.r.BMIQcalibration(df_r, gs_r)
data_n = convert_robj(data_n).T
clear_output()
Now we need to fix the labels a little bit.
In [ ]:
c1 = pd.read_excel(ucsd_path + 'DESIGN_Fox_v2_Samples-ChipLAyout-Clinical UNMC-UCSD methylomestudy.xlsx',
'HIV- samples from OldStudy', index_col=0)
c2 = pd.read_excel(ucsd_path + 'DESIGN_Fox_v2_Samples-ChipLAyout-Clinical UNMC-UCSD methylomestudy.xlsx',
'HIV+ samples', index_col=0)
clinical = c1.append(c2)
In [ ]:
s = labels[labels.studyIndex == 's2'].sampleNames
ss = clinical[['Sample_Plate','Sample_Well']].sort(['Sample_Plate','Sample_Well'])
assert(alltrue(ss.Sample_Well == s))
In [ ]:
new_label = clinical.sort(['Sample_Plate','Sample_Well']).index
new_label = pd.Series(new_label, s.index)
new_names = labels['sampleNames'].replace(new_label.to_dict())
new_labels = labels['sampleNames'].ix[new_label.index] = new_label
new_labels = new_labels.combine_first(labels.sampleNames)
In [ ]:
labels['sampleNames'] = new_labels
new_labels2 = labels[labels.studyIndex == 's3'].sampleNames
new_labels2 = new_labels2.map(lambda s: '_'.join(s.split('_')[1:]))
new_labels2 = new_labels2.combine_first(new_labels)
labels['sampleNames'] = new_labels2
In [ ]:
data_n.columns = list(new_labels2)
data_n = data_n.astype(float)
data_n.to_hdf(HDFS_DIR + 'methylation_norm.h5','BMIQ_Horvath')
In [ ]:
labels.to_hdf(HDFS_DIR + 'methylation_norm.h5','labels')