Here we are running BMIQ normalization on all of our quantile-normalized data together. I am using the implementation provided by Steve Horvath along with his recent methylation-age paper. We are doing this as a result of this paper's recomendation to run both quantile normalization and BMIQ in series on the same datasets.
In [182]:
%%file ./sge/run_model.py
#!/cellar/users/agross/anaconda2/bin/ipython
import sys
import pandas as pd
import rpy2.robjects as robjects
from pandas.rpy.common import convert_to_r_dataframe
from pandas.rpy.common import convert_robj
from IPython.display import clear_output
store = sys.argv[1]
table_name = sys.argv[2]
gold_standard_name = sys.argv[3]
batch = sys.argv[4]
print sys.argv
batches = pd.read_csv('/cellar/users/agross/TCGA_Code/Methlation/PreProcessing/batch.csv',
names=['sample','batch'], header=None)
patients = list(batches[batches.batch == 1].sample)
betas = pd.read_hdf(store, table_name, columns=patients)
gold_standard = pd.read_hdf(store, gold_standard_name)
betas = betas.ix[gold_standard.index]
if betas.isnull().sum().sum() > 0:
betas = betas.T.fillna(gold_standard).T
robjects.r.library('WGCNA');
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R");
df_r = robjects.r.t(convert_to_r_dataframe(betas))
gs = list(gold_standard.ix[betas.index])
gs_r = robjects.FloatVector(gs)
del betas
data_n = robjects.r.BMIQcalibration(df_r, gs_r)
data_n = convert_robj(data_n).T
data_n.columns = data_n.columns.map(lambda s: s.replace('.','-'))
data_n.columns = data_n.columns.map(lambda s: s[1:] if s.startswith('X') else s)
print data_n.shape
print store
print '{}_BMIQ_batch_{}'.format(table_name, batch)
data_n.to_hdf('{}_BMIQ.h5'.format(store[:-3]), '{}_batch_{}'.format(table_name, batch))
In [183]:
!chmod 755 ./sge/run_model.py
In [184]:
import pandas as pd
import os as os
In [185]:
store = '/cellar/users/agross/Data/tmp/methylation_norm.h5'
table = 'betas_adj'
gold_standard = 'Hannum_gold_standard'
In [5]:
df = pd.read_hdf(store, table)
In [103]:
del s['betas_adj']
In [107]:
df.to_hdf(store, table, format='t')
In [97]:
s = pd.HDFStore(store)
In [186]:
def generate_sge(njobs, store, table, gold_standard,
script_dir='.', threads=16):
print 'Writing scripts and SGE driver to {}'.format(script_dir)
if not os.path.isdir(script_dir):
os.makedirs(script_dir)
sge = ['#! /bin/csh']
sge += ['#$ -S /bin/csh']
sge += ['#$ -o {}'.format(script_dir)]
sge += ['#$ -e {}'.format(script_dir)]
sge += ['#$ -cwd']
sge += ['#$ -t 1-{}'.format(njobs)]
sge += ['#$ -tc {}'.format(threads)]
sge += ['hostname']
sge += ['date']
sge += [('{}/run_model.py {} {} {} '
'$SGE_TASK_ID').format(script_dir, store, table,
gold_standard)]
sge += ['date']
sge = '\n'.join(sge)
f = open('{}/sge_{}_BMIQ.sh'.format(script_dir, table),
'wb')
f.write(sge)
f.close()
In [187]:
s = pd.Series(range(len(df.columns)), df.columns)
batch = pd.np.floor(s / 10) + 1
batch.to_csv('./batch.csv')
batch.max()
Out[187]:
In [190]:
generate_sge(183, store, table, gold_standard,
'/cellar/users/agross/TCGA_Code/Methlation/PreProcessing/sge',
threads=36)
In [181]:
!./sge/run_model.py $store $table $gold_standard 100
In [13]:
#store = pd.HDFStore('/data_ssd/methylation_norm.h5')
#store.append('quant_BMIQ_adj', data_n)
#store.create_table_index('quant_BMIQ_adj', optlevel=9, kind='full')