BMIQ Normalization

Here we are running BMIQ normalization on all of our quantile-normalized data together. I am using the implementation provided by Steve Horvath along with his recent methylation-age paper. We are doing this as a result of this paper's recomendation to run both quantile normalization and BMIQ in series on the same datasets.


In [182]:
%%file ./sge/run_model.py
#!/cellar/users/agross/anaconda2/bin/ipython

import sys
import pandas as pd

import rpy2.robjects as robjects
from pandas.rpy.common import convert_to_r_dataframe
from pandas.rpy.common import convert_robj
from IPython.display import clear_output

store = sys.argv[1]
table_name = sys.argv[2]
gold_standard_name = sys.argv[3]
batch = sys.argv[4]
print sys.argv

batches = pd.read_csv('/cellar/users/agross/TCGA_Code/Methlation/PreProcessing/batch.csv', 
                      names=['sample','batch'], header=None)
patients = list(batches[batches.batch == 1].sample)

betas = pd.read_hdf(store, table_name, columns=patients)
gold_standard = pd.read_hdf(store, gold_standard_name)
betas = betas.ix[gold_standard.index]
if betas.isnull().sum().sum() > 0:
    betas = betas.T.fillna(gold_standard).T
    
robjects.r.library('WGCNA');
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R");

df_r = robjects.r.t(convert_to_r_dataframe(betas))
gs = list(gold_standard.ix[betas.index])
gs_r = robjects.FloatVector(gs)
del betas

data_n = robjects.r.BMIQcalibration(df_r, gs_r)
data_n = convert_robj(data_n).T

data_n.columns = data_n.columns.map(lambda s: s.replace('.','-'))
data_n.columns = data_n.columns.map(lambda s: s[1:] if s.startswith('X') else s)
print data_n.shape
print store
print '{}_BMIQ_batch_{}'.format(table_name, batch)
data_n.to_hdf('{}_BMIQ.h5'.format(store[:-3]), '{}_batch_{}'.format(table_name, batch))


Overwriting ./sge/run_model.py

In [183]:
!chmod 755 ./sge/run_model.py

In [184]:
import pandas as pd
import os as os

In [185]:
store = '/cellar/users/agross/Data/tmp/methylation_norm.h5'
table = 'betas_adj'
gold_standard = 'Hannum_gold_standard'

In [5]:
df = pd.read_hdf(store, table)

In [103]:
del s['betas_adj']

In [107]:
df.to_hdf(store, table, format='t')

In [97]:
s = pd.HDFStore(store)

In [186]:
def generate_sge(njobs, store, table, gold_standard,
                 script_dir='.', threads=16):
    print 'Writing scripts and SGE driver to {}'.format(script_dir)
    if not os.path.isdir(script_dir):
        os.makedirs(script_dir)

    sge =  ['#! /bin/csh']
    sge += ['#$ -S /bin/csh']
    sge += ['#$ -o {}'.format(script_dir)]
    sge += ['#$ -e {}'.format(script_dir)]
    sge += ['#$ -cwd']
    sge += ['#$ -t 1-{}'.format(njobs)]
    sge += ['#$ -tc {}'.format(threads)]
    
    sge += ['hostname']
    sge += ['date']
    sge += [('{}/run_model.py {} {} {} '
             '$SGE_TASK_ID').format(script_dir, store, table, 
                                    gold_standard)]
    sge += ['date']
    
    sge = '\n'.join(sge)
    
    f = open('{}/sge_{}_BMIQ.sh'.format(script_dir, table), 
             'wb')
    f.write(sge)
    f.close()

In [187]:
s = pd.Series(range(len(df.columns)), df.columns)
batch = pd.np.floor(s / 10) + 1
batch.to_csv('./batch.csv')
batch.max()


Out[187]:
183.0

In [190]:
generate_sge(183, store, table, gold_standard, 
             '/cellar/users/agross/TCGA_Code/Methlation/PreProcessing/sge', 
             threads=36)


Writing scripts and SGE driver to /cellar/users/agross/TCGA_Code/Methlation/PreProcessing/sge

In [181]:
!./sge/run_model.py $store $table $gold_standard 100


['/cellar/users/agross/TCGA_Code/Methlation/PreProcessing/sge/run_model.py', '/cellar/users/agross/Data/tmp/methylation_norm.h5', 'betas_adj', 'Hannum_gold_standard', '100']
Loading required package: dynamicTreeCut
Loading required package: flashClust

Attaching package: ‘flashClust’

The following object is masked from ‘package:stats’:

    hclust

==========================================================================
*
*  Package WGCNA 1.41.1 loaded.
*
*    Important note: It appears that your system supports multi-threading,
*    but it is not enabled within WGCNA in R. 
*    To allow multi-threading within WGCNA with all available cores, use 
*
*          allowWGCNAThreads()
*
*    within R. Use disableWGCNAThreads() to disable threading if necessary.
*    Alternatively, set the following environment variable on your system:
*
*          ALLOW_WGCNA_THREADS=<number_of_processors>
*
*    for example 
*
*          ALLOW_WGCNA_THREADS=8
*
*    To set the environment variable in linux bash shell, type 
*
*           export ALLOW_WGCNA_THREADS=8
*
*     before running R. Other operating systems or shells will
*     have a similar command to achieve the same aim.
*
==========================================================================



Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor

Loading required package: RPMM
Loading required package: cluster
[1] "Fitting EM beta mixture to goldstandard probes"
[1] Inf
[1] 0.00284889
[1] 0.001488458
[1] 0.001385431
[1] 0.00121727
[1] "Done"
ii= 1
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.004563755
[1] 0.003165958
[1] 0.002789814
[1] 0.002738367
[1] "Done"
[1] "Start normalising input probes"
ii= 2
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.002088403
[1] 0.001701839
[1] 0.001581493
[1] 0.001494624
[1] "Done"
[1] "Start normalising input probes"
ii= 3
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.003715084
[1] 0.003215857
[1] 0.003056814
[1] 0.002944045
[1] "Done"
[1] "Start normalising input probes"
ii= 4
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.001796054
[1] 0.001479678
[1] 0.001305084
[1] 0.001238874
[1] "Done"
[1] "Start normalising input probes"
ii= 5
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.00428869
[1] 0.003260741
[1] 0.002927392
[1] 0.002752399
[1] "Done"
[1] "Start normalising input probes"
ii= 6
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.004417906
[1] 0.003547183
[1] 0.003458557
[1] 0.003428129
[1] "Done"
[1] "Start normalising input probes"
ii= 7
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.006366752
[1] 0.005445768
[1] 0.005070871
[1] 0.004839397
[1] "Done"
[1] "Start normalising input probes"
ii= 8
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.00327086
[1] 0.002717354
[1] 0.002306499
[1] 0.00185628
[1] "Done"
[1] "Start normalising input probes"
ii= 9
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.007695471
[1] 0.006436979
[1] 0.005916033
[1] 0.00558456
[1] "Done"
[1] "Start normalising input probes"
ii= 10
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.001280389
[1] 0.001016756
[1] 0.001111275
[1] 0.00109407
[1] "Done"
[1] "Start normalising input probes"
(485512, 10)
/cellar/users/agross/Data/tmp/methylation_norm.h5
betas_adj_BMIQ_batch_100

In [13]:
#store = pd.HDFStore('/data_ssd/methylation_norm.h5')
#store.append('quant_BMIQ_adj', data_n)
#store.create_table_index('quant_BMIQ_adj', optlevel=9, kind='full')