In [53]:
%%file run_model.py
#!/cellar/users/agross/anaconda2/bin/python

import sys
import pandas as pd
import statsmodels.api as sm

store = sys.argv[1]
outdir = sys.argv[2]
table_name = sys.argv[3]
out_suffix = sys.argv[4]
covariates = sys.argv[5:]
print sys.argv

c_age = pd.read_hdf(store, 'age')
hiv = pd.read_hdf(store, 'HIV')
cell_counts = pd.read_hdf(store, 'cell_counts')
df = pd.read_hdf(store, table_name)

intercept = pd.Series(1, index=hiv.index)

X = pd.concat([intercept, hiv, cell_counts.CD4T, cell_counts.CD8T,
               cell_counts.NK, cell_counts.Bcell, cell_counts.Mono,
               c_age], axis=1, 
              keys=['Intercept','HIV', 'CD4T','CD8T','NK','Bcell','Mono',
                    'age'])
X = X.ix[df.columns]
X = X.dropna()
X = X[['HIV','Intercept'] + covariates]

X_reduced = X[['Intercept'] + covariates]
print X.shape

p = {}
d_hiv = {}
w = (len(hiv) - hiv.map(hiv.value_counts())).astype(float) / len(hiv)
w = w.ix[X.index]
df = df.ix[:, X.index]

for i,y in df.iterrows():
    y.name = 'marker'
    mod_all = sm.WLS(y, X)
    res_ref = mod_all.fit()
    p[i] = res_ref.params
    
    mod_reduced = sm.WLS(y, X_reduced)
    m1 = mod_reduced.fit()
    d_hiv[i] = res_ref.compare_lr_test(m1)

p = pd.DataFrame(p).T
d_hiv = pd.DataFrame(d_hiv, index=['LR','p','df']).T

out = pd.concat([p, d_hiv], 
                keys=['multi_variate','HIV_LR'],
                axis=1)
out.to_csv('{}/{}_{}.csv'.format(outdir, table_name, out_suffix))


Overwriting run_model.py

In [54]:
import os as os
import pandas as pd

In [55]:
!chmod 755 run_model.py

In [56]:
pwd = !pwd
pwd = pwd[0]

In [62]:
store = '/cellar/users/agross/Data/tmp/for_parallel.h5'
outdir = '/cellar/users/agross/Data/tmp'
#px = ['in_set_s1','in_set_s2','in_set_s3', 'out_set_s1','out_set_s3']
table_prefix = 'hiv_consented'
path = outdir + '/' + table_prefix
if not os.path.isdir(path):
    os.makedirs(path)

In [63]:
def generate_sge(covariates, njobs, store, table_prefix, outdir, out_suffix,
                 script_dir='.', threads=16):
    print 'Writing scripts and SGE driver to {}'.format(script_dir)
    if not os.path.isdir(script_dir):
        os.makedirs(script_dir)

    sge =  ['#! /bin/csh']
    sge += ['#$ -S /bin/csh']
    sge += ['#$ -o {}'.format(script_dir)]
    sge += ['#$ -e {}'.format(script_dir)]
    sge += ['#$ -cwd']
    sge += ['#$ -t 1-{}'.format(njobs)]
    sge += ['#$ -tc {}'.format(threads)]
    
    sge += ['hostname']
    sge += ['date']
    cov = ' '.join(covariates)
    sge += [('{}/run_model.py {} {} {}/chunk_$SGE_TASK_ID '  
             '{} {}').format(script_dir, store, outdir, 
                             table_prefix, out_suffix, cov)]
    sge += ['date']
    
    sge = '\n'.join(sge)
    
    f = open('{}/sge_{}_{}.sh'.format(script_dir, table_prefix, out_suffix), 
             'wb')
    f.write(sge)
    f.close()

In [64]:
#generate_sge(['age'], 100, store, table_prefix, outdir, 'hiv_age', threads=100) 
#generate_sge(['bio_age'], 100, store, table_prefix, outdir, 'hiv_bioage', threads=100) 
#generate_sge(['age','bio_age'], 100, store, table_prefix, outdir, 'hiv_age_bio_age', 
#             threads=100)

In [65]:
generate_sge(['age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store, 
             table_prefix, outdir, 'hiv_age_cc', threads=100) 
#generate_sge(['bio_age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store, 
#             table_prefix, outdir, 'hiv_bioage_cc', threads=100) 
#generate_sge(['age','bio_age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store, 
#             table_prefix, outdir, 'hiv_age_bio_age_cc', threads=100)


Writing scripts and SGE driver to .

In [66]:
#generate_sge(['CD4T','CD8T','NK','Bcell','Mono'], 100, store, 
#             table_prefix, outdir, 'hiv_cc', threads=100)

I have a discrepency with SGE and the way I number my tables... this is a patch to just run this table seperately. I will try and fix the bug ASAP.


In [67]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir $table_prefix/chunk_0 hiv_age_cc $cov


['./run_model.py', '/cellar/users/agross/Data/tmp/for_parallel.h5', '/cellar/users/agross/Data/tmp', 'hiv_consented/chunk_0', 'hiv_age_cc', 'age', 'CD4T', 'CD8T', 'NK', 'Bcell', 'Mono']
(173, 8)