In [53]:
%%file run_model.py
#!/cellar/users/agross/anaconda2/bin/python
import sys
import pandas as pd
import statsmodels.api as sm
store = sys.argv[1]
outdir = sys.argv[2]
table_name = sys.argv[3]
out_suffix = sys.argv[4]
covariates = sys.argv[5:]
print sys.argv
c_age = pd.read_hdf(store, 'age')
hiv = pd.read_hdf(store, 'HIV')
cell_counts = pd.read_hdf(store, 'cell_counts')
df = pd.read_hdf(store, table_name)
intercept = pd.Series(1, index=hiv.index)
X = pd.concat([intercept, hiv, cell_counts.CD4T, cell_counts.CD8T,
cell_counts.NK, cell_counts.Bcell, cell_counts.Mono,
c_age], axis=1,
keys=['Intercept','HIV', 'CD4T','CD8T','NK','Bcell','Mono',
'age'])
X = X.ix[df.columns]
X = X.dropna()
X = X[['HIV','Intercept'] + covariates]
X_reduced = X[['Intercept'] + covariates]
print X.shape
p = {}
d_hiv = {}
w = (len(hiv) - hiv.map(hiv.value_counts())).astype(float) / len(hiv)
w = w.ix[X.index]
df = df.ix[:, X.index]
for i,y in df.iterrows():
y.name = 'marker'
mod_all = sm.WLS(y, X)
res_ref = mod_all.fit()
p[i] = res_ref.params
mod_reduced = sm.WLS(y, X_reduced)
m1 = mod_reduced.fit()
d_hiv[i] = res_ref.compare_lr_test(m1)
p = pd.DataFrame(p).T
d_hiv = pd.DataFrame(d_hiv, index=['LR','p','df']).T
out = pd.concat([p, d_hiv],
keys=['multi_variate','HIV_LR'],
axis=1)
out.to_csv('{}/{}_{}.csv'.format(outdir, table_name, out_suffix))
In [54]:
import os as os
import pandas as pd
In [55]:
!chmod 755 run_model.py
In [56]:
pwd = !pwd
pwd = pwd[0]
In [62]:
store = '/cellar/users/agross/Data/tmp/for_parallel.h5'
outdir = '/cellar/users/agross/Data/tmp'
#px = ['in_set_s1','in_set_s2','in_set_s3', 'out_set_s1','out_set_s3']
table_prefix = 'hiv_consented'
path = outdir + '/' + table_prefix
if not os.path.isdir(path):
os.makedirs(path)
In [63]:
def generate_sge(covariates, njobs, store, table_prefix, outdir, out_suffix,
script_dir='.', threads=16):
print 'Writing scripts and SGE driver to {}'.format(script_dir)
if not os.path.isdir(script_dir):
os.makedirs(script_dir)
sge = ['#! /bin/csh']
sge += ['#$ -S /bin/csh']
sge += ['#$ -o {}'.format(script_dir)]
sge += ['#$ -e {}'.format(script_dir)]
sge += ['#$ -cwd']
sge += ['#$ -t 1-{}'.format(njobs)]
sge += ['#$ -tc {}'.format(threads)]
sge += ['hostname']
sge += ['date']
cov = ' '.join(covariates)
sge += [('{}/run_model.py {} {} {}/chunk_$SGE_TASK_ID '
'{} {}').format(script_dir, store, outdir,
table_prefix, out_suffix, cov)]
sge += ['date']
sge = '\n'.join(sge)
f = open('{}/sge_{}_{}.sh'.format(script_dir, table_prefix, out_suffix),
'wb')
f.write(sge)
f.close()
In [64]:
#generate_sge(['age'], 100, store, table_prefix, outdir, 'hiv_age', threads=100)
#generate_sge(['bio_age'], 100, store, table_prefix, outdir, 'hiv_bioage', threads=100)
#generate_sge(['age','bio_age'], 100, store, table_prefix, outdir, 'hiv_age_bio_age',
# threads=100)
In [65]:
generate_sge(['age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store,
table_prefix, outdir, 'hiv_age_cc', threads=100)
#generate_sge(['bio_age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store,
# table_prefix, outdir, 'hiv_bioage_cc', threads=100)
#generate_sge(['age','bio_age', 'CD4T','CD8T','NK','Bcell','Mono'], 100, store,
# table_prefix, outdir, 'hiv_age_bio_age_cc', threads=100)
In [66]:
#generate_sge(['CD4T','CD8T','NK','Bcell','Mono'], 100, store,
# table_prefix, outdir, 'hiv_cc', threads=100)
I have a discrepency with SGE and the way I number my tables... this is a patch to just run this table seperately. I will try and fix the bug ASAP.
In [67]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir $table_prefix/chunk_0 hiv_age_cc $cov