In [6]:
%%file run_model.py
#!/cellar/users/agross/anaconda2/bin/python
import sys
import pandas as pd
import statsmodels.api as sm
store = sys.argv[1]
outdir = sys.argv[2]
table_name = sys.argv[3]
fmla = 'marker ~ age + CD4T + CD8T + NK + Bcell + Mono + study'
c_age = pd.read_hdf(store, 'age')
b_age = pd.read_hdf(store, 'bio_age')
gender = pd.read_hdf(store, 'gender')
age = c_age
cell_counts = pd.read_hdf(store, 'cell_counts')
df = pd.read_hdf(store, table_name)
df = df.dropna(1)
intercept = pd.Series(1, index=df.columns)
X = pd.concat([intercept, age, cell_counts.CD4T, cell_counts.CD8T,
cell_counts.NK, cell_counts.Bcell, cell_counts.Mono,
gender], axis=1,
keys=['Intercept','age', 'CD4T','CD8T','NK','Bcell','Mono',
'gender'])
X = X.dropna()
idx = X.index.intersection(df.columns)
X = X.ix[idx]
df = df.ix[:, idx]
X_HIV = X[['Intercept','CD4T','CD8T','NK','Bcell','Mono','gender']]
p = {}
d_hiv = {}
for i,y in df.iterrows():
y.name = 'marker'
mod_all = sm.OLS(y, X)
res_ref = mod_all.fit()
p[i] = res_ref.params
mod_hiv = sm.OLS(y, X_HIV)
m1 = mod_hiv.fit()
d_hiv[i] = res_ref.compare_lr_test(m1)
p = pd.DataFrame(p).T
d_hiv = pd.DataFrame(d_hiv, index=['LR','p','df']).T
out = pd.concat([p, d_hiv],
keys=['multi_variate','age_LR'],
axis=1)
out.to_csv('{}/{}_age_out.csv'.format(outdir, table_name))
In [7]:
!chmod 755 run_model.py
In [8]:
import os as os
In [8]:
In [9]:
def generate_sge(jobs, store, table_prefix, outdir, script_dir='./', threads=16):
print 'Writing scripts and SGE driver to {}'.format(script_dir)
if not os.path.isdir(script_dir):
os.makedirs(script_dir)
sge = ['#! /bin/csh']
sge += ['#$ -S /bin/csh']
sge += ['#$ -o {}'.format(script_dir)]
sge += ['#$ -e {}'.format(script_dir)]
sge += ['#$ -cwd']
sge += ['#$ -t 1-{}'.format(len(jobs) + 1)]
sge += ['#$ -tc {}'.format(threads)]
sge += ['hostname']
sge += ['date']
sge += ['{}/run_model.py {} {} {}/chunk_$SGE_TASK_ID'.format(script_dir, store, outdir,
table_prefix)]
sge += ['date']
sge = '\n'.join(sge)
f = open('{}/sge_{}.sh'.format(script_dir, table_prefix), 'wb')
f.write(sge)
f.close()
In [5]:
store = '/cellar/users/agross/Data/tmp/for_parallel.h5'
outdir = '/cellar/users/agross/Data/tmp'
#px = ['in_set_s1','in_set_s2','in_set_s3', 'out_set_s1','out_set_s3']
px = ['in_set_s1','in_set_s2','in_set_s3']
for table_prefix in px:
path = outdir + '/' + table_prefix
if not os.path.isdir(path):
os.makedirs(path)
generate_sge(range(99), store, table_prefix, outdir, threads=100)
I have a discrepency with SGE and the way I number my tables... this is a patch to just run this table seperately. I will try and fix the bug ASAP.
In [10]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s1/chunk_0
In [12]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s2/chunk_0
In [11]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s3/chunk_0