In [6]:
%%file run_model.py
#!/cellar/users/agross/anaconda2/bin/python

import sys

import pandas as pd
import statsmodels.api as sm

store = sys.argv[1]
outdir = sys.argv[2]
table_name = sys.argv[3]

fmla = 'marker ~ age + CD4T + CD8T + NK + Bcell + Mono + study'

c_age = pd.read_hdf(store, 'age')
b_age = pd.read_hdf(store, 'bio_age')
gender = pd.read_hdf(store, 'gender')
age = c_age

cell_counts = pd.read_hdf(store, 'cell_counts')
df = pd.read_hdf(store, table_name)
df = df.dropna(1)

intercept = pd.Series(1, index=df.columns)
X = pd.concat([intercept, age, cell_counts.CD4T, cell_counts.CD8T,
               cell_counts.NK, cell_counts.Bcell, cell_counts.Mono,
               gender], axis=1, 
              keys=['Intercept','age', 'CD4T','CD8T','NK','Bcell','Mono',
                    'gender'])
X = X.dropna()
idx = X.index.intersection(df.columns)
X = X.ix[idx]
df = df.ix[:, idx]

X_HIV = X[['Intercept','CD4T','CD8T','NK','Bcell','Mono','gender']]

p = {}
d_hiv = {}

for i,y in df.iterrows():
    y.name = 'marker'
    mod_all = sm.OLS(y, X)
    res_ref = mod_all.fit()
    p[i] = res_ref.params
    
    mod_hiv = sm.OLS(y, X_HIV)
    m1 = mod_hiv.fit()
    d_hiv[i] = res_ref.compare_lr_test(m1)

p = pd.DataFrame(p).T
d_hiv = pd.DataFrame(d_hiv, index=['LR','p','df']).T

out = pd.concat([p, d_hiv], 
                keys=['multi_variate','age_LR'],
                axis=1)
out.to_csv('{}/{}_age_out.csv'.format(outdir, table_name))


Overwriting run_model.py

In [7]:
!chmod 755 run_model.py

In [8]:
import os as os

In [8]:


In [9]:
def generate_sge(jobs, store, table_prefix, outdir, script_dir='./', threads=16):
    print 'Writing scripts and SGE driver to {}'.format(script_dir)
    if not os.path.isdir(script_dir):
        os.makedirs(script_dir)

    sge =  ['#! /bin/csh']
    sge += ['#$ -S /bin/csh']
    sge += ['#$ -o {}'.format(script_dir)]
    sge += ['#$ -e {}'.format(script_dir)]
    sge += ['#$ -cwd']
    sge += ['#$ -t 1-{}'.format(len(jobs) + 1)]
    sge += ['#$ -tc {}'.format(threads)]
    
    sge += ['hostname']
    sge += ['date']
    sge += ['{}/run_model.py {} {} {}/chunk_$SGE_TASK_ID'.format(script_dir, store, outdir,
                                                                 table_prefix)]
    sge += ['date']
    
    sge = '\n'.join(sge)
    
    f = open('{}/sge_{}.sh'.format(script_dir, table_prefix), 'wb')
    f.write(sge)
    f.close()

In [5]:
store = '/cellar/users/agross/Data/tmp/for_parallel.h5'
outdir = '/cellar/users/agross/Data/tmp'
#px = ['in_set_s1','in_set_s2','in_set_s3', 'out_set_s1','out_set_s3']
px = ['in_set_s1','in_set_s2','in_set_s3']

for table_prefix in px:
    path = outdir + '/' + table_prefix
    if not os.path.isdir(path):
        os.makedirs(path)
    generate_sge(range(99), store, table_prefix, outdir, threads=100)


Writing scripts and SGE driver to ./
Writing scripts and SGE driver to ./
Writing scripts and SGE driver to ./

I have a discrepency with SGE and the way I number my tables... this is a patch to just run this table seperately. I will try and fix the bug ASAP.


In [10]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s1/chunk_0

In [12]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s2/chunk_0

In [11]:
covariates = ['age', 'CD4T','CD8T','NK','Bcell','Mono']
cov = ' '.join(covariates)
!./run_model.py $store $outdir in_set_s3/chunk_0