In [1]:
import os 
# If the kernel was started from a directory whose path ends in 'Parallel',
# step up to the parent directory (presumably so that the project-relative
# imports in the next cell resolve -- confirm).
if os.getcwd().endswith('Parallel'):
    os.chdir(os.pardir)

In [2]:
import NotebookImport
from Benchmarks.Model_Comparison_MF import *


importing IPython notebook from Benchmarks/Model_Comparison_MF
importing IPython notebook from Benchmarks/Age_Models_All_Patients
importing IPython notebook from Setup/Imports
Populating the interactive namespace from numpy and matplotlib
importing IPython notebook from Setup/MethylationAgeModels
importing IPython notebook from Setup/Read_HIV_Data

Logit Transform and Normalize Methylation Data


In [3]:
# Select the entries of `age` strictly between 25 and 68 (both bounds
# exclusive).  `ti` comes from the project imports; presumably it turns the
# boolean mask into the index of its True entries -- confirm.
k = ti((age > 25) & (age < 68))

In [4]:
# Apply the project's logit adjustment to the columns of df_meth selected by k.
dd = logit_adj(df_meth.ix[:, k])

# Row-wise statistics taken across the selected columns.
m = dd.mean(axis=1)
s = dd.std(axis=1)

# Standardise every row (subtract its mean, divide by its standard deviation)
# and cap the resulting values to the range [-7, 7].
df_norm = dd.sub(m, axis=0).div(s, axis=0).clip(-7, 7)

Prepare Data for Association Tests

  • The association tests take a while to run in serial so we do them in a map-reduce type format
  • The idea is we break the data into 100 chunks, run the tests in parallel, and then combine the results
  • This is not entirely necessary but drops run-time from ~15 min to about 15 seconds

In [5]:
def chunkify_df(df, store, table_name, N=100):
    """
    Split the rows of a DataFrame into N interleaved chunks and write each
    chunk to an HDF5 target.

    Columns that contain any missing value are dropped first.  Chunk ``i``
    then holds the rows at positions i, i + N, i + 2N, ... and is written
    under the key ``'<table_name>/chunk_<i>'``.  ``to_hdf`` is called once
    for every ``i`` in ``range(N)``, including chunks that receive no rows
    (when N exceeds the number of rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; it is not modified.
    store : str or other target accepted by ``DataFrame.to_hdf``
        Where the chunks are written.
    table_name : str
        Key prefix shared by all chunks of this table.
    N : int, default 100
        Number of chunks.

    Returns
    -------
    None
    """
    complete = df.dropna(axis=1)
    for i in range(N):
        # Stride by position rather than re-selecting by label: a label
        # lookup would return every row sharing a repeated index label, so
        # rows could be duplicated inside a chunk.
        chunk = complete.iloc[i::N]
        chunk.to_hdf(store, key='{}/chunk_{}'.format(table_name, i))

In [8]:
# Show how many entries carry each value of `gender`.
gender.value_counts()


Out[8]:
F    1148
M     801
dtype: int64

In [9]:
# Tally the `labels` values for the entries of k that also appear among the
# columns of df_meth.
labels.ix[k.intersection(df_meth.columns)].value_counts()


Out[9]:
s3    662
s1    538
s2    189
dtype: int64

In [8]:
# Open the HDF5 store that the later cells fill with chunked data.
store = pd.HDFStore('/cellar/users/agross/Data/tmp/for_parallel.h5')

# Variables written to the store, one key each.
store['labels'] = labels
store['bio_age'] = mc_adj_c
store['cell_counts'] = cell_counts
store['age'] = age
store['gender'] = (gender == 'M')  # boolean: True where gender is 'M'
# Alternative source for 'bio_age', currently disabled:
#store['bio_age'] = age_adv.append(age_adv0)


/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:2441: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->index] [items->None]

  warnings.warn(ws, PerformanceWarning)

In [9]:
# Tally the `labels` values for the entries of k that also appear among the
# columns of df_meth.
labels.ix[k.intersection(df_meth.columns)].value_counts()


Out[9]:
s3    662
s1    538
s2    189
dtype: int64

In [ ]:
# Write the normalised data for each label group to the store file in chunks,
# under the table names 'in_set_s1', 'in_set_s2' and 'in_set_s3'.
for set_name in ('s1', 's2', 's3'):
    chunkify_df(df_norm.ix[:, ti(labels == set_name)], store.filename,
                'in_set_{}'.format(set_name))

In [ ]:
# Close the store handle and immediately reopen it.
# NOTE(review): presumably this lets the handle pick up the chunks that
# chunkify_df wrote through store.filename -- confirm.
store.close()
store.open()