In [2]:
cd /cellar/users/agross/TCGA_Code/Methlation


/cellar/users/agross/TCGA_Code/Methlation

In [3]:
import NotebookImport
from HIV_Age_Advancement import *


importing IPython notebook from HIV_Age_Advancement
importing IPython notebook from Setup/Imports
Populating the interactive namespace from numpy and matplotlib
importing IPython notebook from Setup/MethylationAgeModels
importing IPython notebook from Setup/Read_HIV_Data

Logit Transform and Normalize Methylation Data


In [7]:
# Keep only the columns of df_hiv named by pred_c.index.
# NOTE(review): `.ix` is deprecated (removed in pandas 1.0); presumably these
# are sample labels, in which case `.loc` is the replacement -- confirm that
# every label in pred_c.index is present in df_hiv before switching.
df = df_hiv.ix[:, pred_c.index]

In [14]:
# Transform the data with logit_adj, then standardize every row against the
# control samples: subtract the per-row control mean, divide by the per-row
# control standard deviation, and clip the result to [-7, 7].
dd = logit_adj(df)

# Columns flagged as 'Control' in `duration`; selected once and reused below.
control_values = dd.ix[:, ti(duration == 'Control')]
m = control_values.mean(axis=1)
s = control_values.std(axis=1)

df_norm = (
    dd.subtract(m, axis=0)
      .divide(s, axis=0)
      .clip(lower=-7, upper=7)
)

In [15]:
# Show the (rows, columns) dimensions of the normalized matrix.
df_norm.shape


Out[15]:
(485512, 178)

Prepare Data for Association Tests

  • The association tests take a while to run serially, so we run them in a map-reduce style
  • The idea is we break the data into 100 chunks, run the tests in parallel, and then combine the results
  • This is not strictly necessary, but it drops the run time from ~15 min to about 15 seconds

In [16]:
def chunkify_df(df, store, table_name, N=100):
    """
    Split a DataFrame row-wise into N interleaved chunks and write each to HDF5.

    Columns containing any missing value are dropped first. Chunk ``i`` then
    holds the rows at positions ``i, i + N, i + 2N, ...`` and is written under
    the key ``'<table_name>/chunk_<i>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; it is not modified.
    store : object
        Destination passed straight through to ``DataFrame.to_hdf``
        (the calling cell in this notebook passes an HDF5 file path).
    table_name : str
        Prefix of the keys the chunks are stored under.
    N : int, default 100
        Number of chunks. All N keys are written, so trailing chunks are
        empty when ``df`` has fewer than N rows.

    Returns
    -------
    None
    """
    complete = df.dropna(axis=1)
    for i in range(N):
        # Stride by position, not by label: every row lands in exactly one
        # chunk even when index labels repeat (label lookup would pull every
        # row sharing a label into the chunk), and it avoids the `.ix`
        # indexer that newer pandas no longer provides.
        chunk = complete.iloc[i::N]
        chunk.to_hdf(store, key='{}/chunk_{}'.format(table_name, i))

In [17]:
# Count how many of the samples (columns) in df_norm fall into each
# `duration` group.
duration.ix[df_norm.columns].value_counts()


Out[17]:
HIV Long     101
Control       40
HIV Short     33
dtype: int64

In [19]:
# Count the labels in the full `hiv` series (not restricted to the columns
# of df_norm).
hiv.value_counts()


Out[19]:
HIV+    137
HIV-     42
dtype: int64

In [31]:
# Open the scratch HDF5 file that holds the inputs for the parallel runs.
store = pd.HDFStore('/cellar/users/agross/Data/tmp/for_parallel.h5')

# Boolean HIV+ indicator, restricted to the samples in pred_c.index and
# saved under the key 'HIV'.
hiv_positive = hiv == 'HIV+'
hiv_positive.ix[pred_c.index].to_hdf(store, 'HIV')

# Additional tables that are currently not written (kept for reference):
#store['bio_age'] = mc_adj_c
#store['cell_counts'] = cell_counts
#store['age'] = age
#store['gender'] = gender == 'M'
#store['bio_age'] = age_adv.append(age_adv0)

In [28]:
# Write the normalized matrix to the HDF5 file in chunks (default N=100)
# under keys 'hiv_consented/chunk_<i>'.
# NOTE(review): the file is addressed by path (store.filename) rather than
# through the `store` handle itself -- confirm that writing this way while
# `store` is still open is intended.
chunkify_df(df_norm, store.filename, 'hiv_consented')


/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:2441: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis0] [items->None]

  warnings.warn(ws, PerformanceWarning)
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:2441: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_items] [items->None]

  warnings.warn(ws, PerformanceWarning)

In [33]:
# Close the store handle and immediately reopen it.
# NOTE(review): presumably this refreshes the handle after chunks were
# written to the same file via its path -- confirm.
store.close()
store.open()

In [ ]: