In [2]:

    
cd /cellar/users/agross/TCGA_Code/Methlation









    



/cellar/users/agross/TCGA_Code/Methlation



In [3]:

    
import NotebookImport
from HIV_Age_Advancement import *









    




importing IPython notebook from HIV_Age_Advancement






    




importing IPython notebook from Setup/Imports






    



Populating the interactive namespace from numpy and matplotlib






    




importing IPython notebook from Setup/MethylationAgeModels






    




importing IPython notebook from Setup/Read_HIV_Data

Logit Transform and Normalize Methylation Data



In [7]:

    
# Keep only the columns of `df_hiv` whose labels appear in `pred_c.index`.
# `.loc` replaces `.ix`, which pandas deprecated in 0.20 and removed in 1.0.
# NOTE(review): on recent pandas `.loc` raises KeyError for absent labels --
# confirm every label in `pred_c.index` is a column of `df_hiv`.
df = df_hiv.loc[:, pred_c.index]



In [14]:

    
# Apply `logit_adj` to the data, then standardize every row against the
# control columns: subtract the control mean and divide by the control
# standard deviation.
dd = logit_adj(df)

# Select the control columns once, with a boolean mask over `dd.columns`.
# `.ix` was removed in pandas 1.0, and a mask (unlike a raw label list) cannot
# raise when a control label is absent from `dd`.
# NOTE(review): assumes `ti` returns the labels where its argument is True -- confirm.
is_control = dd.columns.isin(ti(duration == 'Control'))
m = dd.loc[:, is_control].mean(axis=1)
s = dd.loc[:, is_control].std(axis=1)

df_norm = dd.subtract(m, axis=0).divide(s, axis=0)
# Cap the standardized values at +/-7 so extreme outliers (including the
# infinities produced by a zero standard deviation) stay bounded.
df_norm = df_norm.clip(-7, 7)



In [15]:

    
# Show the (rows, columns) dimensions of the normalized frame.
df_norm.shape









    Out[15]:





(485512, 178)

Prepare Data for Association Tests

The association tests take a while to run serially, so we run them in a map-reduce style.
We split the data into 100 chunks, run the tests on the chunks in parallel, and then combine the results.
This is not strictly necessary, but it cuts the run time from ~15 minutes to about 15 seconds.



In [16]:

    
def chunkify_df(df, store, table_name, N=100):
    """Split a DataFrame into ``N`` interleaved row chunks and save each to HDF5.

    Columns containing any missing value are dropped first.  Chunk ``i`` then
    receives rows ``i, i + N, i + 2N, ...`` (by position) and is written under
    the key ``'<table_name>/chunk_<i>'``.  When ``N`` exceeds the number of
    rows, the surplus chunks are written as empty frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    store : path or open store
        Destination, passed straight through to ``DataFrame.to_hdf``.
    table_name : str
        Key prefix under which the chunks are stored.
    N : int, default 100
        Number of chunks to write; must be at least 1.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        # Previously this silently wrote nothing, hiding a bad argument.
        raise ValueError('N must be a positive integer, got {!r}'.format(N))
    # Name the axis explicitly: positional ``dropna(1)`` is rejected by
    # pandas >= 2.0.
    complete = df.dropna(axis=1)
    for i in range(N):
        # Slice by position instead of looking labels up through ``.ix``
        # (removed in pandas 1.0); this also keeps every row in exactly one
        # chunk when index labels repeat.
        chunk = complete.iloc[i::N]
        # ``key`` is passed by keyword because newer pandas deprecates
        # passing it positionally.
        chunk.to_hdf(store, key='{}/chunk_{}'.format(table_name, i))



In [17]:

    
# Count how many columns of the normalized data fall in each `duration` group.
# `.reindex` stands in for `.ix` (removed in pandas 1.0): labels without a
# duration entry become NaN, which `value_counts` leaves out.
# NOTE(review): `.reindex` requires `duration` to have a unique index -- confirm.
duration.reindex(df_norm.columns).value_counts()









    Out[17]:





HIV Long     101
Control       40
HIV Short     33
dtype: int64



In [19]:

    
# Tally how many entries carry each value of `hiv`.
hiv.value_counts()









    Out[19]:





HIV+    137
HIV-     42
dtype: int64



In [31]:

    
# Open the HDF5 file that holds the inputs for the parallel association tests.
# The path gets its own name so `store` always refers to the open HDFStore.
store_path = '/cellar/users/agross/Data/tmp/for_parallel.h5'
store = pd.HDFStore(store_path)

# Save HIV status as booleans (True where `hiv` is 'HIV+'), restricted to the
# labels in `pred_c.index`. `.loc` replaces `.ix`, removed in pandas 1.0.
# NOTE(review): on recent pandas `.loc` raises KeyError for absent labels --
# confirm every label in `pred_c.index` is present in `hiv`.
(hiv == 'HIV+').loc[pred_c.index].to_hdf(store, key='HIV')

# Additional tables, currently disabled:
#store['bio_age'] = mc_adj_c
#store['cell_counts'] = cell_counts
#store['age'] = age
#store['gender'] = gender == 'M'
#store['bio_age'] = age_adv.append(age_adv0)



In [28]:

    
# Write the normalized data to the HDF5 file in the default number of chunks,
# under the 'hiv_consented' key prefix.
chunkify_df(
    df_norm,
    store=store.filename,
    table_name='hiv_consented',
)









    



/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:2441: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis0] [items->None]

  warnings.warn(ws, PerformanceWarning)
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:2441: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_items] [items->None]

  warnings.warn(ws, PerformanceWarning)



In [33]:

    
# Close the HDF5 store and immediately reopen it.
# NOTE(review): presumably this refreshes the handle so it sees the chunks
# that were written through the file path rather than through `store` -- confirm.
store.close()
store.open()



In [ ]: