Creating Feature Sets for HIV and Age



In [1]:

    
import os 
if os.getcwd().endswith('Parallel'):
    os.chdir('..')



In [2]:

    
import NotebookImport
from HIV_Age_Advancement import *









    




importing IPython notebook from HIV_Age_Advancement






    




importing IPython notebook from Setup/Imports






    



Populating the interactive namespace from numpy and matplotlib






    




importing IPython notebook from Setup/MethylationAgeModels






    




importing IPython notebook from Setup/Read_HIV_Data



In [3]:

    
def size_normalize(vec, order, s=10001):
    p = int(np.ceil(s / 2.) - 1)
    rank = lambda v: np.mean(v < v[p])
    vec = vec.ix[order.order().index].dropna()
    rolling_rank = pd.rolling_apply(vec, s, rank, center=True)
    return rolling_rank

Read in HIV Case Control Models



In [4]:

    
outdir = '/cellar/users/agross/Data/tmp'
p = outdir + '/hiv_consented/'
def read_models(suffix):
    df = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for 
                    f in os.listdir(p) if f.endswith('{}.csv'.format(suffix))])
    return df



In [5]:

    
r4 = read_models('hiv_age_cc')



In [6]:

    
gg = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
gg.value_counts()









    Out[6]:





False    391683
True      81361
dtype: int64

Read in Age Models

Here we went and ran this notebook
Now we need to unpack the data and combine it together



In [7]:

    
outdir = '/cellar/users/agross/Data/tmp'
tables = ['in_set_s1','in_set_s2','in_set_s3']

res = {}
for tp in tables:
    p = outdir + '/' + tp + '/'
    res[tp] = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for 
                    f in os.listdir(p) if f.endswith('age_out.csv')])
res = pd.concat(res, axis=1)

p_vals = res.xs('age_LR', axis=1, level=1).xs('p', axis=1, level=1)
p_vals = p_vals.ix[probe_idx].dropna()
len(p_vals)









    Out[7]:





473044



In [8]:

    
def disorder_frac(direction, beta, test_set):
    ct = pd.crosstab(beta < .5, direction.ix[test_set]>0)
    return (1.*ct[1][1] + ct[0][0]) / ct.sum().sum()

Nested Screen for Age Associated Probes

First pass is Hannum et al. dataset
Next we look in the probes passing stage 1 for probes in the EPIC dataset
The two tiers is for power, we only need to correct in stage II for the number of probes passing stage I
I'm using a Benjamini-Hochberg adusted p-value cutoff of 0.01



In [9]:

    
rr = ti(bhCorrection(p_vals.in_set_s1) < .01)
len(rr)









    Out[9]:





61592



In [10]:

    
v = p_vals.in_set_s3.ix[rr]
r2 = ti(bhCorrection(v) < .01)
len(r2)









    Out[10]:





26927



In [11]:

    
rr = bhCorrection(p_vals.in_set_s1) < .01
rr = rr + pd.Series(1, index=r2).ix[rr.index].fillna(0)
rr.value_counts()









    Out[11]:





0    411452
1     34665
2     26927
dtype: int64

Size Normalize Datasets



In [12]:

    
mm = df_hiv.ix[:, ti(hiv == 'HIV-')].mean(1)



In [13]:

    
hiv_rolled = size_normalize(r4.HIV_LR.p, mm)
age_rolled = size_normalize(p_vals.in_set_s1, mm)

Form Feature Sets



In [14]:

    
g_age = bhCorrection(p_vals.in_set_s1.ix[probe_idx]) < .01
g_age_2 = pd.Series(True, r2).ix[probe_idx].fillna(False)
g_age = g_age_2
g_hiv = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
combo = combine(g_age, g_hiv) == 'both'



In [15]:

    
#Do not import
fisher_exact_test(g_age, g_hiv)









    Out[15]:





odds_ratio    1.29e+00
p             3.88e-59
dtype: float64



In [16]:

    
combo.value_counts()









    Out[16]:





False    467413
True       5631
dtype: int64

Feature sets from rolling statistics



In [17]:

    
g_hiv_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv.sum()
g_age_r = age_rolled.rank(ascending=True).dropna() < g_age.sum()

Nominal significance thresholds



In [18]:

    
g_hiv_nom = r4.HIV_LR.p < .05
g_age_nom = p_vals.in_set_s1 < .05

Nominal significance thresholds: rolled



In [19]:

    
hiv_nom_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv_nom.sum()
age_nom_r = age_rolled.rank(ascending=True).dropna() < g_age_nom.sum()

Bonferroni adjusted thresholds



In [20]:

    
g_hiv_b = (r4.HIV_LR.ix[probe_idx].p * len(r4)) < .01
g_age_b = (p_vals.in_set_s1.ix[probe_idx] * len(p_vals)) < .01

Combinations



In [21]:

    
combo = combine(g_age, g_hiv) == 'both'
combo_b = combine(g_hiv_b, g_age_b) == 'both'
combo_r = combine(g_age_r, g_hiv_r) == 'both'



In [22]:

    
features = {'HIV (BH)': g_hiv & (g_age_nom == False), 
            'HIV (adj)': g_hiv_r & (age_nom_r == False), 
            'HIV (Bonf)': g_hiv_b & (g_age_nom == False), 
            'Age (BH)': g_age_2 & (g_hiv_nom == False), 
            'Age (adj)': g_age_r & (hiv_nom_r == False),
            'Age (Bonf)': g_age_b & (g_hiv_nom == False),
            'HIV + Age (BH)': combo, 
            'HIV + Age (adj)': combo_r,
            'HIV + Age (Bonf)': combo_b}



In [23]:

    
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1









    Out[23]:






  
    
      
      False
      True
    
  
  
    
      Age (BH)
      457710
      15334
    
    
      Age (Bonf)
      467844
      5200
    
    
      Age (adj)
      446957
      16087
    
    
      HIV (BH)
      421899
      51145
    
    
      HIV (Bonf)
      462149
      10895
    
    
      HIV (adj)
      423348
      52164
    
    
      HIV + Age (BH)
      467413
      5631
    
    
      HIV + Age (Bonf)
      472713
      331
    
    
      HIV + Age (adj)
      458003
      5041

Second pass to simplify things...



In [24]:

    
features = {'HIV (BH)': g_hiv & (g_age_2 == False), 
            'HIV (adj)': g_hiv_r & (age_nom_r == False), 
            'HIV (Bonf)': g_hiv_b & (g_age_b == False), 
            'Age (BH)': g_age_2 & (g_hiv == False), 
            'Age (adj)': g_age_r & (g_hiv == False),
            'Age (Bonf)': g_age_b & (g_hiv_b == False),
            'HIV + Age (BH)': combo, 
            'HIV + Age (adj)': combo_r,
            'HIV + Age (Bonf)': combo_b}



In [25]:

    
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1









    Out[25]:






  
    
      
      False
      True
    
  
  
    
      Age (BH)
      451748
      21296
    
    
      Age (Bonf)
      463978
      9066
    
    
      Age (adj)
      441226
      21818
    
    
      HIV (BH)
      397314
      75730
    
    
      HIV (Bonf)
      455339
      17705
    
    
      HIV (adj)
      423348
      52164
    
    
      HIV + Age (BH)
      467413
      5631
    
    
      HIV + Age (Bonf)
      472713
      331
    
    
      HIV + Age (adj)
      458003
      5041

	False	True
Age (BH)	457710	15334
Age (Bonf)	467844	5200
Age (adj)	446957	16087
HIV (BH)	421899	51145
HIV (Bonf)	462149	10895
HIV (adj)	423348	52164
HIV + Age (BH)	467413	5631
HIV + Age (Bonf)	472713	331
HIV + Age (adj)	458003	5041

	False	True
Age (BH)	451748	21296
Age (Bonf)	463978	9066
Age (adj)	441226	21818
HIV (BH)	397314	75730
HIV (Bonf)	455339	17705
HIV (adj)	423348	52164
HIV + Age (BH)	467413	5631
HIV + Age (Bonf)	472713	331
HIV + Age (adj)	458003	5041