Creating Feature Sets for HIV and Age


In [1]:
import os 
if os.getcwd().endswith('Parallel'):
    os.chdir('..')

In [2]:
import NotebookImport
from HIV_Age_Advancement import *


importing IPython notebook from HIV_Age_Advancement
importing IPython notebook from Setup/Imports
Populating the interactive namespace from numpy and matplotlib
importing IPython notebook from Setup/MethylationAgeModels
importing IPython notebook from Setup/Read_HIV_Data

In [3]:
def size_normalize(vec, order, s=10001):
    p = int(np.ceil(s / 2.) - 1)
    rank = lambda v: np.mean(v < v[p])
    vec = vec.ix[order.order().index].dropna()
    rolling_rank = pd.rolling_apply(vec, s, rank, center=True)
    return rolling_rank

Read in HIV Case Control Models


In [4]:
outdir = '/cellar/users/agross/Data/tmp'
p = outdir + '/hiv_consented/'
def read_models(suffix):
    df = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for 
                    f in os.listdir(p) if f.endswith('{}.csv'.format(suffix))])
    return df

In [5]:
r4 = read_models('hiv_age_cc')

In [6]:
gg = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
gg.value_counts()


Out[6]:
False    391683
True      81361
dtype: int64

Read in Age Models

  • Here we went and ran this notebook
  • Now we need to unpack the data and combine it together

In [7]:
outdir = '/cellar/users/agross/Data/tmp'
tables = ['in_set_s1','in_set_s2','in_set_s3']

res = {}
for tp in tables:
    p = outdir + '/' + tp + '/'
    res[tp] = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for 
                    f in os.listdir(p) if f.endswith('age_out.csv')])
res = pd.concat(res, axis=1)

p_vals = res.xs('age_LR', axis=1, level=1).xs('p', axis=1, level=1)
p_vals = p_vals.ix[probe_idx].dropna()
len(p_vals)


Out[7]:
473044

In [8]:
def disorder_frac(direction, beta, test_set):
    ct = pd.crosstab(beta < .5, direction.ix[test_set]>0)
    return (1.*ct[1][1] + ct[0][0]) / ct.sum().sum()

Nested Screen for Age Associated Probes

  • First pass is Hannum et al. dataset
  • Next we look in the probes passing stage 1 for probes in the EPIC dataset
  • The two tiers is for power, we only need to correct in stage II for the number of probes passing stage I
  • I'm using a Benjamini-Hochberg adusted p-value cutoff of 0.01

In [9]:
rr = ti(bhCorrection(p_vals.in_set_s1) < .01)
len(rr)


Out[9]:
61592

In [10]:
v = p_vals.in_set_s3.ix[rr]
r2 = ti(bhCorrection(v) < .01)
len(r2)


Out[10]:
26927

In [11]:
rr = bhCorrection(p_vals.in_set_s1) < .01
rr = rr + pd.Series(1, index=r2).ix[rr.index].fillna(0)
rr.value_counts()


Out[11]:
0    411452
1     34665
2     26927
dtype: int64

Size Normalize Datasets


In [12]:
mm = df_hiv.ix[:, ti(hiv == 'HIV-')].mean(1)

In [13]:
hiv_rolled = size_normalize(r4.HIV_LR.p, mm)
age_rolled = size_normalize(p_vals.in_set_s1, mm)

Form Feature Sets


In [14]:
g_age = bhCorrection(p_vals.in_set_s1.ix[probe_idx]) < .01
g_age_2 = pd.Series(True, r2).ix[probe_idx].fillna(False)
g_age = g_age_2
g_hiv = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
combo = combine(g_age, g_hiv) == 'both'

In [15]:
#Do not import
fisher_exact_test(g_age, g_hiv)


Out[15]:
odds_ratio    1.29e+00
p             3.88e-59
dtype: float64

In [16]:
combo.value_counts()


Out[16]:
False    467413
True       5631
dtype: int64

Feature sets from rolling statistics


In [17]:
g_hiv_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv.sum()
g_age_r = age_rolled.rank(ascending=True).dropna() < g_age.sum()

Nominal significance thresholds


In [18]:
g_hiv_nom = r4.HIV_LR.p < .05
g_age_nom = p_vals.in_set_s1 < .05

Nominal significance thresholds: rolled


In [19]:
hiv_nom_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv_nom.sum()
age_nom_r = age_rolled.rank(ascending=True).dropna() < g_age_nom.sum()

Bonferroni adjusted thresholds


In [20]:
g_hiv_b = (r4.HIV_LR.ix[probe_idx].p * len(r4)) < .01
g_age_b = (p_vals.in_set_s1.ix[probe_idx] * len(p_vals)) < .01

Combinations


In [21]:
combo = combine(g_age, g_hiv) == 'both'
combo_b = combine(g_hiv_b, g_age_b) == 'both'
combo_r = combine(g_age_r, g_hiv_r) == 'both'

In [22]:
features = {'HIV (BH)': g_hiv & (g_age_nom == False), 
            'HIV (adj)': g_hiv_r & (age_nom_r == False), 
            'HIV (Bonf)': g_hiv_b & (g_age_nom == False), 
            'Age (BH)': g_age_2 & (g_hiv_nom == False), 
            'Age (adj)': g_age_r & (hiv_nom_r == False),
            'Age (Bonf)': g_age_b & (g_hiv_nom == False),
            'HIV + Age (BH)': combo, 
            'HIV + Age (adj)': combo_r,
            'HIV + Age (Bonf)': combo_b}

In [23]:
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1


Out[23]:
False True
Age (BH) 457710 15334
Age (Bonf) 467844 5200
Age (adj) 446957 16087
HIV (BH) 421899 51145
HIV (Bonf) 462149 10895
HIV (adj) 423348 52164
HIV + Age (BH) 467413 5631
HIV + Age (Bonf) 472713 331
HIV + Age (adj) 458003 5041

Second pass to simplify things...


In [24]:
features = {'HIV (BH)': g_hiv & (g_age_2 == False), 
            'HIV (adj)': g_hiv_r & (age_nom_r == False), 
            'HIV (Bonf)': g_hiv_b & (g_age_b == False), 
            'Age (BH)': g_age_2 & (g_hiv == False), 
            'Age (adj)': g_age_r & (g_hiv == False),
            'Age (Bonf)': g_age_b & (g_hiv_b == False),
            'HIV + Age (BH)': combo, 
            'HIV + Age (adj)': combo_r,
            'HIV + Age (Bonf)': combo_b}

In [25]:
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1


Out[25]:
False True
Age (BH) 451748 21296
Age (Bonf) 463978 9066
Age (adj) 441226 21818
HIV (BH) 397314 75730
HIV (Bonf) 455339 17705
HIV (adj) 423348 52164
HIV + Age (BH) 467413 5631
HIV + Age (Bonf) 472713 331
HIV + Age (adj) 458003 5041