In [1]:
import os
if os.getcwd().endswith('Parallel'):
os.chdir('..')
In [2]:
import NotebookImport
from HIV_Age_Advancement import *
In [3]:
def size_normalize(vec, order, s=10001):
p = int(np.ceil(s / 2.) - 1)
rank = lambda v: np.mean(v < v[p])
vec = vec.ix[order.order().index].dropna()
rolling_rank = pd.rolling_apply(vec, s, rank, center=True)
return rolling_rank
In [4]:
outdir = '/cellar/users/agross/Data/tmp'
p = outdir + '/hiv_consented/'
def read_models(suffix):
df = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for
f in os.listdir(p) if f.endswith('{}.csv'.format(suffix))])
return df
In [5]:
r4 = read_models('hiv_age_cc')
In [6]:
gg = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
gg.value_counts()
Out[6]:
In [7]:
outdir = '/cellar/users/agross/Data/tmp'
tables = ['in_set_s1','in_set_s2','in_set_s3']
res = {}
for tp in tables:
p = outdir + '/' + tp + '/'
res[tp] = pd.concat([pd.read_csv(p + f, index_col=0, header=[0,1]) for
f in os.listdir(p) if f.endswith('age_out.csv')])
res = pd.concat(res, axis=1)
p_vals = res.xs('age_LR', axis=1, level=1).xs('p', axis=1, level=1)
p_vals = p_vals.ix[probe_idx].dropna()
len(p_vals)
Out[7]:
In [8]:
def disorder_frac(direction, beta, test_set):
ct = pd.crosstab(beta < .5, direction.ix[test_set]>0)
return (1.*ct[1][1] + ct[0][0]) / ct.sum().sum()
In [9]:
rr = ti(bhCorrection(p_vals.in_set_s1) < .01)
len(rr)
Out[9]:
In [10]:
v = p_vals.in_set_s3.ix[rr]
r2 = ti(bhCorrection(v) < .01)
len(r2)
Out[10]:
In [11]:
rr = bhCorrection(p_vals.in_set_s1) < .01
rr = rr + pd.Series(1, index=r2).ix[rr.index].fillna(0)
rr.value_counts()
Out[11]:
In [12]:
mm = df_hiv.ix[:, ti(hiv == 'HIV-')].mean(1)
In [13]:
hiv_rolled = size_normalize(r4.HIV_LR.p, mm)
age_rolled = size_normalize(p_vals.in_set_s1, mm)
In [14]:
g_age = bhCorrection(p_vals.in_set_s1.ix[probe_idx]) < .01
g_age_2 = pd.Series(True, r2).ix[probe_idx].fillna(False)
g_age = g_age_2
g_hiv = bhCorrection(r4.HIV_LR.p.ix[probe_idx]) < .01
combo = combine(g_age, g_hiv) == 'both'
In [15]:
#Do not import
fisher_exact_test(g_age, g_hiv)
Out[15]:
In [16]:
combo.value_counts()
Out[16]:
Feature sets from rolling statistics
In [17]:
g_hiv_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv.sum()
g_age_r = age_rolled.rank(ascending=True).dropna() < g_age.sum()
Nominal significance thresholds
In [18]:
g_hiv_nom = r4.HIV_LR.p < .05
g_age_nom = p_vals.in_set_s1 < .05
Nominal significance thresholds: rolled
In [19]:
hiv_nom_r = hiv_rolled.rank(ascending=True).dropna() < g_hiv_nom.sum()
age_nom_r = age_rolled.rank(ascending=True).dropna() < g_age_nom.sum()
Bonferroni adjusted thresholds
In [20]:
g_hiv_b = (r4.HIV_LR.ix[probe_idx].p * len(r4)) < .01
g_age_b = (p_vals.in_set_s1.ix[probe_idx] * len(p_vals)) < .01
Combinations
In [21]:
combo = combine(g_age, g_hiv) == 'both'
combo_b = combine(g_hiv_b, g_age_b) == 'both'
combo_r = combine(g_age_r, g_hiv_r) == 'both'
In [22]:
features = {'HIV (BH)': g_hiv & (g_age_nom == False),
'HIV (adj)': g_hiv_r & (age_nom_r == False),
'HIV (Bonf)': g_hiv_b & (g_age_nom == False),
'Age (BH)': g_age_2 & (g_hiv_nom == False),
'Age (adj)': g_age_r & (hiv_nom_r == False),
'Age (Bonf)': g_age_b & (g_hiv_nom == False),
'HIV + Age (BH)': combo,
'HIV + Age (adj)': combo_r,
'HIV + Age (Bonf)': combo_b}
In [23]:
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1
Out[23]:
Second pass to simplify things...
In [24]:
features = {'HIV (BH)': g_hiv & (g_age_2 == False),
'HIV (adj)': g_hiv_r & (age_nom_r == False),
'HIV (Bonf)': g_hiv_b & (g_age_b == False),
'Age (BH)': g_age_2 & (g_hiv == False),
'Age (adj)': g_age_r & (g_hiv == False),
'Age (Bonf)': g_age_b & (g_hiv_b == False),
'HIV + Age (BH)': combo,
'HIV + Age (adj)': combo_r,
'HIV + Age (Bonf)': combo_b}
In [25]:
c1 = pd.DataFrame({i:v.value_counts() for i,v in features.iteritems()}).T
c1
Out[25]: