In [1]:
# Imports
import re
import os
import copy
import gzip
import errno
import numpy as np
import pandas as pd
try:
    import cPickle as cp  # Python 2
except ImportError:
    import pickle as cp  # Python 3 fallback
import brainbox as bb
import multiprocessing as mp
from scipy import stats as st
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as clh
from mpl_toolkits.axes_grid1 import make_axes_locatable
In [2]:
# Paths
in_path = '/data1/abide/Out/Remote/some_failed/out/'
out_path = '/data1/abide/Test'
out_name = 'nondupl_linkage.gz'
if not os.path.isdir(out_path):
    try:
        os.makedirs(out_path)
    except OSError as exc:
        # Guard against a race condition: the directory may have been
        # created between the isdir check and makedirs (Python > 2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(out_path):
            pass
        else:
            raise
metric = 'stability_maps'
file_dict = bb.fileOps.grab_files(in_path, '.nii.gz', metric)
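Judging from how it is indexed below, grab_files returns a dictionary of parallel lists with at least the keys 'sub_name', 'dir' and 'path'. A purely hypothetical illustration of that shape (made-up values, not actual output):
# Hypothetical example of the expected structure, for orientation only
file_dict_example = {
    'sub_name': ['0051234_session_1', '0051234_session_1', '0051456_session_1'],
    'dir': ['site_a', 'site_a', 'site_b'],
    'path': ['/tmp/example/stability_maps.nii.gz'] * 3,
}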
In [3]:
# Duplicate dropper
def drop_duplicates(in_dict):
    """
    Return a deep copy of the input dictionary with the entries of
    duplicate subject IDs removed (only the first occurrence of
    each subject is kept)
    """
    cp_dict = copy.deepcopy(in_dict)
    subs = cp_dict['sub_name']
    dirs = cp_dict['dir']
    path = cp_dict['path']
    drop = list()
    present = list()
    # Extract the 5-digit numerical subject ID (skipping the 2-digit prefix)
    sub_names = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group()) for sub_id in cp_dict['sub_name']])
    for idx, sub in enumerate(sub_names):
        if sub not in present:
            present.append(sub)
        else:
            drop.append(idx)
    print('Found {} items to drop'.format(len(drop)))
    # Pop them in reverse order so earlier indices stay valid
    for idx in drop[::-1]:
        subs.pop(idx)
        dirs.pop(idx)
        path.pop(idx)
    return cp_dict
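The same first-occurrence filter can also be written with np.unique, which reports the index at which each distinct ID first appears. A minimal sketch under the same assumptions (every per-subject field in the dictionary is a plain list):
def drop_duplicates_unique(in_dict):
    # Parse the numerical subject IDs exactly as above
    ids = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', s).group())
                    for s in in_dict['sub_name']])
    # Index of the first occurrence of each distinct ID
    _, first = np.unique(ids, return_index=True)
    keep = sorted(first)
    return {key: [val[i] for i in keep] if isinstance(val, list) else val
            for key, val in in_dict.items()}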
In [4]:
# Visualizer
def show_netw(results, network, scale):
    # Note: relies on the global variables data_subs and pheno_data
    distance, linkage = results[network - 1]
    part = clh.fcluster(linkage, scale, criterion='maxclust')
    # Define covariates of interest
    cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']
    # Pull up the subjects for one cluster
    f = plt.figure(figsize=(10, 5 * (scale + 1)))
    for clust in np.arange(1, scale + 1):
        clust_subs = data_subs[part == clust]
        clust_pheno = pheno_data[part == clust]
        ax_cl = f.add_subplot(scale, 2, clust)
        ax_cl.set_xticks([])
        ax_cl.set_title('Cluster {}'.format(clust))
        lt, lb, rt, rb = bb.visuOps.add_four_grid(ax_cl, ticks=True, titles=('age', 'sex', 'dx', 'fiq'))
        lt.hist(clust_pheno['AGE_AT_SCAN'].values)
        lb.hist(clust_pheno['SEX'].values, bins=2)
        rt.hist(clust_pheno['DX_GROUP'].values, bins=2)
        rb.hist(clust_pheno['FIQ'].values)
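Once the linkage results and pheno variables below are in place, the function gives a quick per-network view, e.g. a 3-cluster solution of the first network:
# Example call; assumes results, data_subs and pheno_data are already defined
show_netw(results, network=1, scale=3)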
In [5]:
# Loop the data
out_file = os.path.join(out_path, out_name)
if os.path.isfile(out_file):
    print('{} already exists, I will load things'.format(out_file))
    f = gzip.open(out_file, 'rb')
    in_data = cp.load(f)
    f.close()
    data_subs = in_data[0]
    results = in_data[1]
else:
    # Remove the duplicates from the file dict
    print('I will run things now')
    file_dict = drop_duplicates(file_dict)
    results = list()
    for network in np.arange(10):
        print('Loading network {}'.format(network))
        data_dict = bb.fileOps.read_files(file_dict, network=network, silence=True)
        # We only loaded one network so it is always the first one
        results.append(bb.dataOps.calc_link(data_dict, metric, network=0))
    # Write the results to disk for future use
    data_subs = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group()) for sub_id in file_dict['sub_name']])
    out = (data_subs, results)
    outF = gzip.open(out_file, 'wb')
    cp.dump(out, outF, protocol=2)
    outF.close()
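In a later session the cached tuple can be reloaded with the same gzip + pickle pair; a minimal sketch:
with gzip.open(out_file, 'rb') as f:
    data_subs, results = cp.load(f)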
In [73]:
# Visualize the distance matrix and dendrogram
f = plt.figure(figsize=(10, 30))
for nw in np.arange(1, 11):
    distance, linkage = results[nw - 1]
    spl = f.add_subplot(5, 2, nw)
    spl.set_xticks([])
    spl.set_yticks([])
    subdend = bb.visuOps.add_subplot_axes(spl, [0, 0.71, 1, 0.29])
    submat = bb.visuOps.add_subplot_axes(spl, [0, 0, 1, 0.7])
    D1 = clh.dendrogram(linkage, ax=subdend, color_threshold=500)
    subdend.set_xticks([])
    D2 = submat.matshow(-distance, aspect='auto')
    submat.set_xticks([])
    spl.set_title('Network {}'.format(nw))
In [6]:
# Grab the phenotype data
pheno_path = '/home/surchs/Project/abide/pheno/pheno_full.csv'
pheno = pd.read_csv(pheno_path)
# Get the subject IDs from the pheno file I just read in
pheno_subs = pheno['SUB_ID']
# Find a mask of those pheno subs for which we have brain data
pheno_mask = pheno_subs.isin(data_subs)
# Get the correct pheno data
pheno_data = pheno[pheno_mask]
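Note that boolean masking keeps the pheno rows in csv order, which is not guaranteed to match the order of data_subs. If the rows need to line up one-to-one with the brain data (as the positional masks below assume), the table can be reordered explicitly; a minimal sketch, assuming every entry of data_subs has exactly one pheno row:
# Hypothetical reordering to match the row order of data_subs
pheno_data = pheno_data.set_index('SUB_ID').loc[data_subs].reset_index()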
In [7]:
# Define covariates of interest
cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']
In [8]:
def permute(args):
    # Note: reps is read from the global scope of the parent process
    col, row, data_subs, pheno_data, part, cov = args
    clus_a = col + 1
    clus_b = row + 1
    # Get values
    scale = np.max(part)
    dist_mat = np.zeros((scale, scale))
    p_mat = np.zeros((scale, scale))
    # Get cluster a
    sub_a = data_subs[part == clus_a]
    pheno_a = pheno_data[part == clus_a]
    var_a = pheno_a[cov].values
    # Get cluster b
    sub_b = data_subs[part == clus_b]
    pheno_b = pheno_data[part == clus_b]
    var_b = pheno_b[cov].values
    # Compute the distance
    dis = np.mean(var_a) - np.mean(var_b)
    dist_mat[col, row] = np.abs(dis)
    # Concatenate the two
    com_a_b = np.concatenate((var_a[..., None], var_b[..., None]), axis=0)
    # Resample stuff
    res = np.empty(int(reps))
    for i in range(len(res)):
        tmp = np.random.permutation(com_a_b)
        # Slice out new randomized labels for both clusters
        tmp_a = tmp[:len(var_a)]
        tmp_b = tmp[len(var_a):]
        # Take the mean of the distance (could go KS on this)
        tmp_dist = np.mean(tmp_a) - np.mean(tmp_b)
        res[i] = tmp_dist
    # Compute the p-value for the distance
    p_mat[col, row] = (np.sum(np.abs(res) > np.abs(dis)) + 1) / float(reps + 1)
    return dist_mat, p_mat
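The permutation logic can be sanity-checked on synthetic data where the null hypothesis is true by construction; with two samples from the same distribution the p-value should be large most of the time. A standalone sketch:
rng = np.random.RandomState(0)
toy_a = rng.normal(0, 1, 50)
toy_b = rng.normal(0, 1, 60)
obs = np.mean(toy_a) - np.mean(toy_b)
pooled = np.concatenate((toy_a, toy_b))
null = np.empty(1000)
for i in range(len(null)):
    perm = rng.permutation(pooled)
    null[i] = np.mean(perm[:len(toy_a)]) - np.mean(perm[len(toy_a):])
# Same two-sided p-value with the +1 correction as in permute()
p = (np.sum(np.abs(null) > np.abs(obs)) + 1) / float(len(null) + 1)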
In [ ]:
def test(args):
    col, row, data_subs, pheno_data, part, cov = args
    clus_a = col + 1
    clus_b = row + 1
    # Get values
    scale = np.max(part)
    p_mat = np.zeros((scale, scale))
    # Get cluster a
    pheno_a = pheno_data[part == clus_a]
    var_a = pheno_a[cov].values
    # Get cluster b
    pheno_b = pheno_data[part == clus_b]
    var_b = pheno_b[cov].values
    # KS test (Kolmogorov-Smirnov) on the two distributions
    ks, p = st.ks_2samp(var_a, var_b)
    p_mat[col, row] = p
    return p_mat
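Unlike the mean-difference permutation above, ks_2samp compares the full empirical distributions, so it can pick up differences in spread or shape even when the means match. A quick synthetic check:
rng = np.random.RandomState(1)
same_mean_a = rng.normal(0, 1, 200)
same_mean_b = rng.normal(0, 3, 200)  # same mean, larger spread
ks, p = st.ks_2samp(same_mean_a, same_mean_b)
# p should be small although the mean difference is near zero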
In [9]:
# Initialize the clustering scale (number of clusters) per network
sc = 3
#scales = np.ones(10)*sc
scales = np.array([6, 7, 5, 5, 9, 7, 9, 9, 11, 7])
partitions = np.zeros((len(pheno_data), 10))
dist_list = list()
pval_list = list()
cov = cov_interest[1]
reps = int(1e5)
for n_id in range(10):
    print('Running network {} now'.format(n_id))
    distance, linkage = results[n_id]
    scale = scales[n_id]
    part = clh.fcluster(linkage, scale, criterion='maxclust')
    partitions[:, n_id] = part
    # Use the number of clusters actually returned
    scale = np.max(part)
    # Get coordinate pairs for the upper triangle so we do each comparison only once
    cols, rows = np.triu_indices(scale, 1)
    arg_list = list()
    for idx in range(len(cols)):
        col = cols[idx]
        row = rows[idx]
        args = (col, row, data_subs, pheno_data, part, cov)
        arg_list.append(args)
    use_perc = 0.80
    ava = int(np.floor(use_perc * mp.cpu_count()))
    pool = mp.Pool(processes=ava)
    pll_res = pool.map(permute, arg_list)
    pool.close()
    pool.join()
    # Stack the stuff together again
    dist = np.zeros((scale, scale))
    pval = np.zeros((scale, scale))
    for result in pll_res:
        dist += result[0]
        pval += result[1]
    # Mirror the upper triangle into the lower triangle
    dist[rows, cols] = dist[cols, rows]
    pval[rows, cols] = pval[cols, rows]
    dist_list.append(dist)
    pval_list.append(pval)
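Each network involves scale * (scale - 1) / 2 tests, so a multiple-comparison pass may be worthwhile before eyeballing the maps below; a simple Bonferroni count as a sketch (alpha is an assumed nominal level, not part of the original analysis):
alpha = 0.05
for n_id, pv in enumerate(pval_list):
    iu = np.triu_indices_from(pv, 1)
    n_sig = np.sum(pv[iu] < alpha / len(iu[0]))
    print('Network {}: {}/{} pairs below the Bonferroni threshold'.format(n_id + 1, n_sig, len(iu[0])))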
In [ ]:
f = plt.figure(figsize=(16, 80))
for idx, n_id in enumerate(np.linspace(1, 19, 10).astype(int)):
    dist = np.abs(dist_list[idx])
    val_m = np.max(np.abs(dist))
    pval = pval_list[idx]
    ax = f.add_subplot(10, 2, n_id)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    mp1 = ax.matshow(dist, cmap=plt.cm.Blues, vmin=0, vmax=12)
    ax.set_title('distance network {}'.format(idx + 1))
    ax2 = f.add_subplot(10, 2, n_id + 1)
    divider = make_axes_locatable(ax2)
    cax2 = divider.append_axes("right", size="5%", pad=0.05)
    mp2 = ax2.matshow(-np.log10(pval), cmap=plt.cm.Purples, vmin=0, vmax=3)
    ax2.set_title('-log10 p-value network {}'.format(idx + 1))
    f.colorbar(mp1, cax=cax)
    plt.colorbar(mp2, cax=cax2)
In [147]:
np.linspace(1, 19, 10)
Out[147]:
array([  1.,   3.,   5.,   7.,   9.,  11.,  13.,  15.,  17.,  19.])
In [107]:
plt.hist(-np.log10(pval[cols, rows]), bins=20)
In [20]:
# Pull up the subjects for one cluster
f = plt.figure(figsize=(30, 10))
for clust in np.arange(1, 4):
    clust_subs = data_subs[part == clust]
    clust_pheno = pheno_data[part == clust]
    ax_cl = f.add_subplot(1, 3, clust)
    ax_cl.set_xticks([])
    ax_cl.set_title('Cluster {}'.format(clust))
    lt, lb, rt, rb = bb.visuOps.add_four_grid(ax_cl, ticks=True, titles=('age', 'sex', 'dx', 'fiq'))
    lt.hist(clust_pheno['AGE_AT_SCAN'].values)
    lb.hist(clust_pheno['SEX'].values)
    rt.hist(clust_pheno['DX_GROUP'].values)
    rb.hist(clust_pheno['FIQ'].values)
In [88]:
# Sanity check: each upper-triangle pair should be visited exactly once
a = np.zeros((scale, scale))
for idx in np.arange(len(cols)):
    a[cols[idx], rows[idx]] += 1
In [ ]:
a = 10