In [16]:
# Imports
import re
import os
import copy
import gzip
import errno
import numpy as np
import pandas as pd
try:
    # cPickle is Python 2 only; fall back to the plain pickle module on Python 3
    import cPickle as cp
except ImportError:
    import pickle as cp
import brainbox as bb
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as clh
In [17]:
# Paths
in_path = '/data1/abide/Out/Remote/some_failed/out/'
out_path = '/data1/abide/Test'
out_name = 'nondupl_linkage.gz'
if not os.path.isdir(out_path):
    try:
        os.makedirs(out_path)
    except OSError as exc:  # guard against a race between the isdir check and makedirs
        if exc.errno == errno.EEXIST and os.path.isdir(out_path):
            pass
        else:
            raise
metric = 'stability_maps'
file_dict = bb.fileOps.grab_files(in_path, '.nii.gz', metric)
In [18]:
# Duplicate dropper
def drop_duplicates(in_dict):
    """
    Drop subjects that occur more than once in the file dict.

    Works on a deep copy so the input dictionary is left untouched
    and returns the de-duplicated copy.
    """
    cp_dict = copy.deepcopy(in_dict)
    subs = cp_dict['sub_name']
    dirs = cp_dict['dir']
    path = cp_dict['path']
    drop = list()
    present = list()
    # Parse the numeric subject ID out of each file name
    sub_names = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group())
                          for sub_id in cp_dict['sub_name']])
    for idx, sub in enumerate(sub_names):
        if sub not in present:
            present.append(sub)
        else:
            drop.append(idx)
    print('Found {} items to drop'.format(len(drop)))
    # Pop them in reverse order so the remaining indices stay valid
    for idx in drop[::-1]:
        subs.pop(idx)
        dirs.pop(idx)
        path.pop(idx)
    return cp_dict
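# A minimal sketch of what the lookbehind regex above extracts; the example
# file stems below are made up. The pattern keeps the 5 digits that follow
# a 2-digit prefix, i.e. it strips the zero-padding from the subject code.
for example_id in ['0051456_session_1', 'sub_0050003']:
    print(example_id, '->', re.search(r'(?<=\d{2})\d{5}', example_id).group())
# 0051456_session_1 -> 51456
# sub_0050003 -> 50003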
In [19]:
# Visualizer
def show_netw(results, network, scale):
    """
    Show the covariate distributions per cluster for one network.
    Relies on the globals `data_subs` and `pheno_data` defined below.
    """
    distance, linkage = results[network - 1]
    part = clh.fcluster(linkage, scale, criterion='maxclust')
    # Pull up the subjects for one cluster at a time
    f = plt.figure(figsize=(10, 5 * (scale + 1)))
    for clust in np.arange(1, scale + 1):
        clust_subs = data_subs[part == clust]
        clust_pheno = pheno_data[part == clust]
        ax_cl = f.add_subplot(scale, 2, clust)
        ax_cl.set_xticks([])
        ax_cl.set_title('Cluster {}'.format(clust))
        lt, lb, rt, rb = bb.visuOps.add_four_grid(ax_cl, ticks=True,
                                                  titles=('age', 'sex', 'dx', 'fiq'))
        lt.hist(clust_pheno['AGE_AT_SCAN'].values)
        lb.hist(clust_pheno['SEX'].values, bins=2)
        rt.hist(clust_pheno['DX_GROUP'].values, bins=2)
        rb.hist(clust_pheno['FIQ'].values)
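# Example usage (sketch): inspect the clusters of network 1 at scale 3 once
# `results`, `data_subs` and `pheno_data` exist:
# show_netw(results, network=1, scale=3)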
In [20]:
# Loop the data
out_file = os.path.join(out_path, out_name)
if os.path.isfile(out_file):
    print('{} already exists, I will load things'.format(out_file))
    f = gzip.open(out_file, 'rb')
    in_data = cp.load(f)
    f.close()
    data_subs = in_data[0]
    results = in_data[1]
else:
    # Remove the duplicates from the file dict
    print('I will run things now')
    file_dict = drop_duplicates(file_dict)
    results = list()
    for network in np.arange(10):
        print('Loading network {}'.format(network))
        data_dict = bb.fileOps.read_files(file_dict, network=network, silence=True)
        # We only loaded one network so it is always the first one
        results.append(bb.dataOps.calc_link(data_dict, metric, network=0))
    # Write the results to disk for future use
    data_subs = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group())
                          for sub_id in file_dict['sub_name']])
    out = (data_subs, results)
    outF = gzip.open(out_file, 'wb')
    cp.dump(out, outF, protocol=2)
    outF.close()
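# Note (assumption): protocol 2 was presumably chosen so the pickle stays
# readable from both Python 2 and 3. Under Python 3 the load call may need
# an encoding hint for byte strings written by Python 2, e.g.:
# with gzip.open(out_file, 'rb') as f:
#     data_subs, results = cp.load(f, encoding='latin1')  # Python 3 only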
In [21]:
# Visualize the distance matrix and dendrogram
f = plt.figure(figsize=(10, 30))
for nw in np.arange(1, 11):
    distance, linkage = results[nw - 1]
    spl = f.add_subplot(5, 2, nw)
    spl.set_xticks([])
    spl.set_yticks([])
    # Dendrogram on top, distance matrix below it inside the same subplot
    subdend = bb.visuOps.add_subplot_axes(spl, [0, 0.71, 1, 0.29])
    submat = bb.visuOps.add_subplot_axes(spl, [0, 0, 1, 0.7])
    D1 = clh.dendrogram(linkage, ax=subdend)
    subdend.set_xticks([])
    D2 = submat.matshow(-distance, aspect='auto')
    submat.set_xticks([])
    spl.set_title('Network {}'.format(nw))
In [22]:
# Grab the phenotype data
pheno_path = '/home/surchs/Project/abide/pheno/pheno_full.csv'
pheno = pd.read_csv(pheno_path)
# Get the subject IDs of the pheno files I just read in
pheno_subs = pheno['SUB_ID']
# Find a mask of those pheno subs for which we have brain data
pheno_mask = pheno_subs.isin(data_subs)
# Get the correct pheno data
pheno_data = pheno[pheno_mask]
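# Sanity check (sketch): the boolean indexing below assumes pheno_data rows
# line up with data_subs one-to-one. If the CSV is ordered differently,
# reorder it first, e.g. pheno_data.set_index('SUB_ID').loc[data_subs].
assert len(pheno_data) == len(data_subs) and \
    np.all(pheno_data['SUB_ID'].values == data_subs), 'pheno and brain subjects are misaligned'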
In [23]:
# Define covariates of interest
cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']
In [24]:
# Initialize to scale sc
sc = 3
scales = np.ones(10, dtype=int) * sc
partitions = np.zeros((len(pheno_data), 10))
for n_id in range(10):
    distance, linkage = results[n_id]
    scale = scales[n_id]
    part = clh.fcluster(linkage, scale, criterion='maxclust')
    partitions[:, n_id] = part
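# Quick look (sketch): report the cluster sizes per network at this scale so
# empty or tiny clusters stand out before the covariate comparisons below.
for n_id in range(10):
    sizes = np.bincount(partitions[:, n_id].astype(int))[1:]
    print('Network {}: cluster sizes {}'.format(n_id + 1, sizes))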
In [25]:
# Compare between clusters
cov = cov_interest[1]
reps = int(1e5)
#for n_id in range(10):
n_id = 1
part = partitions[:, n_id]
scale = int(np.max(part))
dist_mat = np.zeros((scale, scale))
p_mat = np.zeros((scale, scale))
# Get the index pairs of the upper triangle so we do each comparison only once
# (np.triu_indices returns the row indices first, then the column indices)
rows, cols = np.triu_indices(scale, 1)
In [26]:
for idx in range(len(rows)):
    row = rows[idx]
    col = cols[idx]
    clus_a = row + 1
    clus_b = col + 1
    # Notification
    print('\nRunning clust {} and clust {}'.format(clus_a, clus_b))
    # Get cluster a
    sub_a = data_subs[part == clus_a]
    pheno_a = pheno_data[part == clus_a]
    var_a = pheno_a[cov].values
    # Get cluster b
    sub_b = data_subs[part == clus_b]
    pheno_b = pheno_data[part == clus_b]
    var_b = pheno_b[cov].values
    # Compute the distance as the difference of the cluster means
    dis = np.mean(var_a) - np.mean(var_b)
    dist_mat[row, col] = dis
    # Pool the two clusters
    com_a_b = np.concatenate((var_a, var_b))
    # Resample stuff
    res = np.empty(reps)
    count = bb.tools.Counter(len(res))
    for i in range(len(res)):
        count.tic()
        tmp = np.random.permutation(com_a_b)
        # Slice out new randomized labels for both clusters
        tmp_a = tmp[:len(var_a)]
        tmp_b = tmp[len(var_a):]
        # Take the mean of the distance (could go KGS on this)
        tmp_dist = np.mean(tmp_a) - np.mean(tmp_b)
        res[i] = tmp_dist
        count.toc()
        count.progress()
    # Compute the p-value for the distance
    p_mat[row, col] = (np.sum(np.abs(res) > np.abs(dis)) + 1) / float(reps + 1)
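# The loop above is easy to read but slow in pure Python. A minimal
# vectorized sketch of the same two-sided permutation test (a hypothetical
# helper, not part of brainbox): each row of argsorted random keys yields
# one permutation of the pooled sample. Memory grows with reps times the
# pooled size, so lower reps if it gets too large.
def perm_test(var_a, var_b, reps=10000):
    pooled = np.concatenate((var_a, var_b))
    n_a = len(var_a)
    obs = np.mean(var_a) - np.mean(var_b)
    # (reps, n) integer array; each row is a random permutation of 0..n-1
    order = np.argsort(np.random.rand(reps, len(pooled)), axis=1)
    perms = pooled[order]
    null = perms[:, :n_a].mean(axis=1) - perms[:, n_a:].mean(axis=1)
    p_val = (np.sum(np.abs(null) > np.abs(obs)) + 1) / float(reps + 1)
    return obs, p_val
# Example usage: obs, p = perm_test(var_a, var_b, reps=int(1e4))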
In [ ]: