Compare pheno profiles of clusters


In [16]:
# Imports
import re
import os
import copy
import gzip
import errno
import numpy as np
import pandas as pd
import cPickle as cp
import brainbox as bb
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as clh

Load files and screen for duplicates


In [17]:
# Paths
in_path = '/data1/abide/Out/Remote/some_failed/out/'
out_path = '/data1/abide/Test'
out_name = 'nondupl_linkage.gz'
if not os.path.isdir(out_path):
    try:
        os.makedirs(out_path)
    except OSError as exc: # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(out_path):
            pass
        else: raise

metric = 'stability_maps'
file_dict = bb.fileOps.grab_files(in_path, '.nii.gz', metric)


I will be pulling files from /data1/abide/Out/Remote/some_failed/out/stability_maps

In [18]:
# Duplicate dropper
def drop_duplicates(in_dict):
    """
    Because python uses pointers and does not copy the variables
    I can operate directly on the dictionary and change it in place
    """
    cp_dict = copy.deepcopy(in_dict)
    subs = cp_dict['sub_name']
    dirs = cp_dict['dir']
    path = cp_dict['path']
    drop = list()
    present = list()
    sub_names = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group()) for sub_id in cp_dict['sub_name']])
    for idx, sub in enumerate(sub_names):
        if sub not in present:
            present.append(sub)
        else:
            drop.append(idx)
    print('Found {} items to drop'.format(len(drop)))
    # Pop them in reverse order
    for idx in drop[::-1]:
        subs.pop(idx)
        dirs.pop(idx)
        path.pop(idx)
    
    return cp_dict
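
A quick sanity check of the ID parsing above; the example sub_name entries are made up for illustration:

example_subs = ['0051234_session_1', '0051234_session_2', '0050987_session_1']
ids = [np.int64(re.search(r'(?<=\d{2})\d{5}', s).group()) for s in example_subs]
print(ids)  # [51234, 51234, 50987] -> the first two would be flagged as duplicates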

In [19]:
# Visualizer
def show_netw(results, network, scale):
    # Note: relies on the globals data_subs and pheno_data defined later in this notebook
    distance, linkage = results[network - 1]
    part = clh.fcluster(linkage, scale, criterion='maxclust')

    # Define covariates of interest
    cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']

    # Pull up the subjects for one cluster
    f = plt.figure(figsize=(10,5*(scale + 1)))

    for clust in np.arange(1,scale + 1):
        clust_subs = data_subs[part == clust]
        clust_pheno = pheno_data[part == clust]
        ax_cl = f.add_subplot(scale, 2, clust)
        ax_cl.set_xticks([])
        ax_cl.set_title('Cluster {}'.format(clust))

        lt, lb, rt, rb = bb.visuOps.add_four_grid(ax_cl, ticks=True, titles=('age', 'sex', 'dx', 'fiq'))
        lt.hist(clust_pheno['AGE_AT_SCAN'].values)
        lb.hist(clust_pheno['SEX'].values, bins=2)
        rt.hist(clust_pheno['DX_GROUP'].values, bins=2)
        rb.hist(clust_pheno['FIQ'].values)
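
For reference, once results, data_subs and pheno_data are defined in the cells below, the visualizer can be called like this (the network and scale values are just an example):

# Pheno profiles of a 3-cluster solution of network 1
show_netw(results, network=1, scale=3)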

In [20]:
# Loop the data
out_file = os.path.join(out_path, out_name)
if os.path.isfile(out_file):
    print('{} already exists, I will load things'.format(out_file))
    f = gzip.open(out_file, 'rb')
    in_data = cp.load(f)
    data_subs = in_data[0]
    results = in_data[1]
    
else:
    # Remove the duplicates from the file dict
    print('I will run things now')
    file_dict = drop_duplicates(file_dict)

    results = list()
    for network in np.arange(10):
        print('Loading network {}'.format(network))
        data_dict = bb.fileOps.read_files(file_dict, network=network, silence=True)
        # We only loaded one network so it is always the first one
        results.append(bb.dataOps.calc_link(data_dict, metric, network=0))
    # Write the results to disk for future use
    data_subs = np.array([np.int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group()) for sub_id in file_dict['sub_name']])
    out = (data_subs, results)
    outF = gzip.open(out_file, 'wb')
    cp.dump(out, outF, protocol=2)
    outF.close()


/data1/abide/Test/nondupl_linkage.gz already exists, I will load things

Visualize the data of all networks for this metric


In [21]:
# Visualize the distance matrix and dendrogram
f = plt.figure(figsize=(10,30))
for nw in np.arange(1,11):
    distance, linkage = results[nw-1]
    spl = f.add_subplot(5,2,nw)
    spl.set_xticks([])
    spl.set_yticks([])
    
    subdend = bb.visuOps.add_subplot_axes(spl, [0, 0.71, 1, 0.29])
    submat = bb.visuOps.add_subplot_axes(spl, [0, 0, 1, 0.7])
    D1 = clh.dendrogram(linkage, ax=subdend)
    subdend.set_xticks([])

    D2 = submat.matshow(-distance, aspect='auto')
    submat.set_xticks([])
    
    spl.set_title('Network {}'.format(nw))


Load pheno data


In [22]:
# Grab the phenotype data
pheno_path = '/home/surchs/Project/abide/pheno/pheno_full.csv'
pheno = pd.read_csv(pheno_path)
# Get the subject IDs of the pheno files I just read in
pheno_subs = pheno['SUB_ID']
# Find a mask of those pheno subs for which we have brain data
pheno_mask = pheno_subs.isin(data_subs)
# Get the correct pheno data
pheno_data = pheno[pheno_mask]
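
Note that pheno[pheno_mask] keeps the row order of the csv while the partitions below follow the order of data_subs, so the boolean indexing used later assumes the two orders agree. A minimal sketch to enforce the alignment explicitly (pheno_aligned is a hypothetical name, and this assumes every entry of data_subs occurs exactly once in pheno['SUB_ID']):

# Reorder the pheno rows to follow data_subs explicitly
pheno_aligned = pheno.set_index('SUB_ID').loc[data_subs].reset_index()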

In [23]:
# Define covariates of interest
cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']

Make a subject-level partition of each network and compare between clusters


In [24]:
# Initialize to scale sc
sc = 3
scales = np.ones(10, dtype=int) * sc
# One column per network; rows follow the subject order of data_subs,
# which is assumed to match the row order of pheno_data
partitions = np.zeros((len(pheno_data), 10))
for n_id in range(10):
    distance, linkage = results[n_id]
    scale = scales[n_id]
    part = clh.fcluster(linkage, scale, criterion='maxclust')
    partitions[:, n_id] = part
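
A quick sanity check on the result: the cluster sizes, shown here for the first network.

# Cluster sizes for network 1 (column 0); label 0 is unused, hence the slice
print(np.bincount(partitions[:, 0].astype(int))[1:])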

In [25]:
# Compare between clusters
cov = cov_interest[1]  # AGE_AT_SCAN
reps = int(1e5)
#for n_id in range(10):
n_id = 1
part = partitions[:, n_id]
scale = int(np.max(part))
dist_mat = np.zeros((scale, scale))
p_mat = np.zeros((scale, scale))
# Get coordinate pairs for the upper triangle so we do each comparison only once
rows, cols = np.triu_indices(scale, 1)

In [26]:
for idx in range(len(rows)):
    row = rows[idx]
    col = cols[idx]
    clus_a = row + 1
    clus_b = col + 1
    # Notification
    print('\nRunning clust {} and clust {}'.format(clus_a, clus_b))
    # Get cluster a 
    sub_a = data_subs[part == clus_a]
    pheno_a = pheno_data[part == clus_a]
    var_a = pheno_a[cov].values
    # Get cluster b
    sub_b = data_subs[part == clus_b]
    pheno_b = pheno_data[part == clus_b]
    var_b = pheno_b[cov].values
    # Compute the distance
    dis = np.mean(var_a) - np.mean(var_b)
    dist_mat[row, col] = dis
    # Pool the two clusters for the permutation null
    com_a_b = np.concatenate((var_a[..., None], var_b[..., None]), axis=0)
    # Resample stuff
    res = np.empty(reps)
    count = bb.tools.Counter(len(res))
    for i in range(len(res)):
        count.tic()
        tmp = np.random.permutation(com_a_b)
        # Slice out new randomized labels for both clusters
        tmp_a = tmp[:len(var_a)]
        tmp_b = tmp[len(var_a):]
        # Difference of the means under permuted labels (could go KGS on this)
        tmp_dist = np.mean(tmp_a) - np.mean(tmp_b)
        res[i] = tmp_dist
        count.toc()
        count.progress()
    # Compute the p-value for the distance
    p_mat[row, col] = (np.sum(np.abs(res) > np.abs(dis)) + 1) / float(reps + 1)


Running clust 1 and clust 2
 100.0 % done 0.00 seconds to go. One step takes 0.00059 and we ran for 86.92 so far
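
The pure Python loop above is the bottleneck (about 0.6 ms per draw, close to 90 s for the 1e5 permutations). A vectorized sketch of the same null distribution, assuming var_a and var_b are 1-D numeric arrays and there is enough memory for a (reps, n_subjects) array:

# Draw all permutations at once: argsort of random numbers gives
# one independent shuffle of the pooled sample per row
pooled = np.concatenate((var_a, var_b))
idx = np.argsort(np.random.rand(reps, len(pooled)), axis=1)
shuffled = pooled[idx]
null = shuffled[:, :len(var_a)].mean(axis=1) - shuffled[:, len(var_a):].mean(axis=1)
p_val = (np.sum(np.abs(null) > np.abs(dis)) + 1) / float(reps + 1)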

In [ ]: