Generate cluster by cluster matrix

Load the precomputed linkage and then compute a cluster-by-cluster matrix of phenotype distances and p-values. For now, we have these metrics to choose from:

mean
median



In [7]:

    
# Imports
import re
import gzip
import numpy as np
import pandas as pd
import cPickle as cp
import brainbox as bb
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as clh

Load the precomputed linkage from disk



In [18]:

    
# Path to the gzip-compressed pickle that holds the precomputed linkage.
# NOTE(review): absolute, machine-specific location — adjust when running elsewhere.
linkage_path = '/data1/abide/Test/linkage.gz'



In [20]:

    
# Load the precomputed linkage.
# The context manager makes sure the gzip handle is closed once unpickling
# has finished (the previous version left the file open).
# NOTE(review): unpickling can execute arbitrary code — only load files that
# come from a trusted source.
with gzip.open(linkage_path, 'rb') as f:
    in_data = cp.load(f)



In [21]:

    
# Per-network results; show_netw unpacks each entry as (distance, linkage).
results = in_data[1]
# Subject IDs that belong to the brain data.
# The previous line read `in_data[]`, which is a syntax error.
# NOTE(review): index 0 is an assumption — confirm against the script that
# wrote the linkage file.
data_subs = in_data[0]

Visualize the structure of the data



In [65]:

    
# One figure, two side-by-side panels showing the first network's linkage
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: truncated view that keeps only the last 40 merged clusters
D1 = clh.dendrogram(
    results[0][1],
    p=40,
    truncate_mode='lastp',
    color_threshold=500,
    ax=ax1,
)
dump = ax1.set_xticks([])

# Right panel: the full, untruncated dendrogram
D1 = clh.dendrogram(results[0][1], color_threshold=500, ax=ax2)
dump = ax2.set_xticks([])

Choose the scale (the number of clusters) at which to partition this network



In [68]:

    
# Number of flat clusters to request from the hierarchy
scale = 3
# Cut the first network's linkage into at most `scale` clusters
part = clh.fcluster(results[0][1], t=scale, criterion='maxclust')

Load the phenotype data for the entire dataset and restrict it to the subjects that are in this sample



In [67]:

    
# Read the full phenotype table from disk
pheno_path = '/home/surchs/Project/abide/pheno/pheno_full.csv'
pheno = pd.read_csv(pheno_path)

# Subject IDs listed in the phenotype table
pheno_subs = pheno.loc[:, 'SUB_ID']

# True for every phenotype row whose subject also appears in the brain data
pheno_mask = pheno_subs.isin(data_subs)

# Keep only the phenotype rows of subjects we have brain data for
pheno_data = pheno.loc[pheno_mask]



In [69]:

    
# Shape of the first element of the first results entry
# (show_netw unpacks each entry as (distance, linkage)).
results[0][0].shape









    Out[69]:





(607, 607)



In [70]:

    
# Number of phenotype rows left after restricting to the subjects in data_subs.
# NOTE(review): in the recorded run this is 582 while the matrix above is
# 607 x 607 — presumably some subjects have no phenotype row, so pheno_data is
# not positionally aligned with the linkage; confirm.
len(pheno_data)









    Out[70]:





582



In [16]:



In [13]:



In [24]:

    
def show_netw(results, network, scale):
    """Plot per-cluster phenotype histograms for one network.

    The linkage of the requested network is cut into at most ``scale`` flat
    clusters. For every cluster a panel is added to a new figure and
    histograms of age, sex, diagnosis and FIQ are drawn into it.

    Parameters
    ----------
    results : sequence
        One ``(distance, linkage)`` pair per network.
    network : int
        1-based network number; ``results[network - 1]`` is used.
    scale : int
        Number of clusters passed to ``fcluster`` with
        ``criterion='maxclust'``.

    Notes
    -----
    Reads the notebook-level variables ``data_subs`` and ``pheno_data``.

    Phenotype rows are selected by ``SUB_ID`` rather than by position. The
    cluster labels have one entry per subject in the linkage, whereas
    ``pheno_data`` can have fewer rows (582 vs. 607 in the recorded run), so
    a positional boolean mask does not line up with the phenotype table.
    """
    _distance, linkage = results[network - 1]
    part = clh.fcluster(linkage, scale, criterion='maxclust')

    # Boolean-mask indexing below needs an array; a no-op if it already is one
    subs = np.asarray(data_subs)

    # One tall figure that holds a panel per cluster
    f = plt.figure(figsize=(10, 5 * (scale + 1)))

    for clust in np.arange(1, scale + 1):
        # Subject IDs that fell into this cluster
        clust_subs = subs[part == clust]
        # Match the phenotype rows through the subject ID, not the row position
        clust_pheno = pheno_data[pheno_data['SUB_ID'].isin(clust_subs)]

        ax_cl = f.add_subplot(scale, 2, clust)
        ax_cl.set_xticks([])
        ax_cl.set_title('Cluster {}'.format(clust))

        # Four sub-axes inside the cluster panel, one per phenotype variable
        lt, lb, rt, rb = bb.visuOps.add_four_grid(ax_cl, ticks=True, titles=('age', 'sex', 'dx', 'fiq'))
        lt.hist(clust_pheno['AGE_AT_SCAN'].values)
        lb.hist(clust_pheno['SEX'].values, bins=2)
        rt.hist(clust_pheno['DX_GROUP'].values, bins=2)
        rb.hist(clust_pheno['FIQ'].values)



In [ ]: