In [1]:
import re
import sys
import pandas as pd
import brainbox as bb
from matplotlib import pyplot as plt
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as clh

In [2]:
in_path = '/data1/abide/Out/Remote/some_failed/out'
out_path = '/data1/abide/Analysis/Remote/some_missing'
metric = 'stability_maps'

In [5]:
file_dict = bb.fileOps.grab_files(in_path, '.nii.gz', sub=metric)


I will be pulling files from /data1/abide/Out/Remote/some_failed/out/stability_maps

In [60]:
array_dict = bb.dataOps.read_files(file_dict)


I found 607 files to load.
 100.0 % done 0.00 seconds to go.
We are done

Compute the similarity of network metrics among the subjects for all network

This could be done in parallel if Marc gives me more memory


In [61]:
result_list = []
for n_id in range(10):
    sys.stdout.write('\rRunning network {}.'.format(n_id))
    sys.stdout.flush()
    
    eucl = dist.squareform(dist.pdist(array_dict[metric][..., n_id], 'euclidean'))
    Y = clh.linkage(eucl, method='ward')
    result_list.append((eucl, Y))


Running network 9.

Visualize the things


In [8]:
f = plt.figure(figsize=(18, 50))
order_net = None
for n_id in range(10):
    n_id += 1
    sys.stdout.write('\rRunning network {}.'.format(n_id))
    sys.stdout.flush()
    (eucl, Y) = result_list[n_id-1]
    ax = f.add_subplot(5,2,n_id)
    subdend = bb.visuOps.add_subplot_axes(ax, [0, 0.71, 1, 0.29])
    submat = bb.visuOps.add_subplot_axes(ax, [0, 0, 1, 0.7])
    if order_net:
        Y = result_list[order_net][1]
    
    Z1 = clh.dendrogram(Y, ax=subdend)
    if order_net:
        subdend.matshow(np.ones((100,20)), aspect='auto')

    idx = Z1['leaves']
    tmp = eucl[idx, :]
    D = tmp[:, idx]
    a = submat.matshow(-D, aspect='auto')
    submat.set_xticks([])
    submat.set_yticks([])
    subdend.set_xticks([])
    subdend.set_yticks([])
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('Network {}'.format(n_id))
f.suptitle('Stability Maps')


Running network 10.
Out[8]:
<matplotlib.text.Text at 0x13b7af50>

Grab the phenotype data


In [62]:
pheno_path = '/home/surchs/Project/abide/pheno/pheno_full.csv'
pheno = pd.read_csv(pheno_path)

In [63]:
# Get the real sub ids from the file names
sub_ids = np.array([int64(re.search(r'(?<=\d{2})\d{5}', sub_id).group()) for sub_id in file_dict['sub_name']])

In [78]:
# Get the relevant section of the pheno data
this_pheno = pheno[pheno['SUB_ID'].isin(sub_ids)]

Find a specific number of clusters and investigate them


In [66]:
# Define the network
netw = 8
# Build the distance metric
eucl = dist.squareform(dist.pdist(array_dict[metric][..., netw], 'euclidean'))
Y = clh.linkage(eucl, method='ward')
# Make clusters out of the hierarchy
fl = clh.fcluster(Y,10,criterion='maxclust')

In [80]:
sub_mask = [x in this_pheno['SUB_ID'].values for x in sub_ids]

In [93]:
len(np.unique(sub_ids))


Out[93]:
582

In [68]:
phens = this_pheno['SEX'].values
f = plt.figure()
for cl_id in range(1,10):
    vals = phens[fl==cl_id]
    ax = f.add_subplot(2,5,cl_id)
    ax.hist(vals)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-68-0c754cb151bb> in <module>()
      2 f = plt.figure()
      3 for cl_id in range(1,10):
----> 4     vals = phens[fl==cl_id]
      5     ax = f.add_subplot(2,5,cl_id)
      6     ax.hist(vals)

ValueError: too many boolean indices
<matplotlib.figure.Figure at 0x5812a10>

In [56]:
# Define the cluster
clust = 1
# Get the subjects belonging to this cluster
cl_subs = sub_ids[fl == clust]
cl_phen = pheno[pheno['SUB_ID'].isin(cl_subs)]

In [65]:
cl_phen[['AGE_AT_SCAN', 'FIQ']]


Out[65]:
AGE_AT_SCAN FIQ
8 20.9000 101
16 17.5000 125
23 20.0000 111
26 21.1000 103
28 23.6000 128
38 33.0000 103
48 24.0000 118
52 28.0000 129
57 20.0000 124
64 30.0000 126
68 11.0000 101
71 9.7300 122
74 9.3000 120
77 8.0700 115
80 10.8200 117
96 8.4500 88
98 11.3300 95
101 11.0900 125
103 11.8500 117
111 11.7900 98
114 11.1700 114
115 12.4300 98
117 10.2900 109
121 24.0000 106
130 22.0000 128
131 19.0000 109
157 16.2000 NaN
158 12.4000 NaN
164 12.8000 NaN
165 15.2000 NaN
... ... ...
761 17.7900 95
764 13.4600 107
766 14.0300 108
767 11.2500 118
775 15.7400 106
779 13.8200 106
780 9.5000 109
785 13.0800 92
793 13.2800 91
798 11.6600 119
799 12.1500 128
807 13.6300 118
809 12.8200 102
960 14.9432 138
967 24.9884 100
969 27.3238 116
970 12.9391 130
971 10.4668 144
977 18.8008 111
982 18.1547 111
998 21.6482 117
1001 20.1780 91
1003 29.0897 127
1004 17.1663 88
1006 21.4346 96
1013 19.6277 110
1021 21.1307 99
1025 32.9582 90
1027 17.6454 87
1044 18.4175 80

163 rows × 2 columns

Some random testing


In [37]:
# Make a random template clustering
rand_clust = np.random.randint(1, 5, len(this_pheno))
cov_interest = ['DX_GROUP', 'AGE_AT_SCAN', 'SITE_ID', 'SEX', 'EYE_STATUS_AT_SCAN']

In [10]:
sub_1 = sub_ids[rand_clust==1]

In [58]:
a = this_pheno['SEX']

In [59]:
f = plt.figure()
for cl_id in range(1,10):
    vals = a.values[rand_clust==cl_id]
    ax = f.add_subplot(2,5,cl_id)
    ax.hist(vals)



In [51]:


In [52]:



Out[52]:
array([1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [ ]: