In [1]:
import itertools
import glob
import os
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec
import nibabel as nib
import numpy as np
import pandas as pd
import seaborn as sns
import hd_classifier
In [2]:
# Load the participant metadata table (one row per subject, including the
# clinical group label used for coloring the plots) and display it.
participants_df = pd.read_csv('data/participants.tsv', sep='\t')
participants_df
Out[2]:
We also set some colors for the groups.
In [3]:
subject_colors = {'HD': 'r', 'pre-HD': 'g', 'control': 'b'}
We extract all voxels for each subject and region directly from the nifti images and the masks generated from Freesurfer.
We use principal component analysis to transform each subject into the vector space spanned by the eigenvectors of the covariance matrix. The plots show the components along each axis in this subspace for each patient. The axes are ordered by decreasing eigenvalue of the covariance matrix.
We choose a few statistical quantities to describe each region.
In [4]:
def q_1(x):
    """First quartile (25th percentile) of x."""
    return np.percentile(x, q=25)

def q_3(x):
    """Third quartile (75th percentile) of x."""
    return np.percentile(x, q=75)

# Summary statistics computed over the 'value' column for every region:
# the five box-plot numbers (min, quartiles, median, max) plus the mean.
# The dict keys, not the function names, label the resulting columns, so
# replacing the original lambdas with defs (PEP 8 E731) changes nothing
# downstream.
features = {'value': {'min': np.min, 'max': np.max, 'mean': np.mean,
                      'q_1': q_1, 'median': np.median, 'q_3': q_3}}
Extract the voxel data, calculate statistical features for all regions and subjects, and put them into a pd.DataFrame.
In [5]:
path_to_tracer_data = './data/func'
# NOTE(review): os.listdir order is platform-dependent; presumably
# hd_classifier.make_subjects does not depend on it -- confirm.
items = os.listdir(path_to_tracer_data)
subjects = hd_classifier.make_subjects(items, path_to_tracer_data)
# Raw string: '\.' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on Python >= 3.12). The regex itself is unchanged.
image_filter = r'^r.*\.lin_T1_orientOK_skullstripped_norm_sm6mm.nii'
# Voxel-level data and per-region summary statistics for every subject.
masked_region_df_pbr, masked_region_features_pbr = hd_classifier.extract_features(subjects, features, image_filter)
masked_region_features_pbr
Out[5]:
Calculate the principal component rotated basis
In [6]:
from sklearn.decomposition import PCA

# Whitening rescales each principal component to unit variance so the two
# plotted axes are directly comparable.
pca = PCA(whiten=True)
# fit_transform both fits the model and projects the data; the original
# separate pca.fit(...) call beforehand only duplicated the work.
# .values replaces the long-deprecated DataFrame.as_matrix().
rotated_subjects_pbr = pca.fit_transform(masked_region_features_pbr.values)
# Note that we flip the axis of the first feature, assigned randomly by the
# algorithm, so it increases from controls to patients.
pca_results = pd.DataFrame(dict(first_feature=-rotated_subjects_pbr[:, 0],
                                second_feature=rotated_subjects_pbr[:, 1],
                                subject_id=masked_region_features_pbr.index.values))
# Order subjects by their score on the first component; this ordering is
# reused by all later figures (subject labels 1..N).
pca_results.sort_values(by='first_feature', inplace=True)
pca_results.reset_index(drop=True, inplace=True)
pca_results = pca_results.merge(participants_df[['subject_id', 'group']], on='subject_id')
pca_results
Out[6]:
In [7]:
# Make a list with all the name features as region_feature to label the plots. E.g. thalamus_min
pca_features = [(item[1] + ' ' + item[0]).title()
for item in itertools.product(list(masked_region_features_pbr.columns.levels[1]),
list(masked_region_features_pbr.columns.levels[2]))]
# Get the coefficient of each feature for the first 3 principal axis
transformed_features = sorted(zip(pca_features, *pca.components_[0:4]), key=lambda t: t[0])
Let's order the data according to the score in the plot above.
In [8]:
masked_region_df_pbr['subject_id'] = pd.Categorical(masked_region_df_pbr['subject_id'], pca_results.subject_id)
Just a couple of features are enough to separate inflammation in patients from controls. Even a single feature classifies the subjects into at least two groups.
In [9]:
# Some values overlap in the final plot, so we add some ad-hoc jittering:
# one hand-tuned vertical offset per subject (14 subjects) so overlapping
# markers on the 1-D score axis remain distinguishable.
jitters = np.array([-.01, .02, -.02, .01, -.02, .02, -.03,
                    .03, -.01, .01, -.03, .03, -.01, .02])
# Attach the per-subject jitter offsets; the join is positional (both frames
# are indexed 0..13 in the same score order after reset_index above).
joined = pca_results.join(pd.DataFrame(dict(jitters=jitters)))
groups = joined.groupby('group')
sns.set_style("ticks")
fig = plt.figure(figsize=(14, 6))
# Two halves: PCA scatter/strip plots in outer_grid[0], eigenvector profiles
# in outer_grid[1].
# NOTE(review): the names are swapped relative to the layout -- `right_plot`
# occupies outer_grid[0] (the LEFT half) and `left_plot` outer_grid[1].
outer_grid = gridspec.GridSpec(1, 2, wspace=0.1, width_ratios=[4,4])
right_plot = gridspec.GridSpecFromSubplotSpec(2, 1,
    subplot_spec=outer_grid[0], hspace=0, height_ratios=[1,3])
marker_size = 350
# plot the first and second components in a scatter plot
ax = plt.Subplot(fig, right_plot[1])
for name, items in groups:
    ax.scatter(items.first_feature, items.second_feature, s=marker_size, alpha=0.4,
               c=subject_colors[name], label=name)
# Number each marker 1..N in increasing first-component order (pca_results
# is sorted by first_feature).
for idx in range(len(pca_results.index)):
    ax.text(pca_results.first_feature[idx], pca_results.second_feature[idx], str(idx+1),
            horizontalalignment='center', verticalalignment='center')
ax.set_xlabel('first PCA axis')
ax.set_ylabel('second PCA axis')
ax.legend(labelspacing=1.4)
fig.add_subplot(ax)
sns.set_style("whitegrid")
# plot the first component alone on a jittered strip on top
ax_top = plt.Subplot(fig, right_plot[0])
for name, items in groups:
    ax_top.scatter(items.first_feature, items.jitters, s=marker_size, alpha=0.4,
                   color=subject_colors[name], label=name)
ax_top.set_xticks([])
ax_top.set_yticks([0])
ax_top.set_yticklabels([])
ax_top.set_ylabel('')
ax_top.xaxis.set_label_coords(0.5, 0.88)
ax_top.set_xlabel('first PCA axis')
# Share the x-range with the scatter plot below so points line up vertically.
ax_top.set(xlim=ax.get_xlim())
for sp in ax_top.spines.values(): sp.set_visible(False)
fig.add_subplot(ax_top)
# plot the eigenvectors in the original feature space, one panel per component
number_of_components = 3
number_of_features = len(transformed_features)
left_plot = gridspec.GridSpecFromSubplotSpec(1, number_of_components,
    subplot_spec=outer_grid[1], wspace=0.18)
for i in range(number_of_components):
    ax = plt.Subplot(fig, left_plot[i])
    # The first component's sign is flipped to match the flipped score above.
    ax.plot([item[i+1] if i != 0 else -item[i+1] for item in transformed_features], list(range(number_of_features)))
    # Panel title: fraction of variance explained by this component.
    ax.set_title(round(pca.explained_variance_ratio_[i],2))
    ax.set(xlim=(-.4, .4))
    ax.set(ylim=(0, number_of_features-1))
    ax.set_yticks(list(range(number_of_features)))
    # Feature labels only on the rightmost panel.
    ax.set_yticklabels([item[0] for item in transformed_features] if i==2 else [])
    ax.yaxis.tick_right()
    ax.set_xticks([-0.4, -.2, 0, .2, 0.4])
    ax.set_xticklabels([-0.4, -0.2, 0, 0.2, 0.4])
    # Dashed separators every 6 features -- presumably one group of 6
    # statistics per region; TODO confirm against the features dict.
    # NOTE(review): this inner loop reuses the name `i`, shadowing the panel
    # index. Harmless here (the outer for reassigns it), but fragile.
    for i in range(0,4):
        ax.axhline(y=5.5+6*i, ls='dashed', c='black', alpha=0.4)
    fig.add_subplot(ax)
fig.savefig('results/figs/pca_analysis.pdf', format='pdf')
In [10]:
# Attach the group label and inflammation score to every voxel row, then
# order rows by increasing score so subjects appear in the same order as in
# the PCA figure.
merged = (masked_region_df_pbr
          .merge(pca_results[['group', 'subject_id', 'first_feature']], on='subject_id')
          .sort_values(by='first_feature')
          .reset_index(drop=True))
# The four regions of interest, one panel each in the figure below.
regions = ['pallidum', 'putamen', 'caudate', 'thalamus']
region_data = [merged[merged['region'] == region] for region in regions]
def plot_roi_histograms(data, ax, ax_hist):
    """Draw per-subject violin plots of voxel values for one region on `ax`,
    plus per-group kernel-density curves (drawn vertically) on `ax_hist`.

    Parameters
    ----------
    data : DataFrame with at least 'subject_id', 'value' and 'group' columns.
    ax : matplotlib Axes receiving the violin plots.
    ax_hist : matplotlib Axes receiving the marginal group densities.
    """
    # scale='count' makes violin widths proportional to voxel count.
    # NOTE(review): `bw=`, `scale=` and kdeplot's `vertical=True` are
    # deprecated/removed in recent seaborn releases -- pin the seaborn
    # version or migrate to bw_method/density_norm/filled orientation args.
    sns.violinplot(data=data, x="subject_id", y="value", bw=.2,
                   scale='count', cut=1, linewidth=1, ax=ax)
    groups = data.groupby('group')
    for name, group in groups:
        # pre-HD is dashed so it stays distinguishable from HD in grayscale.
        sns.kdeplot(group['value'], vertical=True, ax=ax_hist,
                    label=name, color=subject_colors[name],
                    ls=('--' if name=='pre-HD' else '-'))
fig = plt.figure(figsize=(14, 6))
# gridspec inside gridspec: a 2x2 grid of regions, each cell split into the
# violin panel (left, 9/11 of the width) and the marginal densities (right).
outer_grid = gridspec.GridSpec(2, 2, wspace=0.1, hspace=0.1)
for i in range(4):
    inner_grid = gridspec.GridSpecFromSubplotSpec(1, 2,
        subplot_spec=outer_grid[i], wspace=0.0, hspace=0.0, width_ratios=[9,2],)
    ax = plt.Subplot(fig, inner_grid[0])
    ax_hist = plt.Subplot(fig, inner_grid[1])
    plot_roi_histograms(region_data[i], ax, ax_hist)
    ax.set_title(regions[i])
    # Common y-range across all four regions so panels are comparable.
    ax.set(ylim=(masked_region_df_pbr['value'].min(), masked_region_df_pbr['value'].max()))
    ax_hist.set(ylim=(masked_region_df_pbr['value'].min(), masked_region_df_pbr['value'].max()))
    ax.set_yticks([0.6, 0.8, 1, 1.2, 1.4, 1.6])
    # Show x tick labels only on the lower plots, as patient numbers instead
    # of subject_id.
    # NOTE(review): len(merged) is the number of voxel ROWS, not subjects;
    # presumably only the first num-subjects labels are consumed by the
    # existing ticks -- confirm, and prefer the subject count explicitly.
    if i in [0,1]:
        ax.set_xticks([])
    else:
        ax.set_xticklabels(list(range(1, len(merged)+1)))
    ax_hist.set_xticks([])
    ax_hist.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    fig.add_subplot(ax)
    fig.add_subplot(ax_hist)
all_axes = fig.get_axes()
# show only the outside spines
for ax in all_axes:
    for sp in ax.spines.values():
        sp.set_visible(False)
plt.savefig('results/figs/regions_of_interest_pbr.pdf', format='pdf')
We can assess the difference between the distributions above using a permutation test. We compare the distributions for the control and HD for each region using the mean and the median of each subject as input.
In [11]:
def bipartition_means_difference(seq, size_l, size_r):
    '''
    Calculate the difference between the averages of a random bipartition.

    The sequence is randomly permuted (numpy global RNG) and split into a
    left part of size_l elements and a right part of size_r elements; the
    return value is left.mean() - right.mean().

    Unlike the original in-place np.random.shuffle, np.random.permutation is
    used, so the caller's array is NOT mutated.

    Parameters
    ==========
    seq: numpy array like object
    size_l: int size of the left part
    size_r: int size of the right part

    Raises
    ======
    ValueError if size_l + size_r != len(seq)
    '''
    if len(seq) != size_l + size_r:
        raise ValueError('Not a bipartition: len {} != {} + {}'.format(len(seq), size_l, size_r))
    permuted = np.random.permutation(seq)
    left = permuted[:size_l]
    right = permuted[-size_r:]
    return (left.mean() - right.mean())
def permutatation_test(z, y, num_samples):
    '''
    Calculate the p-value for a permutation test with num_samples draws.

    The two groups are pooled and repeatedly re-split at random; the returned
    p-value is the fraction of random splits whose mean difference does not
    exceed the observed z.mean() - y.mean().

    Parameters
    ==========
    z: numpy array like object with one group of observations
    y: numpy array like object with another group of observations
    num_samples: int with the number of samples
    '''
    pooled = np.hstack([z, y])
    observed_delta = z.mean() - y.mean()
    exceeding = 0
    for _ in range(num_samples):
        estimate = bipartition_means_difference(pooled, z.size, y.size)
        if estimate > observed_delta:
            exceeding += 1
    return 1.0 - float(exceeding) / float(num_samples)
num_samples = 100000
# Run the permutation test per region, once on per-subject medians and once
# on per-subject means.
for stat in ['median', 'mean']:
    print('Permutations test p-values using the {0} as input'.format(stat))
    for region in regions:
        # The first three rows are taken as the controls and rows from index
        # 4 onwards as HD -- presumably matching the cohort ordering of
        # masked_region_features_pbr; verify.
        # NOTE(review): index 3 is skipped entirely ([:3] vs [4:]). Confirm
        # that subject is deliberately excluded rather than an off-by-one.
        controls = np.array(masked_region_features_pbr['value'][stat][region][:3])
        hd = np.array(masked_region_features_pbr['value'][stat][region][4:])
        print (region, permutatation_test(controls, hd, num_samples))
In [12]:
# Voxel counts per (subject, region) serve as a proxy for region volume.
region_volumes = masked_region_df_pbr.groupby(['subject_id', 'region']).agg('count').unstack()
# Flatten the MultiIndex columns produced by unstack down to the region name
# (level 1). Replaces the original join/strip/split string hack, which broke
# on level values containing spaces.
region_volumes.columns = region_volumes.columns.get_level_values(1)
intracraneal_volume_df = participants_df[['subject_id', 'intracraneal_volume']].set_index('subject_id')
merged = region_volumes.join(intracraneal_volume_df)
# Normalize each region volume by the subject's intracranial volume.
merged = merged.div(merged.intracraneal_volume, axis='index')
normalized_region_volumes = hd_classifier.normalize(merged)
# The normalization column itself is no longer needed (avoid inplace).
normalized_region_volumes = normalized_region_volumes.drop('intracraneal_volume', axis=1)
to_fit = normalized_region_volumes.join(pca_results[['first_feature', 'subject_id', 'group']].set_index('subject_id'))
to_fit
Out[12]:
In [13]:
# statsmodels provides the OLS fit and the confidence/prediction interval
# table used in the regression figure below.
from statsmodels.stats.outliers_influence import summary_table
import statsmodels.api as sm
sns.set_style("ticks")
def fit_region_volume_vs_score(to_fit, region):
    """Fit an ordinary-least-squares line of one region's normalized volume
    against the first-PCA-axis score.

    Parameters
    ----------
    to_fit : DataFrame with a 'first_feature' column and one column per region.
    region : name of the region column to use as the response.

    Returns
    -------
    (x, y, fitted) -- the predictor series, the response values, and the
    fitted statsmodels OLS results object.
    """
    predictor = to_fit.first_feature
    response = to_fit[region].values
    # Prepend the intercept column expected by sm.OLS.
    design = sm.add_constant(predictor)
    fitted = sm.OLS(response, design).fit()
    return (predictor, response, fitted)
# Fit and plot the caudate volume against the inflammation score.
# NOTE(review): `re` shadows the stdlib module name; harmless here but easy
# to trip over.
x, y, re = fit_region_volume_vs_score(to_fit, 'caudate')
print(re.summary())
# summary_table returns one row per observation; columns 2..7 hold the fitted
# value, its s.e., the 95% CI of the mean and the 95% prediction interval.
st, data, ss2 = summary_table(re, alpha=0.05)
fittedvalues = data[:,2]
predict_mean_se = data[:,3]
predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
predict_ci_low, predict_ci_upp = data[:,6:8].T
# Regression line (solid blue), prediction interval (red dashed) and CI of
# the mean (blue dashed).
# NOTE(review): x is plotted in row order, not sorted; the interval curves
# only render as clean lines if first_feature is monotone in row order --
# verify, since to_fit was joined on the subject_id index.
plt.plot(x, fittedvalues, 'b-', lw=1)
plt.plot(x, predict_ci_low, 'r--', lw=1.5)
plt.plot(x, predict_ci_upp, 'r--', lw=1.5)
plt.plot(x, predict_mean_ci_low, 'b--', lw=1)
plt.plot(x, predict_mean_ci_upp, 'b--', lw=1)
groups = to_fit.groupby('group')
for name, items in groups:
    plt.scatter(items.first_feature, items.caudate, s=marker_size, alpha=0.4,
                c=subject_colors[name], label=name)
# Label each point with a 1-based row number.
# NOTE(review): x carries the subject_id index, so x[idx] relies on pandas'
# positional-integer fallback, deprecated in recent versions -- prefer
# x.iloc[idx].
for idx in range(len(to_fit)):
    plt.text(x[idx], y[idx], str(idx+1),
             horizontalalignment='center', verticalalignment='center')
plt.xlabel('first PCA axis')
plt.ylabel('caudate volume')
plt.legend(labelspacing=1.4)
plt.savefig('results/figs/region_volumes_vs_inflammation_scores.pdf', format='pdf')
In [14]:
# Print the OLS summaries for the remaining three regions (caudate is shown
# in full above); only the fitted results object is needed here.
for region in ['pallidum', 'putamen', 'thalamus']:
    _, _, fit_result = fit_region_volume_vs_score(to_fit, region)
    print('Correlation between ' + region + ' volume and first PC')
    print(fit_result.summary())
    print('+' * 91)
Get the images in MNI space, instead of subject space, so we can use the atlas, and then apply all the masks in the atlas to each of the subject's MNI images. The result is a dictionary masked_imgs with the masked images for each nuclei and patient.
As the license of the Morel atlas does not allow redistribution, you need to get a licensed copy of the Morel atlas before using the code below to generate the plots yourself.
In [15]:
from functools import reduce
from nilearn.image import resample_to_img, new_img_like
from nilearn.masking import apply_mask
def merge_nuclei_masks(nuclei, nuclei_masks, imgs):
    """Union several nucleus masks and apply the joint mask to all images.

    Parameters
    ----------
    nuclei : iterable of keys into nuclei_masks selecting the masks to merge.
    nuclei_masks : dict mapping nucleus name -> mask image.
    imgs : list of images in a common space; imgs[0] is used as the
        resampling target for the masks.

    Returns
    -------
    The array produced by nilearn.masking.apply_mask for the joint mask.
    """
    masks_to_join = []
    for k in nuclei:
        # Resample the atlas mask onto the image grid; > 0.5 re-binarizes
        # the interpolated values.
        # NOTE(review): get_data() is deprecated in nibabel in favor of
        # get_fdata() -- pin the nibabel version or migrate.
        resampled = resample_to_img(nuclei_masks[k], imgs[0])
        masks_to_join.append(resampled.get_data() > 0.5)
    # Voxel-wise OR of all the individual nucleus masks.
    joint_mask_data_as_bool = reduce(np.logical_or, masks_to_join)
    joint_mask_as_bool = new_img_like(imgs[0], joint_mask_data_as_bool)
    masked_nuclei = apply_mask(imgs, joint_mask_as_bool)
    return masked_nuclei
def find_threshold(imgs, q):
    """Pool each image's top (1 - q) fraction of voxel values and return one
    value from the pooled sample, used as the guide-line threshold in the
    nuclei histogram plots.

    Parameters
    ----------
    imgs : sequence of 1-D arrays of voxel values (one per subject).
    q : float in (0, 1); per-image quantile cut (e.g. 0.90 keeps each
        image's top 10% of voxels).
    """
    from math import floor
    parts = []
    for img in imgs:
        # Keep only the values at or above each image's own q-quantile.
        threshold_index = floor(q * len(img))
        parts.append(np.partition(img, threshold_index)[threshold_index:])
    merged = np.concatenate(parts)
    # NOTE(review): this index looks wrong. len(merged)/len(parts) is the
    # average part size, so the expression is a large NEGATIVE number and
    # np.partition then selects close to the END of the pooled array
    # (negative indexing). Possibly floor((1 - q) * len(merged)) or a
    # percentile of `merged` was intended -- confirm before trusting the
    # absolute threshold values shown in the figure.
    merged_threshold = floor (1- len(merged) / len(parts))
    return np.partition(merged, merged_threshold)[merged_threshold]
def make_masked_nuclei_imgs(imgs, path_to_morel_atlas, excludes):
    """Load every left/right nucleus mask from the Morel atlas and apply each
    one to the subject images.

    Parameters
    ----------
    imgs : list of subject images in a common space; imgs[0] is the
        resampling target for the atlas masks.
    path_to_morel_atlas : directory containing 'left-vols-1mm' and
        'right-vols-1mm' subdirectories of .nii.gz mask volumes.
    excludes : iterable of nucleus-name prefixes (checked after stripping the
        left_/right_ prefix), e.g. ['global', 'MAX'], to skip.

    Returns
    -------
    (masked_img, nuclei_masks, nuclei_sizes): dicts keyed by nucleus name
    holding the masked voxel arrays, the loaded mask images, and the voxel
    counts of the resampled masks.

    Raises
    ------
    FileNotFoundError if path_to_morel_atlas is not a directory.
    """
    if not os.path.isdir(path_to_morel_atlas):
        # Include the offending path instead of raising a bare class.
        raise FileNotFoundError(path_to_morel_atlas)
    left_volumes = glob.glob(os.path.join(path_to_morel_atlas, 'left-vols-1mm/*.nii.gz'))
    right_volumes = glob.glob(os.path.join(path_to_morel_atlas, 'right-vols-1mm/*.nii.gz'))

    def parse_nuclei_name(vol):
        # '<side>-vols-1mm/NAME.nii.gz' -> '<side>_NAME'.
        # basename(dirname(...)) is portable, unlike splitting on '/'.
        side = os.path.basename(os.path.dirname(vol)).split('-')[0]
        return side + '_' + os.path.basename(vol).split('.')[0]

    nuclei_mask_dict = {parse_nuclei_name(vol): vol
                        for vol in left_volumes + right_volumes
                        if not ''.join(parse_nuclei_name(vol).split('_')[1:]).startswith(tuple(excludes))}
    nuclei_masks = {k: nib.load(v) for k, v in nuclei_mask_dict.items()}
    masked_img = {}
    nuclei_sizes = {}
    for k, v in nuclei_masks.items():
        # Resample onto the subject grid and re-binarize (> 0.5).
        resampled = resample_to_img(v, imgs[0])
        resampled_data_as_bool = resampled.get_data() > 0.5
        nuclei_sizes[k] = np.sum(resampled_data_as_bool)
        resampled_as_bool = new_img_like(resampled, resampled_data_as_bool)
        try:
            masked_img[k] = apply_mask(imgs, resampled_as_bool)
        except Exception as err:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; report which nucleus failed and why, then keep
            # processing the remaining nuclei (original best-effort behavior).
            print('Something is wrong for nucleus {} ({}), but I can keep going for the rest.'.format(k, err))
            continue
    return masked_img, nuclei_masks, nuclei_sizes
path_to_tracer_data = './data/func'
items = os.listdir(path_to_tracer_data)
# Rebuild the subject list (same call as in the feature-extraction cell) and
# this time select the MNI-space smoothed, normalized images so the atlas
# masks can be applied.
subjects = hd_classifier.make_subjects(items, path_to_tracer_data)
mni_image_filter = 'nl_MNI152_norm_sm6mm.nii'
# Order subjects by their row position in pca_results (inflammation score)
# so subject index i maps to label i+1 in the plots below; the assert checks
# the two orderings really agree.
subjects.sort(key=lambda s: pca_results[pca_results['subject_id']==s.subject_id].index.tolist()[0])
assert ([s.subject_id for s in subjects] == list(pca_results['subject_id']))
mni_images = map(lambda s: hd_classifier.find_masks(s.images, mni_image_filter), subjects)
imgs = [nib.load(image) for image in list(itertools.chain.from_iterable(mni_images))]
path_to_morel_atlas = './data/private/Atlas/Morel'
try:
    excluded_nuclei = [] # = ['global', 'MAX']
    masked_img, nuclei_masks, nuclei_sizes = make_masked_nuclei_imgs(imgs, path_to_morel_atlas, excluded_nuclei)
except FileNotFoundError:
    # NOTE(review): when the atlas is missing, masked_img / nuclei_masks stay
    # undefined and the following cells raise NameError.
    print("You need a copy of the Morel Atlas")
Plot the histogram of uptake for each relevant nucleus and subject. For the remaining nuclei in the Morel Atlas, the differences between controls and patients are not as marked as for those selected. The vertical lines are a guide to the eye and mark the value of SUVR corresponding to the 95th percentile of all voxels in the control group.
In [16]:
relevant_nuclei_group_names = ['left_VLpv', 'left_VApc', 'left_PuL']
# For each group name, collect every nucleus whose name contains it, so
# related sub-nuclei are merged into one joint mask.
relevant_nuclei_groups = { k: [it for it in list(nuclei_masks.keys()) if k in it] for k in relevant_nuclei_group_names }
masked_relevant_groups = {k: merge_nuclei_masks(v, nuclei_masks, imgs) for k, v in relevant_nuclei_groups.items()}
fig = plt.figure(figsize=(10, 7))
num_subjects = len(subjects)
# One gridspec column per relevant nuclei group; each column is split into
# one row per subject below.
outer_grid = gridspec.GridSpec(1, len(masked_relevant_groups), wspace=0.09, hspace=0)
min_x, max_x = 0.4, 2
# One fixed hls color per subject, shared across the three columns.
pallete = sns.color_palette("hls", num_subjects)
grey_shadow = '#857e7e'
for j, (k, v) in enumerate(masked_relevant_groups.items()):
    column_grid = gridspec.GridSpecFromSubplotSpec(num_subjects, 1,
        subplot_spec=outer_grid[j], wspace=0, hspace=0.0)
    for i in range(0, num_subjects):
        # Guide-line threshold from the first 6 subjects' voxels for this
        # nucleus.
        # NOTE(review): the hard-coded 6 presumably selects the controls --
        # confirm it matches the cohort. Also, the value is loop-invariant
        # in i and could be hoisted out of this inner loop.
        threshold = find_threshold(masked_img[k][:6], 0.90)
        ax = plt.Subplot(fig, column_grid[i])
        ax.set(xlim=(min_x, max_x))
        if i == 0: ax.set_title(k)
        if i == num_subjects - 1:
            # Only the bottom row keeps x tick labels.
            ax.set_xticks([min_x, threshold, max_x])
            ax.set_xticklabels([min_x, round(threshold, 2), max_x])
        else:
            ax.set_xticks([])
        ax.set_yticks([])
        ax.set_yticklabels([])
        ax.set_xlabel('')
        if j == 0:
            # Row label: subject number, same ordering as the earlier figures.
            ax.set_ylabel(str(i+1), rotation='horizontal')
        ax.axvline(x=threshold, color=grey_shadow, ls=':')
        sns.kdeplot(v[i], shade=True, color=pallete[i], # use the same colors as before to identify subjects
                    ax=ax)
        #if 'left_PuL' in k:
        #    sns.kdeplot(masked_img['right_PuL'][i], shade=True, color=grey_shadow, ax=ax)
        # Dashed grey reference curve: the subject's 'left_global' mask
        # distribution.
        sns.kdeplot(masked_img['left_global'][i], ls='--', ax=ax, color=grey_shadow)
        fig.add_subplot(ax)
plt.savefig('results/figs/nuclei.pdf', format='pdf')