In [1]:
from __future__ import division
import os
from ipywidgets import interact
from ipywidgets import widgets
import nibabel as nb
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn.gaussian_process as skg
import sklearn.neighbors as skn
import sklearn.mixture as skm
import glob
import pandas as pd
import seaborn as sb

%matplotlib inline

In [2]:
histogram = np.genfromtxt("../data/set_train_histogram.csv", skip_header=True, delimiter=",")
n = 1000000
histogram[:, 1] /= np.sum(histogram[:, 1])

In [3]:
weights = np.array([0.0620489, 0.722237, 0.215714])
means = np.array([268.8299584, 825.967384, 1339.920449])
sds = np.array([65.4319413, 261.051618, 93.081572])

In [4]:
plt.figure()
plt.plot(histogram[:, 0], histogram[:, 1]);
plt.vlines(means, ymin=0, ymax=np.max(histogram[:, 1]))


Out[4]:
<matplotlib.collections.LineCollection at 0x7fd1ac16c690>
/home/matteo/.pyenv/versions/2.7.9/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [5]:
pdfs = [scipy.stats.norm(loc=mean, scale=sd) for mean, sd in zip(means, sds)]

plt.figure()
x = np.arange(0, 3000, 1)
print(x)
plt.plot(x, weights[0]*pdfs[0].pdf(x))
plt.plot(x, weights[1]*pdfs[1].pdf(x))
plt.plot(x, weights[2]*pdfs[2].pdf(x))
plt.plot(x, weights[0]*pdfs[0].pdf(x) + weights[1]*pdfs[1].pdf(x) + weights[2]*pdfs[2].pdf(x))
gm_wm_threshold = x[np.argmax(weights[2]*pdfs[2].pdf(x) / weights[1]*pdfs[1].pdf(x))]
plt.vlines(gm_wm_threshold, ymin=0, ymax=np.max(weights[1] * pdfs[1].pdf(x)))


[   0    1    2 ..., 2997 2998 2999]
Out[5]:
<matplotlib.collections.LineCollection at 0x7fd1fb72bcd0>

In [6]:
plt.figure()
plt.plot(x, weights[2]*pdfs[2].pdf(x) / weights[1]*pdfs[1].pdf(x))
plt.plot(x, weights[0]*pdfs[0].pdf(x) / weights[1]*pdfs[1].pdf(x))


Out[6]:
[<matplotlib.lines.Line2D at 0x7fd1fb6ec690>]

In [7]:
def show_image(path, x, y, z):
    img = nb.load(path).get_data()[..., 0]
    limits = (0, 2000)
    fig = plt.figure(figsize=(20, 20)) 
    plt.subplot(1, 3, 1)
    plt.imshow(img[:, :, z],
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    plt.clim(*limits)
    plt.subplot(1, 3, 2)
    plt.imshow(img[:, y, :],
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    plt.clim(*limits)
    plt.subplot(1, 3, 3)
    plt.imshow(img[x, :, :],
                        cmap=plt.cm.gray,
                        interpolation='none'
              )
    plt.clim(*limits)
    segmentation_seed = np.zeros(img.shape, dtype=np.uint8)
    segmentation_seed[img == 0] = 1
    segmentation_seed[np.logical_and(img > 0, img < 300)] = 2
    segmentation_seed[np.logical_and(img > 650, img < 850)] = 3
    segmentation_seed[np.logical_and(img > 1050, img < 1450)] = 4     
    fig = plt.figure(figsize=(20, 20)) 
    plt.subplot(1, 3, 1)
    plt.imshow(np.rot90(segmentation_seed[:, :, z]),
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    
    plt.subplot(1, 3, 2)
    plt.imshow(np.rot90(segmentation_seed[:, y, :]),
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    plt.subplot(1, 3, 3)
    plt.imshow(np.rot90(segmentation_seed[x, :, :]),
                        cmap=plt.cm.gray,
                        interpolation='none'
              )
    plt.figure()
    plt.hist(img[img > 0].flatten(), bins=256)
    plt.yscale('log', nonposy='clip')

In [8]:
shape = (176, 208, 176)
path = widgets.Text("../data/set_train/train_1.nii")
interact(
    show_image,
    path=path,
    x=widgets.IntSlider(min=0, max=shape[0] - 1, step=1, value=shape[0] // 2),
    y=widgets.IntSlider(min=0, max=shape[1] - 1, step=1, value=shape[1] // 2),
    z=widgets.IntSlider(min=0, max=shape[2] - 1, step=1, value=shape[2] // 2),
)


Out[8]:
<function __main__.show_image>

In [9]:
file_names = glob.glob("../data/set_train/train_*.nii")
def get_voxel(x, y, z):
    age_id = np.genfromtxt("../data/targets.csv")
    voxels = np.zeros(shape=len(file_names), dtype=np.int16)
    ages = np.zeros(shape=len(file_names), dtype=np.int16)
    for j, file_name in enumerate(file_names):
        index = int(os.path.splitext(os.path.basename(file_name))[0].split("_")[1]) - 1
        voxel = nb.load(file_name).dataobj[x, y, z]
        voxels[j] = voxel
        ages[j] = age_id[index]
        
    kernel = (
        skg.kernels.ConstantKernel(constant_value=500, constant_value_bounds=(100, 2000))
        * skg.kernels.RBF(length_scale=5, length_scale_bounds=(3, 200))
        + skg.kernels.WhiteKernel(noise_level=1000, noise_level_bounds=(1e3, 1e4))
    )
    
    corr = np.abs(np.corrcoef(ages, voxels)[0, 1])
    print("Correlation:", corr)
    gp = skg.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
    gp.fit(ages.reshape(-1, 1), voxels)
    x = np.arange(18, 95).reshape(-1, 1)
    y_pred, sigma = gp.predict(x, return_std=True)
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.scatter(ages, voxels)
    plt.plot(x, y_pred, color="r")
    plt.fill(
        np.concatenate([x, x[::-1]]),
        np.concatenate([y_pred - sigma,
                        (y_pred + sigma)[::-1]]),
         alpha=.5, fc='b', ec='None', label='1sigma confidence interval')
    plt.ylim(0, 2000)
    print(gp.kernel_.get_params()["k1__k2__length_scale"])
    
    scaling_factor = 15
    bidimensional = np.array(zip(ages, voxels / scaling_factor))
    bandwidth = 15
    kde = skn.KernelDensity(kernel="exponential", bandwidth=bandwidth)
    kde.fit(bidimensional)
    x = np.arange(18, 95)
    y = np.arange(0, 2000 / scaling_factor)
    xx, yy = np.meshgrid(x, y)
    xy = np.dstack((xx, yy)).reshape(-1, 2)
    z = np.exp(kde.score_samples(xy))
    
    plt.subplot(1, 2, 2)
    plt.scatter(ages, voxels)
    plt.contourf(xx, yy * scaling_factor, z.reshape(xx.shape), cmap="Purples", alpha=0.5)

    
interact(
    get_voxel,
    x=widgets.IntSlider(min=0, max=shape[0] - 1, step=1, value=shape[0] // 2),
    y=widgets.IntSlider(min=0, max=shape[1] - 1, step=1, value=shape[1] // 2),
    z=widgets.IntSlider(min=0, max=shape[2] - 1, step=1, value=shape[2] // 2),
    )


('Correlation:', 0.02714096551566925)
200.0
Out[9]:
<function __main__.get_voxel>

In [10]:
get_voxel(88, 104, 88);
get_voxel(107, 76, 100);
get_voxel(98, 121, 88);


('Correlation:', 0.02714096551566925)
200.0
('Correlation:', 0.37155967562378522)
84.0505608067
('Correlation:', 0.75249754417280657)
38.7621542353

Probability distribution


In [11]:
def probability_density(x, y, z, value):
    age_id = np.genfromtxt("../data/targets.csv")
    voxels = np.zeros(shape=len(file_names), dtype=np.int16)
    ages = np.zeros(shape=len(file_names), dtype=np.int16)
    for j, file_name in enumerate(file_names):
        index = int(os.path.splitext(os.path.basename(file_name))[0].split("_")[1]) - 1
        voxel = nb.load(file_name).dataobj[x, y, z]
        voxels[j] = voxel
        ages[j] = age_id[index]
    
    scaling_factor = 15        
    bidimensional = np.array(zip(ages, voxels / scaling_factor))
    bandwidth = 15
    kde = skn.KernelDensity(kernel="exponential", bandwidth=bandwidth)
    kde.fit(bidimensional)
    x = np.arange(1, 120)
    y = np.arange(0, 2000 / scaling_factor)
    xx, yy = np.meshgrid(x, y)
    xy = np.dstack((xx, yy)).reshape(-1, 2)
    z = np.exp(kde.score_samples(xy))
    
    value = value / scaling_factor
    xy = np.vstack((x, np.tile(value, x.shape[0]))).T
    histogram = np.exp(kde.score_samples(xy))
    ages_kde = skn.KernelDensity(kernel="gaussian", bandwidth=3)
    ages_kde.fit(ages.reshape(-1, 1))
    prior = np.exp(ages_kde.score_samples(x.reshape(-1, 1)))
    prior /= np.sum(prior)
    histogram /= np.sum(histogram)

    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.scatter(ages, voxels)
    plt.contourf(xx, yy * scaling_factor, z.reshape(xx.shape), cmap="Purples", alpha=0.5)
    plt.hlines(value * scaling_factor, x[0], x[-1], color="blue")
    plt.subplot(1, 2, 2)
    plt.bar(x - 0.5, histogram, width=1)
    average_age = np.dot(x, histogram)
    mode_age = x[np.argmax(histogram)]
    plt.vlines((average_age, mode_age), 0, np.max(histogram), color="red")
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x - 0.5, prior)
    cumulative = np.cumsum(histogram)
    plt.subplot(1, 2, 2)
    plt.bar(x - 0.5, cumulative)


    
interact(
    probability_density,
    x=widgets.IntSlider(min=0, max=shape[0] - 1, step=1, value=shape[0] // 2),
    y=widgets.IntSlider(min=0, max=shape[1] - 1, step=1, value=shape[1] // 2),
    z=widgets.IntSlider(min=0, max=shape[2] - 1, step=1, value=shape[2] // 2),
    value=widgets.IntSlider(min=0, max=2000, step=10, value=400),
    )


Out[11]:
<function __main__.probability_density>

In [12]:
probability_density(98, 121, 88, 700);



In [13]:
a = np.genfromtxt("table.csv")
plt.figure()
plt.scatter(a[:, 1], a[:, 2])


Out[13]:
<matplotlib.collections.PathCollection at 0x7fd1ba2d4590>

Age/voxel correlation


In [14]:
corr_files = glob.glob("../output/correlation_output/*")
corr_data_frame = pd.concat([pd.read_csv(corr_file, header=None) for corr_file in corr_files])
corr_data_frame.sort_values(by=0, inplace=True)
correlation_brain = np.zeros(shape=shape[0] * shape[1] * shape[2], dtype=float)
for row in corr_data_frame.itertuples():
    correlation_brain[row[1]] = row[2]

In [15]:
cdict = {
    "red": [(0, 0, 0), (1, 1, 1)],
    "green": [(0, 0, 0), (1, 0, 0)],
    "blue": [(0, 0, 0), (1, 0, 0)],
    "alpha": [(0, 0, 0), (1, 1, 1)],
}
plt.register_cmap(name='RedAlpha', data=cdict)

def show_correlation(x, y, z):
    correlation_threshold = 0.6
    img = np.reshape(correlation_brain, shape)
    img_binary = np.zeros_like(img)
    img_binary[img > correlation_threshold] = 1
    fig = plt.figure(figsize=(20, 20)) 
    plt.subplot(1, 3, 1)
    limits = 0, 1
    plt.imshow(np.rot90(img[:, :, z]),
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    plt.imshow(np.rot90(
            img_binary[:, :, z]),
            cmap="RedAlpha",
            interpolation='none',
            alpha=0.2,
            )
    plt.clim(*limits)
    plt.subplot(1, 3, 2)
    plt.imshow(np.rot90(img[:, y, :]),
                        cmap=plt.cm.gray,
                        interpolation='none'
                        )
    plt.imshow(np.rot90(
            img_binary[:, y, :]),
            cmap="RedAlpha",
            interpolation='none',
            alpha=0.2,
            )
    plt.clim(*limits)
    plt.subplot(1, 3, 3)
    plt.imshow(np.rot90(img[x, :, :]),
                        cmap=plt.cm.gray,
                        interpolation='none'
              )
    plt.imshow(np.rot90(
            img_binary[x, :, :]),
            cmap="RedAlpha",
            interpolation='none',
            alpha=0.2,
            )
    plt.clim(*limits)
    
    plt.figure()
    plt.hist(correlation_brain[correlation_brain > 0], bins=100)
    print(np.size(correlation_brain[correlation_brain > correlation_threshold]))
    
interact(
    show_correlation,
    x=widgets.IntSlider(min=0, max=shape[0] - 1, step=1, value=shape[0] // 2),
    y=widgets.IntSlider(min=0, max=shape[1] - 1, step=1, value=shape[1] // 2),
    z=widgets.IntSlider(min=0, max=shape[2] - 1, step=1, value=shape[2] // 2),
    )


44816
Out[15]:
<function __main__.show_correlation>

Fitting function


In [16]:
import scipy.stats

train_files = glob.glob("../data/filtered_set_train/*")
train = [0] * len(train_files)
ages = np.genfromtxt("../data/targets.csv")
train_wm = [0] * len(train_files)
for file_name in train_files:
    basename = os.path.splitext(os.path.basename(file_name))[0]
    i = int(basename.split("_")[1]) - 1
    a = np.load(file_name)
    train[i] = a
    train_wm[i] = np.size(a[a > 1000])
train = np.array(train)

gm_files = glob.glob("../data/frontal_thresholding_set_train/*")
train_gm = [0] * len(gm_files)
for file_name in gm_files:
    basename = os.path.splitext(os.path.basename(file_name))[0]
    i = int(basename.split("_")[1]) - 1
    train_gm[i] = np.genfromtxt(file_name)
    print(train_gm[i])
    
    
plt.figure()
wm = pd.DataFrame({"age": ages, "wm": np.array(train_wm), "gm": np.array(train_gm)})
sb.lmplot(y="wm", x="age", data=wm, order=4)
sb.pairplot(wm, hue='age', palette='Blues')

def fit_correlated_voxel(i):
    #plt.figure()
    #plt.scatter(ages, train[:, i])
    y = train[:, i]
    x = ages
    data = pd.DataFrame({"age": x, "value": y})
    sb.lmplot(y="value", x="age", data=data, order=1)
    slope, intercept, _, _, _ = scipy.stats.linregress(x, y)
    print(slope, intercept)
    residuals = np.abs(y - slope * ages - intercept)
    kernel = (
        45 ** 2
        * skg.kernels.RBF(length_scale=30, length_scale_bounds=(10, 60))
        + skg.kernels.WhiteKernel(noise_level=10000, noise_level_bounds=(1e3, 1e5))
    )
    gp = skg.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=0)
    gp.fit(x.reshape(-1, 1), residuals)
    x_pred = np.arange(18, 95).reshape(-1, 1)
    y_pred = gp.predict(x_pred)
    plt.figure()
    plt.scatter(x, residuals)
    plt.plot(x_pred, y_pred, color="r")
    print(gp.kernel_.get_params())
    
interact(
    fit_correlated_voxel,
    i=widgets.IntSlider(min=0, max=train.shape[1] - 1, step=1, value=train.shape[1] // 2),
    )


(-6.2329038192172685, 748.95754863889988)
{'k1__k1': 118**2, 'k2__noise_level': 15070.147973378505, 'k1__k2': RBF(length_scale=60), 'k1__k1__constant_value': 13875.985282581498, 'k1__k2__length_scale_bounds': (10, 60), 'k2__noise_level_bounds': (1000.0, 100000.0), 'k1__k1__constant_value_bounds': (1e-05, 100000.0), 'k2': WhiteKernel(noise_level=1.51e+04), 'k1': 118**2 * RBF(length_scale=60), 'k1__k2__length_scale': 59.970482871009445}
Out[16]:
<function __main__.fit_correlated_voxel>
<matplotlib.figure.Figure at 0x7fd1ba10f850>

In [ ]: