In [2]:
import csv
from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.spatial import Delaunay
import scipy.misc
import numpy as np
import math
import skimage
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import future
import networkx as nx
from ragGen import *
%matplotlib inline
sns.set_color_codes("pastel")
from scipy.signal import argrelextrema
from sklearn import datasets, linear_model
In [3]:
# Read in the data
data = open('../../data/data.csv', 'r').readlines()
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
reader = csv.reader(data)
next(reader)  # skip the header row
rows = [[int(col) for col in row] for row in reader]
# These will come in handy later
sorted_x = sorted(list(set([r[0] for r in rows])))
sorted_y = sorted(list(set([r[1] for r in rows])))
sorted_z = sorted(list(set([r[2] for r in rows])))
We'll start by looking at analysis in Euclidean space, then think about weighting by synaptic density later. Since we hypothesize that our data will show that tissue varies as we move down the y-axis (z-axis in brain) through cortical layers, an interesting thing to do would be to compare properties of the graphs in each layer (i.e., how does graph connectivity vary as we move through layers?).
Let's start by triangulating our data, running Delaunay on each y-layer. First, let's put our data in the right format for graph analysis...
In [4]:
a = np.array(rows)
b = np.delete(a, np.s_[3::],1)
# Separate layers - have to do some wonky stuff to get this to work
b = sorted(b, key=lambda e: e[1])
b = np.array([v.tolist() for v in b])
b = np.split(b, np.where(np.diff(b[:,1]))[0]+1)
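For reference, an equivalent and arguably less wonky way to do the same split is to group the (x, y, z) points by their unique y values (a sketch using the `a` array from above; `layers_alt` is my own name):
In [ ]:
# Equivalent split: group the (x, y, z) points by their y value.
# np.unique returns the y values in sorted order, matching the sort above.
coords = a[:, :3]
layers_alt = [coords[coords[:, 1] == y] for y in np.unique(coords[:, 1])]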
Now that our data is in the right format, we'll create 52 Delaunay graphs and then perform analyses on them. A simple but useful metric is the edge length distribution in each layer.
In [5]:
graphs = []
centroid_list = []
for layer in b:
    centroids = np.array(layer)
    # get rid of the y value - it's constant within a layer and not relevant anymore
    centroids = np.delete(centroids, 1, 1)
    centroid_list.append(centroids)
    graph = Delaunay(centroids)
    graphs.append(graph)
We're going to need a method to get edge lengths from 2D centroid pairs
In [6]:
def get_d_edge_length(edge):
    (x1, y1), (x2, y2) = edge
    return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
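A quick sanity check on a 3-4-5 right triangle:
In [ ]:
# The hypotenuse of a 3-4-5 right triangle should come out to exactly 5.0.
get_d_edge_length(((0, 0), (3, 4)))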
In [7]:
edge_length_list = []
tri_area_list = []
for centroids, del_graph in zip(centroid_list, graphs):
    tri_areas = []
    edge_lengths = []
    triangles = []
    for t in centroids[del_graph.simplices]:
        triangles.append(t)
        # unpack the triangle's vertices (avoid shadowing the arrays a and b above)
        v1, v2, v3 = [tuple(map(int, list(v))) for v in t]
        edge_lengths.append(get_d_edge_length((v1, v2)))
        edge_lengths.append(get_d_edge_length((v1, v3)))
        edge_lengths.append(get_d_edge_length((v2, v3)))
        try:
            # Triangle is assumed to come from sympy.geometry; it is not imported above
            tri_areas.append(float(Triangle(v1, v2, v3).area))
        except Exception:
            continue
    edge_length_list.append(edge_lengths)
    tri_area_list.append(tri_areas)
Realizing after all this that location alone is useless: the voxels are evenly spaced, which means our edge length data will all be the same. See that the "centroids" are no different:
In [8]:
np.subtract(centroid_list[0], centroid_list[1])
Out[8]:
There is no distance between the two. Therefore it is perhaps more useful to consider a graph that incorporates node weights. Voronoi is dual to Delaunay, so that's not much of an option. We want something that considers both spatial location and density similarity.
In the past we've considered density information alone (i.e., analyzing the density histogram distribution), and above we considered only spatial information, which tells us nothing here. To construct a graph that considers both spatial and density information, we'll use a Region Adjacency Graph (RAG).
In RAGs, two nodes are considered neighbors if they are close in proximity (separated by a small number of pixels/voxels) in the horizontal or vertical direction.
Since our data includes density values at each node (voxel, or pixel since we're looking at y-layers), we can weight edges by the inverse of the density difference between two nodes; inverse because strongly connected nodes should be close in density.
With the number of synapses $S_i$ at node $i$, we define the weight $w$ between neighboring nodes:
$$w = \dfrac{1}{S_i - S_{i+1}}$$
RAGs are used largely in image processing, and it makes sense for our data to look more like an image. Since the data is evenly spaced, the absolute locations of the voxels don't matter. We can use the index in the matrix to represent spatial location, with the value at each pixel being the synapse density at that voxel. We've done this before in "real volume".
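generate_rag comes from the local ragGen module, which isn't shown in this notebook. As a rough sketch of the idea only (not ragGen's actual implementation), a 4-connected grid RAG with inverse-difference weights might look like this; the epsilon guard against identical neighboring densities is my own assumption:
In [ ]:
def inverse_difference_rag(im, eps=1e-6):
    # Sketch: one node per pixel, edges between 4-connected neighbors,
    # weight = 1 / |density difference| (eps avoids division by zero).
    g = nx.Graph()
    n_rows, n_cols = im.shape
    for i in range(n_rows):
        for j in range(n_cols):
            for ni, nj in ((i, j + 1), (i + 1, j)):  # right and down neighbors
                if ni < n_rows and nj < n_cols:
                    diff = abs(float(im[i, j]) - float(im[ni, nj]))
                    g.add_edge((i, j), (ni, nj), weight=1.0 / (diff + eps))
    return g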
In [9]:
real_volume = np.zeros((len(sorted_y), len(sorted_x), len(sorted_z)))
for r in rows:
    real_volume[sorted_y.index(r[1]), sorted_x.index(r[0]), sorted_z.index(r[2])] = r[-1]

vol = np.zeros((len(sorted_x), len(sorted_y), len(sorted_z)))
for r in rows:
    vol[sorted_x.index(r[0]), sorted_y.index(r[1]), sorted_z.index(r[2])] = r[-1]
In [10]:
np.shape(real_volume)
Out[10]:
In [11]:
test_im = real_volume[1]
shape = np.shape(test_im)
ragu = generate_rag(test_im, False)
ragu.number_of_edges()
# ragu.adjacency_list()
Out[11]:
In [12]:
y_rags = []
for layer in real_volume:
y_rags.append(generate_rag(layer, False))
OK, great! Now we have a list of 52 region adjacency graphs, one for each y-layer. We want to measure properties of those graphs and see how the properties vary in the y direction - through what we hypothesize are the cortical layers.
This is just a sanity check: since every layer is the same grid shape, the RAGs should all have the same number of edges if we did it right!
In [13]:
num_edges = []
for rag in y_rags:
    num_edges.append(rag.number_of_edges())
In [14]:
sns.barplot(x=range(len(num_edges)), y=num_edges)
plt.show()
In [27]:
h = y_rags[0]
A = nx.adjacency_matrix(h)
np.set_printoptions(threshold=np.nan)
# Save the adjacency matrix as an image rather than printing it
scipy.misc.imsave('outfile.jpg', A.todense())
# Note - these are fairly big matrices, so it's not the best idea to print them outright.
# print(A.todense())
First we look at the default networkx graph plotting.
We'll need to massage the drawing function a bit to get a custom positional graph to work.
In [15]:
# for rag in y_rags:
#     plt.figure()
#     nx.draw(rag, node_size=100)
This uses the spring layout, so we're losing positional information. We can improve the plot by adding position information.
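One way to do that (a sketch, not necessarily how ragGen labels its nodes): compute a centroid for each region with skimage.measure.regionprops and hand those to nx.draw as fixed positions. This assumes the RAG's node ids are the integer region labels of an underlying label image.
In [ ]:
from skimage import measure

def draw_rag_with_positions(rag, label_image):
    # Map each region label to its centroid so the drawing preserves
    # spatial layout instead of using a spring embedding.
    pos = {}
    for region in measure.regionprops(label_image):
        r, c = region.centroid
        pos[region.label] = (c, -r)  # negate rows so the image isn't upside down
    nx.draw(rag, pos=pos, node_size=50)
    plt.show()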
In [16]:
def get_edge_weight_distributions(rags):
    distributions = []
    for rag in rags:
        weight_list = []
        for u, v in rag.edges_iter():
            weight_list.append(rag.get_edge_data(u, v)['weight'])
        distributions.append(weight_list)
    return distributions
In [17]:
distributions = get_edge_weight_distributions(y_rags)
distributions = distributions[:40]
Note that we trim the padding off the data, ignoring the edge layers, which appear to be inconsistent due to the data generation methods (i.e., there is no tissue there).
In [18]:
count = 0
for distr in distributions:
    plt.hist(distr, bins=150)
    plt.title("Layer " + str(count) + " Nonlinear Edge Weight Histogram")
    plt.ylabel('Frequency')
    plt.xlabel('Edge Weight')
    # plt.savefig("./figures/y_histogram_nonlinear/Layer_" + str(count) + "_Edge_Weight_Y_nonlin")
    plt.show()
    count += 1
The edge weights are a measure of connectivity. Thus, we want to see how connectivity changes through the y-layers, which we confidently believe represents deeper cortical layers. We'll start by taking the mean edge weight for each layer as a measure of connectivity for that layer.
Note: Connectivity here is not referring to connectivity in the neuroscience sense, but rather in the graph theory sense. Closely "connected" supervoxels are close in density, but might both be low in absolute synaptic density, and thus less connected in the neuroscience sense. For the purposes of this analysis, I'll generally take connectivity to mean graph connectivity unless otherwise stated.
In [19]:
y_edge_means = []
for distrib in distributions:
    y_edge_means.append(np.mean(distrib))
In [20]:
g = sns.barplot(x=range(len(y_edge_means)), y=y_edge_means, color='r')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Non-Linear Edge Weight Means for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Mean")
plt.show()
While it is difficult to see any kind of pattern that might quantitatively separate cortical layers, as we saw in the simple mean density plots through the y-axis, we do see a trend of increasing mean edge weight as we get deeper into the y-layers. Specifically, we can see below that connectivity increases about 33% between the first and last layers considered (layers 0 and 39). Remember that we found that the tissue becomes less dense as we traverse down the y-axis (see here). So at the least, we see that as tissue becomes less dense, the connectivity of the RAG increases.
In [21]:
start_val = y_edge_means[0]
end_val = y_edge_means[-1]
(end_val-start_val)/start_val
Out[21]:
In [82]:
regr = linear_model.LinearRegression()
# reshape returns a new array - it does not modify xvals in place
xvals = np.array(range(1, len(y_edge_means) + 1)).reshape(-1, 1)
yvals = np.array(y_edge_means)
# print(yvals)
# print(xvals)
regr.fit(xvals, yvals)
Because edge weight is a function of the difference in density between neighboring supervoxels, the increase in average edge weight might be due to the distribution of synapses becoming more sparse, and thus more irregular, throughout the tissue. It's worth noting that we have already eliminated the possibility of edge effects: since the deepest 12 y-layers are matrices of nearly all 0's with some noise, one would expect their edge means to be very high. Indeed, we can see below that when we put those layers back in, there is a large spike in the last 12 layers, where the padding becomes relevant.
In [22]:
distributions_uncropped = get_edge_weight_distributions(y_rags)
In [23]:
y_edge_means_uncropped = []
for distrib in distributions_uncropped:
    y_edge_means_uncropped.append(np.mean(distrib))
In [24]:
g = sns.barplot(x=range(len(y_edge_means_uncropped)), y=y_edge_means_uncropped, color='r')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Non-Linear Edge Weight Means for Each Y Layer, No Padding")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Mean")
plt.show()
In interpreting this RAG, let's hypothesize that graph connectivity is negatively correlated with density variance. It's reasonable to expect that higher variance in density would translate into an increased probability of having two very different supervoxels next to each other, and thus lower graph connectivity. To check this, let's look at the trend in variance.
In [25]:
y_density_variances = []
for layer in real_volume:
    y_density_variances.append(np.var(layer))

g = sns.barplot(x=range(len(y_density_variances[:40])), y=y_density_variances[:40], color='g')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Variance of Density by Y-Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Density Variance")
plt.show()
Indeed, as variance increases, connectivity decreases: with higher variance it's more likely that neighboring supervoxels differ sharply in synaptic density. We can quantify this correlation with Pearson's coefficient between the density variance and the mean edge weight:
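As a reminder, Pearson's coefficient is just the covariance normalized by the two standard deviations, which is what the off-diagonal entry of np.corrcoef gives us:
$$r = \dfrac{\mathrm{cov}(X, Y)}{\sigma_X \, \sigma_Y}$$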
In [26]:
np.corrcoef(y_density_variances[:40], y_edge_means)[0,1]
Out[26]:
Confirmed: we have a very strong negative correlation between density variance and mean graph connectivity through the y-layers. However, note that it is not a perfect correlation. Mean edge weight tells us something slightly different than simple density variance, likely due to a few properties of the RAG as we calculated it.
It would thus be interesting to see how a different weighting function, one that is linear and normalized by layer, looks. Perhaps we'd see trends in the data that could act as a feature that not only shows how density and clustering change through the cortical layers, but also discriminates between those layers.
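Again we lean on ragGen here: generate_rag's second argument presumably toggles the weighting scheme, but the function itself isn't shown. One plausible "linear, normalized by layer" weighting (purely a sketch of the idea, not necessarily what generate_rag(layer, True) computes) is the absolute density difference scaled by the largest difference in the layer:
In [ ]:
def linear_normalized_weight(s_i, s_j, layer_max_diff):
    # Sketch: absolute density difference, scaled into [0, 1] by the
    # largest difference observed anywhere in the layer.
    if layer_max_diff == 0:
        return 0.0
    return abs(s_i - s_j) / float(layer_max_diff)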
In [27]:
y_edge_vars = []
for distrib in distributions:
    y_edge_vars.append(np.var(distrib))
In [28]:
g = sns.barplot(x=range(len(y_edge_vars)), y=y_edge_vars, color='r')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Non-Linear Edge Weight Variances for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Variance")
plt.show()
In [29]:
y_edge_skews = []
for distrib in distributions:
    y_edge_skews.append(skew(distrib))
print(y_edge_skews)
In [30]:
g = sns.barplot(x=range(len(y_edge_skews)), y=y_edge_skews, color='r')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Non-Linear Edge Weight Skewness for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Skewness")
plt.show()
In [31]:
y_edge_kurts = []
for distrib in distributions:
    y_edge_kurts.append(kurtosis(distrib))
print(y_edge_kurts)
In [32]:
g = sns.barplot(x=range(len(y_edge_kurts)), y=y_edge_kurts, color='r')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Non-Linear Edge Weight Kurtosis for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Kurtosis")
plt.show()
Hmmm...very interesting
In [35]:
y_rags_linear_weight = []
for layer in real_volume:
    y_rags_linear_weight.append(generate_rag(layer, True))
In [36]:
test_rag = generate_rag(real_volume[4], True)
weight_list = []
for u, v in test_rag.edges_iter():
    weight_list.append(test_rag.get_edge_data(u, v)['weight'])
In [37]:
distributions_lin = get_edge_weight_distributions(y_rags_linear_weight)
# keep an uncropped copy, then ignore the edge (padding) layers
distributions_lin_nopad = distributions_lin
distributions_lin = distributions_lin[:40]
In [38]:
count = 0
for distr in distributions_lin:
    plt.hist(distr, bins=150)
    plt.title("Layer " + str(count) + " Linear Edge Weight Histogram")
    plt.ylabel('Frequency')
    plt.xlabel('Edge Weight')
    plt.savefig("./figures/y_histogram_linear/Layer_" + str(count) + "_Edge_Weight_Y_lin")
    plt.show()
    count += 1
Note the very different shape of the distributions. It might be interesting to see what the total edge weight distribution looks like for the full volume (all layers as one). Then we could compare the distributions generated by the two weighting functions.
In [39]:
concat_distr = []
for distr in distributions_lin:
    concat_distr = concat_distr + distr

plt.hist(concat_distr, bins=150)
plt.title("Linear Edge Weight Histogram - All Layers")
plt.ylabel('Frequency')
plt.xlabel('Edge Weight')
plt.show()
Interesting. Now let's go back and compare that to the non-linear, unscaled weighting.
In [40]:
concat_distr2 = []
for distr in distributions:
    concat_distr2 = concat_distr2 + distr

plt.hist(concat_distr2, bins=150)
plt.title("Non-Linear Edge Weight Histogram - All Layers")
plt.ylabel('Frequency')
plt.xlabel('Edge Weight')
plt.show()
Very interesting. Why would that behavior arise from a non-linear function? Is this a property of the data or just the math?
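As a partial answer: some of it is just the math. If neighboring densities are integers, the nonlinear weight $\dfrac{1}{|S_i - S_{i+1}|}$ can only take the values 1, 1/2, 1/3, ..., so mass piles up near zero with isolated spikes, even for featureless data. A quick illustration with uniform random integer differences:
In [ ]:
# Reciprocals of integer differences cluster near 0 with spikes at 1, 1/2, 1/3, ...
# even for uniform random data, so part of the histogram's shape is a property
# of the weighting function itself rather than of the tissue.
rand_diffs = np.random.randint(1, 50, size=100000)
plt.hist(1.0 / rand_diffs, bins=150)
plt.title("1 / |difference| for Uniform Random Integer Differences")
plt.ylabel('Frequency')
plt.xlabel('Edge Weight')
plt.show()
Either way, let's continue on with other statistics through the layers generated by the linearly weighted RAG.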
In [41]:
y_edge_linear_means_nopad = []
for distrib in distributions_lin_nopad:
    y_edge_linear_means_nopad.append(np.mean(distrib))

g = sns.barplot(x=range(len(y_edge_linear_means_nopad)), y=y_edge_linear_means_nopad, color='b')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear, Scaled Edge Weight Means for Each Y Layer, Unpadded")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Mean")
plt.show()
In [42]:
y_edge_linear_means = []
for distrib in distributions_lin:
    y_edge_linear_means.append(np.mean(distrib))

g = sns.barplot(x=range(len(y_edge_linear_means)), y=y_edge_linear_means, color='b')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear, Scaled Edge Weight Means for Each Y Layer, Padded")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Mean")
plt.show()
There are no apparent trends in this data as there were with the nonlinear weighting function. Let's look at the correlation to density variance for comparison.
In [43]:
np.corrcoef(y_density_variances[:40], y_edge_linear_means)[0,1]
Out[43]:
Visually, we can see significant peaks at layers 4, 9, 13, 22, and 32. Let's compare that to the density sum plot. Perhaps that corroborates the evidence we saw earlier from the layer-by-layer density sums.
In [44]:
y_sum = [0] * len(vol[0,:,0])
for i in range(len(vol[0,:,0])):
    y_sum[i] = np.sum(vol[:,i,:])

g = sns.barplot(x=range(len(y_sum[:40])), y=y_sum[:40], color='g')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Density Sum by Y-Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Density Sum")
plt.show()
We see peaks at layers 9, 14, 21, 26, 31, and possibly 35. While the patterns in the edge weight means are not as consistent or noticeable, the peaks are fairly close, especially the earlier ones. We could test this by looking at local minima and maxima analytically.
In [45]:
def local_minima(a):
    return argrelextrema(a, np.less)
In [46]:
density_sum_minima = local_minima(np.array(y_sum))
density_sum_minima
Out[46]:
Let's visualize that on the plot we looked at earlier.
In [47]:
clrs = []
for i in range(len(y_sum)):
    cnt = 0
    clrs.append(sns.color_palette()[0])
    for minima in density_sum_minima[0]:
        if minima > i:
            clrs[i] = sns.color_palette()[cnt % len(sns.color_palette())]
            break
        else:
            cnt += 1
    # clrs.append(sns.color_palette()[cnt % len(sns.color_palette())])
In [48]:
g = sns.barplot(x=range(len(y_sum[:40])), y=y_sum[:40], palette=clrs)
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Density Sum by Y-Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Density Sum")
plt.show()
Now let's go back and compare that with the minima on the edge weight means.
In [49]:
edge_mean_minima = local_minima(np.array(y_edge_linear_means))
edge_mean_minima
Out[49]:
There are quite a few local minima, and they are spaced too close together to be informative. The maxima, however, seem to be more defining of the mean edge weight plot.
In [50]:
def local_maxima(a):
    return argrelextrema(a, np.greater)
In [51]:
edge_mean_maxima = local_maxima(np.array(y_edge_linear_means))
edge_mean_maxima
Out[51]:
In [52]:
clrs = []
for i in range(len(y_edge_linear_means)):
    cnt = 0
    clrs.append(sns.color_palette()[0])
    for maxima in edge_mean_maxima[0]:
        if maxima > i:
            clrs[i] = sns.color_palette()[cnt % len(sns.color_palette())]
            break
        else:
            cnt += 1
    # clrs.append(sns.color_palette()[cnt % len(sns.color_palette())])
In [53]:
g = sns.barplot(x=range(len(y_edge_linear_means)), y=y_edge_linear_means, palette=clrs)
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear, Scaled Edge Weight Means for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Mean")
plt.show()
In [56]:
y_edge_linear_vars = []
for distrib in distributions_lin:
    y_edge_linear_vars.append(np.var(distrib))

g = sns.barplot(x=range(len(y_edge_linear_vars)), y=y_edge_linear_vars, color='b')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear Edge Weight Variance for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Variance")
plt.show()
In [60]:
y_edge_linear_skews = []
for distrib in distributions_lin:
    y_edge_linear_skews.append(skew(distrib))

g = sns.barplot(x=range(len(y_edge_linear_skews)), y=y_edge_linear_skews, color='b')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear Edge Weight Skewness for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Skewness")
plt.show()
In [61]:
y_edge_linear_kurts = []
for distrib in distributions_lin:
    y_edge_linear_kurts.append(kurtosis(distrib))

g = sns.barplot(x=range(len(y_edge_linear_kurts)), y=y_edge_linear_kurts, color='b')
g.set_xticks(g.get_xticks()[::2])
plt.xticks(g.get_xticks(), g.get_xticks())
plt.title("Linear Edge Weight Kurtosis for Each Y Layer")
plt.xlabel("Y-Axis Layer")
plt.ylabel("Edge Weight Kurtosis")
plt.show()
In [ ]:
num_self_loops = []
for rag in y_rags:
    num_self_loops.append(rag.number_of_selfloops())
In [ ]:
num_self_loops
Interesting. There are no self-loops. Why would this be? Let's come back to this. In the meantime, I want to give some thought to what it means to have a self-loop, whether it should be theoretically possible given our data, and whether our graphs are formed properly.
The answer to the first question is very simple: in a RAG, there are no self-loops by definition. Self-loops are edges that connect a node to itself.
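Concretely, a RAG builder only adds an edge when two adjacent pixels carry different region labels, so an edge from a node to itself can never be created. A minimal sketch of that construction (assuming 4-connectivity; not skimage's actual implementation):
In [ ]:
def count_rag_self_loops(label_image):
    # Build a toy 4-connected label RAG; the label_a != label_b guard is
    # exactly what rules out self-loops by construction.
    g = nx.Graph()
    n_rows, n_cols = label_image.shape
    for i in range(n_rows):
        for j in range(n_cols):
            for ni, nj in ((i, j + 1), (i + 1, j)):
                if ni < n_rows and nj < n_cols:
                    label_a, label_b = label_image[i, j], label_image[ni, nj]
                    if label_a != label_b:
                        g.add_edge(label_a, label_b)
    return g.number_of_selfloops()

count_rag_self_loops(np.array([[1, 2], [3, 4]]))  # always 0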
To see whether the graphs are formed properly, let's look at the adjacency lists:
In [ ]:
# y_rags[0].adjacency_list()
Compare that to the test data:
In [ ]:
# Test Data
test = np.array([[1,2],[3,4]])
test_rag = skimage.future.graph.RAG(test)
test_rag.adjacency_list()
In [ ]:
real_volume_x = np.zeros((len(sorted_x), len(sorted_y), len(sorted_z)))
for r in rows:
    real_volume_x[sorted_x.index(r[0]), sorted_y.index(r[1]), sorted_z.index(r[2])] = r[-1]
In [ ]:
x_rags = []
for layer in real_volume_x:
    x_rags.append(skimage.future.graph.RAG(layer))
In [ ]:
num_edges_x = []
for rag in x_rags:
    num_edges_x.append(rag.number_of_edges())
In [ ]:
sns.barplot(x=range(len(num_edges_x)), y=num_edges_x)
plt.show()
We can see here that the number of edges is low in the region that does not have many synapses. As expected, it mirrors the distribution of synapses: approximately uniform at the top, with buffers of very few synapses on the sides. Remember from here:
In [ ]:
plt.imshow(np.amax(real_volume, axis=2), interpolation='nearest')
plt.show()
In [ ]:
# edge_length_list[3]
# tri_area_list[3]
# triangles
In [ ]:
# Note for future
# del_features['d_edge_length_mean'] = np.mean(edge_lengths)
# del_features['d_edge_length_std'] = np.std(edge_lengths)
# del_features['d_edge_length_skew'] = scipy.stats.skew(edge_lengths)
# del_features['d_edge_length_kurtosis'] = scipy.stats.kurtosis(edge_lengths)