In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
from __future__ import division
plt.style.use('ggplot')
np.random.seed(1)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
data = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
# convert to density
data = np.array([row for row in data if row[-2] != 0])
data[:, -2] = data[:, -1]/data[:, -2]
data = data[:,:-1]
print "density mean: ", np.average(data[:, -1])
print "density std dev: ", np.std(data[:, -1])
# normalize
means = [np.average(data[:, i]) for i in range(4)]
std_devs = [np.std(data[:, i]) for i in range(4)]
for i in range(4):
data[:, i] -= means[i]
data[:, i] /= std_devs[i]
print data
The Bock et al. (2011) paper claims the volume was cut through 4 different cortical layers, so we will do k-means with 4 clusters.
In [2]:
from sklearn import cluster
kmeans1 = cluster.KMeans(4)
kmeans1.fit_predict(data)
print kmeans1.cluster_centers_
Note pairs of clusters with similar mean densities and y-values.
Since we have seen numerous times that the layers appear to change along the y-axis, let's cluster our data based only on density and y value and then compare this to the previous clustering.
In [3]:
data_yd = data[:, (1, 3)]
kmeans2 = cluster.KMeans(4)
kmeans2.fit_predict(data_yd)
print kmeans2.cluster_centers_
This clustering result is more along the lines of what we expected, as it partitions by different y-values and densities. Let's make some graphs of the previous two clusterings.
In [4]:
colors = ['b', 'g', 'r', 'c', 'm']
for i, c in zip(range(4), colors):
a = np.where(kmeans2.labels_ == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('KMeans clustering, k=4, only y-coord and density considered')
plt.show()
print kmeans2.cluster_centers_
In [13]:
for i, c in zip(range(4), colors):
a = np.where(kmeans1.labels_ == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('Kmeans clusters, k=4, x,y,z coords and density considered')
plt.show()
for i, c in zip(range(4), colors):
a = np.where(kmeans1.labels_ == i)
plt.scatter(data[a, 0], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('x coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('Kmeans clusters, k=4, x,y,z coords and density considered')
plt.show()
print kmeans1.cluster_centers_
Observe the similarity in clusters when projected onto the y-axis. Also note difference in x and z coordinate means for the first 2 and second 2 clusters. Let's try kmeans with k=2.
In [6]:
kmeans3 = cluster.KMeans(2)
kmeans3.fit_predict(data)
for i, c in zip(range(2), colors):
a = np.where(kmeans3.labels_ == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('KMeans clustering, k=2, xyz and density considered')
plt.show()
print kmeans3.cluster_centers_
kmeans4 = cluster.KMeans(2)
kmeans4.fit_predict(data[:, (1, 3)])
for i, c in zip(range(2), colors):
a = np.where(kmeans4.labels_ == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('KMeans clustering, k=2, y coord and density considered')
plt.show()
print kmeans4.cluster_centers_
Now fit a 2-component Gaussian mixture model to the data and compare it with the k=2 k-means result.
In [7]:
from sklearn import mixture
gmm = mixture.GMM(2)
labels = gmm.fit_predict(data)
print gmm.means_
print "compare with kmeans, k=2, all coordinates"
print kmeans3.cluster_centers_
for i, c in zip(range(2), colors):
a = np.where(labels == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('2 Component Gaussian Mixture Model Prediction')
plt.show()
In [12]:
# now do GMM w/ 4 clusters, since Bock 2011 suggests 4 layers
gmm = mixture.GMM(4)
labels = gmm.fit_predict(data)
for i, c in zip(range(4), colors):
a = np.where(labels == i)
plt.scatter(data[a, 1], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('y coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('4 Component Gaussian Mixture Model Prediction')
plt.show()
for i, c in zip(range(4), colors):
a = np.where(labels == i)
plt.scatter(data[a, 0], data[a, -1],
alpha = .2,
color = c,
label='cluster #' + str(i))
plt.legend(bbox_to_anchor=(1.4, 1.00))
plt.xlabel('x coordinate (normalized)')
plt.ylabel('synaptic density (normalized)')
plt.title('4 Component Gaussian Mixture Model Prediction')
plt.show()
print gmm.means_
In [9]:
print np.cov(data[:, 1], data[:, -1])
There is some negative correlation between density and the y coordinate, although it is not extreme. Let's compare the correlation for each coordinate.
In [10]:
for i, coord in enumerate(['cx', 'cy', 'cz']):
print "correlation between density and " + coord
print np.cov(data[:, i], data[:, -1])
As expected, absolute value of correlation with density is maximal for y coordinate.
In [ ]: