In [1]:
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
print csv.shape
In [2]:
# Total synapse count over all bins (Python 3 print function).
print(np.sum(csv[:, -1]))
In [3]:
# Histogram of per-bin synapse counts (column 4).
fig, ax = plt.subplots()
ax.hist(csv[:, 4])
ax.set_title('Synapse Density')
ax.set_xlabel('Number of Synapses')
ax.set_ylabel('Number of (x,y,z) points with synapse density = x')
plt.show()
In [4]:
# Pearson correlation between the unmasked (col 3) and synapse (col 4) columns.
corr = np.corrcoef(csv[:, 3], csv[:, 4])
print(corr)
In [5]:
# This is a pretty broad question, but if we plot a random sample of data points where synapses exist,
# we can get a vague idea of how they're structured. We can furthermore restrict which data points
# get sampled by only taking ones where the number of synapses is greater than the average.
samples = 2500  # how many samples to draw
def check_condition(row):
    """Return True when the row's last column (synapse count) is nonzero."""
    return bool(row[-1] != 0)
def synapse_filt(row, avg):
    """Return True when the row's last column (synapse count) exceeds avg."""
    return bool(row[-1] > avg)
# Keep only rows with a nonzero synapse count.
nonzero_mask = np.apply_along_axis(check_condition, 1, csv)
nonzero_rows = csv[np.where(nonzero_mask)[0], :]
avg_synapse = np.mean(nonzero_rows[:, -1])

# Further restrict to rows whose synapse count exceeds the nonzero average.
above_avg_mask = np.apply_along_axis(synapse_filt, 1,
                                     nonzero_rows, avg_synapse)
nonzero_filtered = nonzero_rows[np.where(above_avg_mask)[0], :]
xyz_only = nonzero_filtered[:, [0, 1, 2]]

# Randomly sample without replacement.
# BUG FIX: the permutation previously ran over xrange(1, len(...)), so index 0
# could never be drawn; permute the full index range (Python 3: xrange -> range).
perm = np.random.permutation(len(xyz_only))
xyz_only = xyz_only[perm[:samples]]

# Axis ranges for graphing.
mins = [np.amin(xyz_only[:, i]) for i in range(3)]
maxs = [np.amax(xyz_only[:, i]) for i in range(3)]

# following code adopted from
# https://www.getdatajoy.com/examples/python-plots/3d-scatter-plot
fig = plt.figure(figsize=(10, 7))
# fig.gca(projection=...) was removed in Matplotlib 3.6; add_subplot is the
# supported way to create 3D axes.
ax = fig.add_subplot(projection='3d')
ax.set_title('3D Scatter Plot')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.set_xlim(mins[0], maxs[0])
ax.set_ylim(mins[1], maxs[1])
ax.set_zlim(mins[2], maxs[2])
ax.view_init()
ax.dist = 10  # camera distance (NOTE: deprecated in newer Matplotlib)
ax.scatter(
    xyz_only[:, 0], xyz_only[:, 1], xyz_only[:, 2],  # data
    c='purple',   # marker colour
    marker='o',   # marker shape
    s=15          # marker size
)
plt.show()
In [6]:
# run k-means on the data filtered like it was previously (only rows where synapses are greater than average)
# doing this to speed up computation time and to limit the number of data points that we will randomly sample
# from when we graph the clusters
import sklearn.cluster as cluster
def graph_cluster(xyz_only, clusters, centers, k, samples=2500):
    """Plot a random sample of clustered (x, y, z) points, then the cluster centers.

    Parameters:
        xyz_only -- (n, 3) array of point coordinates
        clusters -- length-n array of cluster labels (colours the points)
        centers  -- (k, 3) array of cluster centers
        k        -- number of clusters (used only in the plot titles)
        samples  -- how many points to draw (default 2500, as before)
    """
    # Randomly sample points and their labels with one shared permutation.
    # BUG FIX: the permutation previously ran over xrange(1, len(...)), so
    # index 0 could never be drawn; permute the full range (Py3: xrange -> range).
    perm = np.random.permutation(len(xyz_only))
    xyz_only = xyz_only[perm[:samples]]
    clusters = clusters[perm[:samples]]
    # Axis ranges shared by both plots.
    mins = [np.amin(xyz_only[:, i]) for i in range(3)]
    maxs = [np.amax(xyz_only[:, i]) for i in range(3)]

    # following code adopted from
    # https://www.getdatajoy.com/examples/python-plots/3d-scatter-plot
    def _new_axes(title):
        # Shared figure/axes setup for both 3D scatter plots.
        fig = plt.figure(figsize=(10, 7))
        # fig.gca(projection=...) was removed in Matplotlib 3.6.
        ax = fig.add_subplot(projection='3d')
        ax.set_title(title)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        ax.set_xlim(mins[0], maxs[0])
        ax.set_ylim(mins[1], maxs[1])
        ax.set_zlim(mins[2], maxs[2])
        ax.view_init()
        ax.dist = 10  # camera distance (NOTE: deprecated in newer Matplotlib)
        return ax

    # Sampled points, coloured by cluster label.
    ax = _new_axes('K-means, k=' + str(k))
    ax.scatter(
        xyz_only[:, 0], xyz_only[:, 1], xyz_only[:, 2],  # data
        c=clusters,   # marker colour
        marker='o',   # marker shape
        s=15          # marker size
    )
    plt.show()

    # Cluster centers alone.
    ax = _new_axes('Centers, k=' + str(k))
    ax.scatter(
        centers[:, 0], centers[:, 1], centers[:, 2],  # data
        c='blue',     # marker colour
        marker='o',   # marker shape
        s=15          # marker size
    )
    plt.show()
n_clusters = 4
# random_state pins the stochastic center initialisation so re-runs reproduce.
kmeans_algo = cluster.KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans_algo.fit_predict(xyz_only)
centers = kmeans_algo.cluster_centers_
print(centers)
graph_cluster(xyz_only, clusters, centers, n_clusters)
In [7]:
n_clusters = 5
# random_state pins the stochastic center initialisation so re-runs reproduce.
kmeans_algo = cluster.KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans_algo.fit_predict(xyz_only)
centers = kmeans_algo.cluster_centers_
print(centers)
graph_cluster(xyz_only, clusters, centers, n_clusters)
In [8]:
n_clusters = 10
# random_state pins the stochastic center initialisation so re-runs reproduce.
kmeans_algo = cluster.KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans_algo.fit_predict(xyz_only)
centers = kmeans_algo.cluster_centers_
print(centers)
graph_cluster(xyz_only, clusters, centers, n_clusters)
In [9]:
# PMF: normalise synapse counts so they sum to 1.
syns = csv[:, 4]
total = np.sum(syns)  # renamed from `sum`, which shadowed the builtin
density = syns / total
mean = np.mean(density)
print(mean)
# get rid of unmasked column
no_unmasked = csv[:, [0, 1, 2, 4]]
# np.cov on the raw (n, 4) array treats ROWS as variables and builds an n x n
# matrix -- that is why the line below crashed. rowvar=False gives the intended
# 4 x 4 covariance of the columns:
# print(np.cov(no_unmasked, rowvar=False))
In [10]:
# BUG FIX: x, y, z are columns 0, 1, 2 (consistent with xyz extraction via
# [:, [0, 1, 2]] elsewhere in this notebook); the original indexed columns
# 1, 2, 3, which pulled in the unmasked column instead of x.
print([min(csv[:, 0]), min(csv[:, 1]), min(csv[:, 2])])  # (x,y,z) minimum
print([max(csv[:, 0]), max(csv[:, 1]), max(csv[:, 2])])  # (x,y,z) maximum
In [11]:
# Row index (within nonzero_rows) holding the maximum number of synapses,
# and the (x, y, z) location of that bin.
max_syn = np.argmax(nonzero_rows[:, 4])
print(max_syn)
loc = (nonzero_rows[max_syn, 0], nonzero_rows[max_syn, 1], nonzero_rows[max_syn, 2])
print(loc)