In [1]:
#Clean syn density data
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
import urllib2
import numpy as np
np.set_printoptions(precision=3, suppress=True)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
#x_bounds = (409, 3529), starts at 19
#y_bounds = (1564, 3124), starts at 1369
csv_clean = csv[np.logical_not(csv[:,3] == 0)] # get rid of unmasked values of zero
csv_clean = csv_clean[csv_clean[:,0] >= 409]
csv_clean = csv_clean[csv_clean[:,0] <= 3529]
csv_clean = csv_clean[csv_clean[:,1] >= 1564]
csv_clean = csv_clean[csv_clean[:,1] <= 3124]
csv_clean_no_ratio = csv_clean.copy() # keep an unscaled copy (without .copy() this would only alias csv_clean)
csv_clean[:,4] = np.divide(csv_clean[:,4], csv_clean[:,3]) # synapses per unmasked voxel
csv_clean[:,4] = csv_clean[:,4]*(64**3) # rescale density by 64**3
In [2]:
def check_condition(row):
    if row[3] == 0:
        return False
    return True
a = np.apply_along_axis(check_condition, 1, csv)
a = np.where(a == True)[0]
nonZeroMask = csv[a, :]
synDividedMask = np.divide(nonZeroMask[:,4],nonZeroMask[:,3])
synDividedMask = synDividedMask * (64**3)
accurateDataT = np.vstack((nonZeroMask[:,0],nonZeroMask[:,1],nonZeroMask[:,2],synDividedMask))
accurateData = accurateDataT.T
plt.hist(accurateData[:,3],bins=100)
plt.show()
In [3]:
cleaned = accurateData[accurateData[:,0] >= 409]
cleaned = cleaned[cleaned[:,0] <= 3529]
cleaned = cleaned[cleaned[:,1] >= 1564]
cleaned = cleaned[cleaned[:,1] <= 3124]
plt.hist(cleaned[:,3], bins=100)
plt.show()
The histogram suggests the synapses/unmasked density peaks at a value of roughly 340.
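To put a rough number on that peak (a quick sketch, not part of the original analysis; it assumes the `cleaned` array from the previous cell is still in memory), we can read the mode directly off the histogram bins:
In [ ]:
# Sketch: locate the peak of the density histogram plotted above.
# Assumes `cleaned` holds rows of (x, y, z, synapses/unmasked * 64**3).
counts, edges = np.histogram(cleaned[:, 3], bins=100)
peak = np.argmax(counts)
print "histogram peak near %.1f" % ((edges[peak] + edges[peak + 1]) / 2.0)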
In [4]:
# chopping data based on thresholds on x and y coordinates
x_bounds = (409, 3529)
y_bounds = (1564, 3124)
def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:
        return False
    if row[3] == 0:
        return False
    return True
indices_in_bound, = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_thresholded = csv[indices_in_bound]
n = data_thresholded.shape[0]
total_unmasked = np.sum(data_thresholded[:, 3])
total_syn = np.sum(data_thresholded[:, 4])
print "avg density per voxel: ", total_syn/total_unmasked
a = np.apply_along_axis(lambda x:x[4]/x[3], 1, data_thresholded)
print "average per bin: ", np.average(a), ", std dev: ", np.std(a)
print "max/min bin density: ", np.max(a), ", ", np.min(a)
print np.sum(a)
hist_n, bins, _ = plt.hist(a, 5000)
plt.xlim(-.0001, .0035)
plt.show()
In [5]:
print "fraction: ", hist_n[0]/len(a)
ux = np.unique(data_thresholded[:, 0]).shape[0]
uy = np.unique(data_thresholded[:, 1]).shape[0]
uz = np.unique(data_thresholded[:, 2]).shape[0]
exp = ux*uy*uz
actual = data_thresholded.shape[0]
print "actual: %d, expected: %d, difference: %d" % (actual, exp, exp-actual)
In [7]:
# chi squared test on all bins
import scipy.stats as stats
def synapses_over_unmasked(row):
    s = (row[4]/row[3])*(64**3)
    return [row[0], row[1], row[2], s]
syn_unmasked = np.apply_along_axis(synapses_over_unmasked, 1, data_thresholded)
# divide synapses/unmasked by std_dev
# syn_normalized = np.apply_along_axis(normalize_syn, 1, syn_unmasked,
# np.mean(syn_unmasked[:,-1]), np.std(syn_unmasked[:,-1]))
syn_normalized = syn_unmasked
sum_syn_norm = np.sum(syn_normalized[:, 3])
avg_syn_norm = (sum_syn_norm/n)*np.ones((n))
syn_norm_1col = syn_normalized[:, -1]
print avg_syn_norm
print syn_norm_1col
print stats.chisquare(syn_norm_1col, avg_syn_norm)
In [8]:
def marginalize_along_axis(axis):
    unique = np.unique(syn_normalized[:, axis])
    idx_dict = dict(zip(unique, range(len(unique))))
    syn_per_unique = np.zeros(len(unique))
    for row in syn_normalized:
        syn_per_unique[idx_dict[row[axis]]] += row[-1]
    return syn_per_unique
for axis in range(3):
    marginalized_data = marginalize_along_axis(axis)
    n = len(np.unique(syn_normalized[:, axis]))
    print "n=", n
    avg = sum_syn_norm/n
    avg_vec = np.ones((n))*avg
    print "axis = ", axis, "chisquare test: ", stats.chisquare(marginalized_data, avg_vec)
In [9]:
fig, ((ax1, ax2, ax3)) = plt.subplots(1,3)
ax1.contourf(csv_clean[:,(0,1,4)].T,20)
ax2.contourf(csv_clean[:,(0,2,4)].T,20)
ax3.contourf(csv_clean[:,(1,2,4)].T,20)
plt.show()
In [10]:
import scipy.stats as stats
#KDE x,syn
data_clean = csv_clean[:,(0,4)].T
# randomly sample columns (gaussian_kde expects an array of shape (n_dims, n_samples))
samples = 20000
perm = np.random.permutation(data_clean.shape[1])
rs_data_clean = data_clean[:, perm[:samples]] # note: the KDE below is still fit on the full data
kde = stats.gaussian_kde(data_clean) # KDE for x,synapses
density = kde(data_clean)
fig1 = plt.figure(figsize=(12,6))
x, syn = data_clean
plt.scatter(x, syn, c=density)
plt.show()
#KDE y,syn
fig2 = plt.figure(figsize=(12,6))
data_clean = csv_clean[:,(1,4)].T
# randomly sample columns
samples = 20000
perm = np.random.permutation(data_clean.shape[1])
rs_data_clean = data_clean[:, perm[:samples]]
kde = stats.gaussian_kde(data_clean) # KDE for y,synapses
density = kde(data_clean)
y, syn = data_clean
plt.scatter(y, syn, c=density)
plt.show()
#KDE z,syn
fig3 = plt.figure(figsize=(12,6))
data_clean = csv_clean[:,(2,4)].T
# randomly sample columns
samples = 20000
perm = np.random.permutation(data_clean.shape[1])
rs_data_clean = data_clean[:, perm[:samples]]
kde = stats.gaussian_kde(data_clean) # KDE for z,synapses
density = kde(data_clean)
z, syn = data_clean
plt.scatter(z, syn, c=density)
plt.show()
In [11]:
import sklearn.mixture as mixture
max_clusters = 15
bic = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    #print "Fitting and evaluating model with " + str(idx) + " clusters."
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(csv_clean[:,(0,1,2,4)])
    bic = np.append(bic, gmm.bic(csv_clean[:,(0,1,2,4)]))
    #print 'Statistics:' + '\nWeights: ' + str(gmm.weights_) + '\nMeans: ' + str(gmm.means_) + '\nCovariances: ' + str(gmm.covars_)
print bic
plt.figure(figsize=(7,7))
plt.plot(i, 1.0/bic)
plt.title('BIC')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.show()
Observations:
In [12]:
# to keep model as simple as possible, let's assume 4 clusters
n_clusters = 4
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
predicted = gmm.fit_predict(syn_normalized)
for label, row in zip(predicted, syn_normalized):
    clusters[label].append(row)
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i]))
    print "centroid: ", np.average(clusters[i], axis=0)
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
In [13]:
# check if uniform distribution within cluster
for cluster in clusters:
    sum_syn = np.sum(cluster[:, -1])
    avg_syn_vec = (sum_syn/cluster.shape[0])*np.ones((cluster.shape[0]))
    print stats.chisquare(cluster[:, -1], avg_syn_vec)
In [15]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.view_init()
ax.dist = 10 # distance
ax.scatter(
syn_normalized[:, 0], syn_normalized[:, 1], syn_normalized[:, 2], # data
c=predicted, # marker colour
alpha=.5
)
plt.show()
In [16]:
max_clusters = 15
csv_clean[:,4] = np.divide(csv_clean[:,4],64**3)
bicx = np.array([])
bicy = np.array([])
bicz = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    print "Fitting and evaluating model with " + str(idx) + " clusters."
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(csv_clean[:,(0,4)])
    bicx = np.append(bicx, gmm.bic(csv_clean[:,(0,4)]))
    gmm.fit(csv_clean[:,(1,4)])
    bicy = np.append(bicy, gmm.bic(csv_clean[:,(1,4)]))
    gmm.fit(csv_clean[:,(2,4)])
    bicz = np.append(bicz, gmm.bic(csv_clean[:,(2,4)]))
plt.figure(figsize=(7,7))
plt.plot(i, 1.0/bicx, i, 1.0/bicy, i, 1.0/bicz)
plt.title('BIC')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.legend(['clustered along x','clustered along y','clustered along z'],loc='lower right')
plt.show()
csv_clean[:,4] = csv_clean[:,4] * 64**3
In [17]:
# Check for uniformity in clusters along z-direction
# suggested # of clusters by BIC curve says clusters fairly consistent across z at n=4 clusters
# Step size for z: 1st value 55, increments of 111
plt.figure(figsize=(7,7))
divisions = np.unique(csv_clean[:,2])
for d in divisions:
    z_layer = csv_clean[csv_clean[:,2] == d]
    # Run GMM on layer
    print 'Running GMM on layer ' + str(d)
    max_clusters = 15
    bic = np.array([])
    i = np.array(range(1, max_clusters))
    for idx in range(1, max_clusters):
        #print "Fitting and evaluating model with " + str(idx) + " clusters."
        gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
        gmm.fit(z_layer[:,(0,1,4)])
        bic = np.append(bic, gmm.bic(z_layer[:,(0,1,4)]))
    #print bic
    plt.plot(i, 1.0/bic)
    plt.hold(True)
plt.title('BIC for each Z-layer')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.legend(divisions)
plt.show()
In [18]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D
def graph_cluster(xyz_only, clusters, centers, k):
    # randomly sample points so the 3D scatter stays readable
    samples = 10000
    perm = np.random.permutation(len(xyz_only))
    xyz_only = xyz_only[perm[:samples]]
    clusters = clusters[perm[:samples]]
    # get range for graphing
    mins = [np.amin(xyz_only[:, i]) for i in xrange(3)]
    maxs = [np.amax(xyz_only[:, i]) for i in xrange(3)]
    # following code adapted from
    # https://www.getdatajoy.com/examples/python-plots/3d-scatter-plot
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')
    ax.set_title('K-means, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])
    ax.view_init()
    ax.dist = 10  # distance
    ax.scatter(
        xyz_only[:, 0], xyz_only[:, 1], xyz_only[:, 2],  # data
        c=clusters,   # marker colour
        marker='o',   # marker shape
        s=50          # marker size
    )
    plt.show()
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')
    ax.set_title('Centers, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])
    ax.view_init()
    ax.dist = 10  # distance
    ax.scatter(
        centers[:, 0], centers[:, 1], centers[:, 2],  # data
        c='blue',     # marker colour
        marker='o',   # marker shape
        s=100         # marker size
    )
    plt.show()
n_clusters = 4
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(csv_clean[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers
graph_cluster(csv_clean[:,[0,1,2]], clusters, centers, n_clusters)
In [19]:
n_clusters = 3
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(csv_clean[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers
graph_cluster(csv_clean[:,[0,1,2]], clusters, centers, n_clusters)
BIC on the GMM and PCA (Jay's analysis) both suggested 4 clusters, but visually 3 clusters looks considerably better. Some of the per-layer BIC curves also put the optimal number of clusters at 3. From what we know about brain anatomy, zones are usually divided by depth, and we would not expect the kind of partition we see at n=4. Later we will investigate why 3 clusters looks so much more reasonable than 4.
Visually, the k-means clustering shows some interesting structure; 3 clusters may well be the correct number.
Let's try to find a general trend in the path from cluster center to cluster center. This might give us an estimate of the direction of change across the clusters.
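As a first, rough pass (a sketch only; it assumes `centers` still holds the k=3 centroids computed in the k-means cell above), we can look at the displacement vectors between centroids ordered by depth:
In [ ]:
# Sketch: center-to-center displacement vectors for the k=3 k-means fit.
# Assumes `centers` is the (3, 3) array of centroids printed above (columns x, y, z).
ordered = centers[np.argsort(centers[:, 2])] # order centroids by z
for c1, c2 in zip(ordered[:-1], ordered[1:]):
    print "step (dx, dy, dz): ", c2 - c1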
In [20]:
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
for d in divisions:
    z_layer = csv_clean[csv_clean[:,2] == d]
    # Run GMM on layer
    print 'Running GMM on layer ' + str(d)
    print "Fitting and evaluating model with 3 clusters."
    gmm = mixture.GMM(n_components=3, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(z_layer[:,(0,1,4)])
    # gmm.means_ has one row per component, with columns (x, y, syn/unmasked)
    ax.scatter(gmm.means_[:, 0], gmm.means_[:, 1], gmm.means_[:, 2], marker='o', s=100)
    plt.hold(True)
ax.set_title('GMM component centers per z-layer, 3 components')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('syn/unmasked')
plt.legend(divisions)
plt.show()
In [21]:
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
for d in divisions:
    z_layer = csv_clean[csv_clean[:,2] == d]
    # Run GMM on layer
    print 'Running GMM on layer ' + str(d)
    print "Fitting and evaluating model with 4 clusters."
    gmm = mixture.GMM(n_components=4, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(z_layer[:,(0,1,4)])
    # gmm.means_ has one row per component, with columns (x, y, syn/unmasked)
    ax.scatter(gmm.means_[:, 0], gmm.means_[:, 1], gmm.means_[:, 2], marker='o', s=100)
    plt.hold(True)
ax.set_title('GMM component centers per z-layer, 4 components')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('syn/unmasked')
plt.legend(divisions)
plt.show()
The lower z-values behave differently from the other sections; those are probably the z-value BIC curves that had elbows at 3. Overall, 3 clusters behaves more reasonably, but further analysis is needed.
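As a quick, informal check on that reading (it assumes `csv_clean` and `divisions` from the earlier cells are unchanged), we can compare the mean synapses/unmasked value per z-layer:
In [ ]:
# Sketch: per-layer mean density; lower z-layers should stand out if they behave differently.
# Assumes csv_clean[:,4] is synapses/unmasked scaled by 64**3 and divisions = np.unique(csv_clean[:,2]).
for d in divisions:
    layer = csv_clean[csv_clean[:, 2] == d]
    print "z = %d: mean density = %.2f, std = %.2f" % (d, np.mean(layer[:, 4]), np.std(layer[:, 4]))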
Let's try to refine the gradient of synaptic density across axes (Jay's analysis).
In [22]:
syn_normalized = syn_unmasked
uniques = [np.unique(syn_normalized[:, i]) for i in xrange(3)]
coord_mapping = {}
for xi, x in enumerate(uniques[0]):
    for yi, y in enumerate(uniques[1]):
        for zi, z in enumerate(uniques[2]):
            coord_mapping[(x, y, z)] = (xi, yi, zi)
gridded_data = np.empty((len(uniques[0]), len(uniques[1]), len(uniques[2])))
for row in syn_normalized:
    coord = coord_mapping[tuple(row[:3])]
    gridded_data[coord[0], coord[1], coord[2]] = row[-1]
dx = uniques[0][1]-uniques[0][0]
dy = uniques[1][1]-uniques[1][0]
dz = uniques[2][1]-uniques[2][0]
grad = np.gradient(gridded_data, dx, dy, dz)
def get_gradient_components(x, y, z):
    u = grad[0][x, y, z]
    v = grad[1][x, y, z]
    w = grad[2][x, y, z]
    return (u, v, w)
x, y, z = np.meshgrid(np.arange(1, 41, 4), np.arange(1, 81, 8), np.arange(1, 11))
fig = plt.figure(figsize=(10, 10))
ax = fig.gca(projection='3d')
ax.quiver(x, y, z, grad[0][1::8, 1::4, 1:], grad[1][1::8, 1::4, 1:], grad[2][1::8, 1::4, 1:], length=2, color="Tomato", alpha=.8,
arrow_length_ratio=.4)
ax.view_init(elev=18, azim=30)
ax.dist = 8
plt.show()
In [23]:
print "Avg. gradient: (%.4f, %.4f, %.4f)" % (np.average(grad[0]), np.average(grad[1]), np.average(grad[2]))
print "Std. dev per element: (%.4f, %.4f, %.4f)" % (np.std(grad[0]), np.std(grad[1]), np.std(grad[2]))
In [24]:
from sklearn.decomposition import PCA
# center each variable and give it unit variance for PCA
def center(row, means, std_devs):
    for idx, mean, std_dev in zip(range(4), means, std_devs):
        row[idx] -= mean
        row[idx] *= 1.0/std_dev
    return row
syn_centered = np.apply_along_axis(center, 1, syn_normalized,
*zip(*[(np.average(syn_normalized[:, i]),
np.std(syn_normalized[:, i])) for i in range(4)]))
pca = PCA(n_components = 4)
transform = pca.fit_transform(syn_centered)
print pca.components_
print pca.explained_variance_ratio_
print transform.shape
# plot the clusters along the first 2 principal components
n_clusters = 4
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
predicted = gmm.fit_predict(syn_normalized)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.show()
We observe a fairly well-defined boundary between the clusters. Let's plot the 2D PCA projection when there are 3 and 12 clusters.
In [25]:
n_clusters = 3
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
predicted = gmm.fit_predict(syn_normalized)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.show()
n_clusters = 12
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
predicted = gmm.fit_predict(syn_normalized)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.show()
With a larger number of clusters the boundaries remain fairly well defined, although they are less apparent for the smaller clusters.
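One way to quantify that impression (an extra check, not part of the original pipeline; it assumes `syn_normalized` is still in scope and uses scikit-learn's silhouette score on a random subsample for speed):
In [ ]:
# Sketch: silhouette scores for a few cluster counts on a random subsample of syn_normalized.
from sklearn.metrics import silhouette_score
idx = np.random.permutation(len(syn_normalized))[:5000]
sample = syn_normalized[idx]
for k in (3, 4, 12):
    gmm = mixture.GMM(n_components=k, n_iter=1000, covariance_type='diag', random_state=1)
    labels = gmm.fit_predict(sample)
    print "k = %d, silhouette = %.3f" % (k, silhouette_score(sample, labels))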
In [26]:
# Regression (x,y,z,syn/unmasked) on cleaned data ##################################
# Load regressions
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression','SVR','KNN Regression','Random Forest Regression','Polynomial Regression']
regressions = [LinearRegression(),
LinearSVR(C=1.0),
KNN(n_neighbors=10, algorithm='auto'),
RF(max_depth=5, max_features=1),
Pipeline([('poly', PF(degree=2)),('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = csv_clean[:, (0, 1, 2)] # x,y,z
Y = csv_clean[:, 4] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [27]:
# x -> syn/unmasked
print('Regression on X=x, Y=syn/unmasked')
X = csv_clean[:,[0]] # x only
Y = csv_clean[:,4]   # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print('Regression on X=y, Y=syn/unmasked')
X = csv_clean[:,[1]] # y only
Y = csv_clean[:,4]   # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print('Regression on X=z, Y=syn/unmasked')
X = csv_clean[:,[2]] # z only
Y = csv_clean[:,4]   # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [ ]: