In [1]:
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
import scipy.stats as stats
np.set_printoptions(precision=3, suppress=True)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
       '/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
# chopping data based on thresholds on x and y coordinates
x_bounds = (409, 3529)
y_bounds = (1564, 3124)
def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:
        return False
    if row[3] == 0:
        return False
    return True
indices_in_bound, = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_thresholded = csv[indices_in_bound]
n = data_thresholded.shape[0]
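A quick sanity check (a sketch using only the variables defined above): every retained row should satisfy the coordinate bounds and have a nonzero unmasked count.
In [ ]:
# sanity check (sketch): all retained rows fall inside the bounds and have unmasked != 0
print "rows kept: %d of %d" % (n, csv.shape[0])
print np.all((data_thresholded[:, 0] >= x_bounds[0]) & (data_thresholded[:, 0] <= x_bounds[1]))
print np.all((data_thresholded[:, 1] >= y_bounds[0]) & (data_thresholded[:, 1] <= y_bounds[1]))
print np.all(data_thresholded[:, 3] != 0)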
In [2]:
total_unmasked = np.sum(data_thresholded[:, 3])
total_syn = np.sum(data_thresholded[:, 4])
print "avg density per voxel: ", total_syn/total_unmasked
a = np.apply_along_axis(lambda x:x[4]/x[3], 1, data_thresholded)
print "average per bin: ", np.average(a), ", std dev: ", np.std(a)
print "max/min bin density: ", np.max(a), ", ", np.min(a)
print np.sum(a)
hist_n, bins, _ = plt.hist(a, 5000)
plt.xlim(-.0001, .0035)
plt.show()
Note the two large spikes in the histogram.
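To see where those spikes sit, a short sketch over the hist_n and bins arrays returned by plt.hist above:
In [ ]:
# sketch: report the two most populated histogram bins (the visible spikes)
spike_idx = np.argsort(hist_n)[-2:]
for idx in spike_idx:
    print "bin [%f, %f): %d values" % (bins[idx], bins[idx+1], hist_n[idx])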
In [3]:
print "fraction: ", hist_n[0]/len(a)
ux = np.unique(data_thresholded[:, 0]).shape[0]
uy = np.unique(data_thresholded[:, 1]).shape[0]
uz = np.unique(data_thresholded[:, 2]).shape[0]
exp = ux*uy*uz
actual = data_thresholded.shape[0]
print "actual: %d, expected: %d, difference: %d" % (actual, exp, exp-actual)
In [4]:
# chi squared test on all bins
def synapses_over_unmasked(row):
    s = (row[4]/row[3])*(64**3)
    return [row[0], row[1], row[2], s]
syn_unmasked = np.apply_along_axis(synapses_over_unmasked, 1, data_thresholded)
# optional normalization of synapses/unmasked (subtract the mean, divide by the std dev) is left disabled:
# syn_normalized = np.apply_along_axis(normalize_syn, 1, syn_unmasked,
#                                      np.mean(syn_unmasked[:,-1]), np.std(syn_unmasked[:,-1]))
syn_normalized = syn_unmasked
sum_syn_norm = np.sum(syn_normalized[:, 3])
avg_syn_norm = (sum_syn_norm/n)*np.ones((n))
syn_norm_1col = syn_normalized[:, -1]
print avg_syn_norm
print syn_norm_1col
print stats.chisquare(syn_norm_1col, avg_syn_norm)
Conclude that the joint distribution is not uniform.
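As a check on the test, the chi-squared statistic can be recomputed directly from its definition; this sketch should reproduce the statistic reported by stats.chisquare above.
In [ ]:
# sketch: chi-squared statistic computed by hand
chi2_manual = np.sum((syn_norm_1col - avg_syn_norm)**2 / avg_syn_norm)
print "manual chi-squared statistic: ", chi2_manual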
In [5]:
def marginalize_along_axis(axis):
    unique = np.unique(syn_normalized[:, axis])
    idx_dict = dict(zip(unique, range(len(unique))))
    syn_per_unique = np.zeros(len(unique))
    for row in syn_normalized:
        syn_per_unique[idx_dict[row[axis]]] += row[-1]
    return syn_per_unique
for axis in range(3):
    marginalized_data = marginalize_along_axis(axis)
    n = len(np.unique(syn_normalized[:, axis]))
    print "n=", n
    avg = sum_syn_norm/n
    avg_vec = np.ones((n))*avg
    print "axis = ", axis, "chisquare test: ", stats.chisquare(marginalized_data, avg_vec)
Conclude that none of the marginals are uniform.
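To see the shape of the non-uniformity, the marginals can also be plotted directly (a sketch reusing marginalize_along_axis from the cell above):
In [ ]:
# sketch: plot the marginal synapse totals along each axis
for axis, label in zip(range(3), ['x', 'y', 'z']):
    plt.plot(np.unique(syn_normalized[:, axis]), marginalize_along_axis(axis))
    plt.title('synapses marginalized along %s' % label)
    plt.xlabel('%s coordinate' % label)
    plt.ylabel('total synapses')
    plt.show()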
In [6]:
# load our regressions
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
np.random.seed(1)
names = ['Linear Regression', 'SVR', 'KNN Regression', 'Random Forest Regression', 'Polynomial Regression']
regressions = [LinearRegression(),
               LinearSVR(C=1.0),
               KNN(n_neighbors=10, algorithm='auto'),
               RF(max_depth=5, max_features=1),
               Pipeline([('poly', PF(degree=2)), ('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10
def normalize_syn(row, pos, scale):
    row[-1] -= pos
    row[-1] *= 1.0/scale
    return row
"""
syn_normalized = np.apply_along_axis(normalize_syn, 1, syn_unmasked,
np.mean(syn_unmasked[:,-1]), np.std(syn_unmasked[:,-1]))
syn_normalized = np.apply_along_axis(normalize_syn, 1, syn_unmasked,
np.min(syn_unmasked[:,-1]),
np.max(syn_unmasked[:,-1])-np.min(syn_unmasked[:,-1]))
"""
syn_normalized = syn_unmasked
X = syn_normalized[:, [0, 1, 2]]
Y = syn_normalized[:, -1]
for name, reg in zip(names, regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
Overall, the regressions were not successful.
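For context, these scores can be compared against a trivial predict-the-mean baseline (a sketch; DummyRegressor is assumed to be available in this sklearn version), whose cross-validated R^2 is at most zero by construction.
In [ ]:
# sketch: baseline regressor that always predicts the training mean
from sklearn.dummy import DummyRegressor
scores = cross_validation.cross_val_score(DummyRegressor(), X, Y, scoring='r2', cv=k_fold)
print("R^2 of mean-prediction baseline: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))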
In [7]:
for i in xrange(3):
    X = syn_normalized[:, i].reshape(-1, 1)
    Y = syn_normalized[:, -1]
    print i
    for name, reg in zip(names, regressions):
        scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
        print("R^2 of %s: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
    print
In [8]:
import sklearn.mixture as mixture
max_clusters = 15
bic = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(syn_normalized)
    bic = np.append(bic, gmm.bic(syn_normalized))
print bic
plt.figure(figsize=(7,7))
plt.plot(i, 1.0/bic)
plt.title('BIC')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.show()
Observe local maxima at 4 clusters and 12 clusters.
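The same local maxima can be picked out programmatically (a quick sketch over the i and bic arrays from the cell above):
In [ ]:
# sketch: find local maxima of 1/BIC (equivalently, local minima of BIC)
inv_bic = 1.0/bic
local_max_idx = [k for k in range(1, len(inv_bic)-1)
                 if inv_bic[k] > inv_bic[k-1] and inv_bic[k] > inv_bic[k+1]]
print "local maxima at cluster counts: ", [i[k] for k in local_max_idx]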
In [9]:
# to keep the model as simple as possible, let's assume 4 clusters
n_clusters = 4
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
predicted = gmm.fit_predict(syn_normalized)
for label, row in zip(predicted, syn_normalized):
    clusters[label].append(row)
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i]))
    print "centroid: ", np.average(clusters[i], axis=0)
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
Observations:
In [10]:
# compare the diagonal covariances computed above to what's returned by the GMM
print gmm.covars_
In [11]:
# check if uniform distribution within cluster
for cluster in clusters:
    sum_syn = np.sum(cluster[:, -1])
    avg_syn_vec = (sum_syn/cluster.shape[0])*np.ones((cluster.shape[0]))
    print stats.chisquare(cluster[:, -1], avg_syn_vec)
In [12]:
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.view_init()
ax.dist = 10 # distance
ax.scatter(
    syn_normalized[:, 0], syn_normalized[:, 1], syn_normalized[:, 2],  # data
    c=predicted,  # marker colour
    alpha=.5
)
plt.show()
#TODO: graph the centroids and diagonal covariances as ellipsoids
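One possible way to do that (a sketch, not a finished implementation): draw each component as an ellipsoid centered at gmm.means_, with per-axis radii of two standard deviations taken from the diagonal covariances in gmm.covars_.
In [ ]:
# sketch: draw each GMM component as an ellipsoid (radii = 2 std devs along x, y, z)
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
u = np.linspace(0, 2*np.pi, 30)
v = np.linspace(0, np.pi, 30)
unit_x = np.outer(np.cos(u), np.sin(v))
unit_y = np.outer(np.sin(u), np.sin(v))
unit_z = np.outer(np.ones_like(u), np.cos(v))
for mean, covar in zip(gmm.means_, gmm.covars_):
    rx, ry, rz = 2*np.sqrt(covar[:3])  # diagonal covariance over (x, y, z, syn); spatial part only
    ax.plot_surface(mean[0] + rx*unit_x, mean[1] + ry*unit_y, mean[2] + rz*unit_z,
                    color='gray', alpha=.2)
ax.scatter(syn_normalized[:, 0], syn_normalized[:, 1], syn_normalized[:, 2], c=predicted, alpha=.1)
plt.show()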
In [13]:
syn_normalized = syn_unmasked
uniques = [np.unique(syn_normalized[:, i]) for i in xrange(3)]
coord_mapping = {}
for xi, x in enumerate(uniques[0]):
    for yi, y in enumerate(uniques[1]):
        for zi, z in enumerate(uniques[2]):
            coord_mapping[(x, y, z)] = (xi, yi, zi)
gridded_data = np.empty((len(uniques[0]), len(uniques[1]), len(uniques[2])))
for row in syn_normalized:
    coord = coord_mapping[tuple(row[:3])]
    gridded_data[coord[0], coord[1], coord[2]] = row[-1]
dx = uniques[0][1]-uniques[0][0]
dy = uniques[1][1]-uniques[1][0]
dz = uniques[2][1]-uniques[2][0]
grad = np.gradient(gridded_data, dx, dy, dz)
def get_gradient_components(x, y, z):
    u = grad[0][x, y, z]
    v = grad[1][x, y, z]
    w = grad[2][x, y, z]
    return (u, v, w)
x, y, z = np.meshgrid(np.arange(1, 41, 4), np.arange(1, 81, 8), np.arange(1, 11))
fig = plt.figure(figsize=(10, 10))
ax = fig.gca(projection='3d')
ax.quiver(x, y, z, grad[0][1::8, 1::4, 1:], grad[1][1::8, 1::4, 1:], grad[2][1::8, 1::4, 1:],
          length=2, color="Tomato", alpha=.8, arrow_length_ratio=.4)
ax.view_init(elev=18, azim=30)
ax.dist = 8
plt.show()
In [14]:
print "Avg. gradient: (%.4f, %.4f, %.4f)" % (np.average(grad[0]), np.average(grad[1]), np.average(grad[2]))
print "Std. dev per element: (%.4f, %.4f, %.4f)" % (np.std(grad[0]), np.std(grad[1]), np.std(grad[2]))
print grad[0]
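The per-voxel gradient magnitude gives a compact view of how steep the density field is (a short sketch over the grad arrays computed above):
In [ ]:
# sketch: distribution of gradient magnitudes across the grid
magnitude = np.sqrt(grad[0]**2 + grad[1]**2 + grad[2]**2)
plt.hist(magnitude.ravel(), bins=50)
plt.xlabel('gradient magnitude')
plt.ylabel('count')
plt.show()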
In [15]:
from sklearn.decomposition import PCA
# center each variable and give it unit variance for PCA
def center(row, means, std_devs):
    for idx, mean, std_dev in zip(range(4), means, std_devs):
        row[idx] -= mean
        row[idx] *= 1.0/std_dev
    return row
syn_centered = np.apply_along_axis(center, 1, syn_normalized,
                                   *zip(*[(np.average(syn_normalized[:, i]),
                                           np.std(syn_normalized[:, i])) for i in range(4)]))
pca = PCA(n_components = 4)
transform = pca.fit_transform(syn_centered)
print pca.components_
print pca.explained_variance_ratio_
print transform.shape
# plot the clusters along the first 2 principal components
n_clusters = 4
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
predicted = gmm.fit_predict(syn_normalized)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.show()
Observe a fairly well-defined boundary between clusters. Let's plot the 2D PCA projection when there are 12 clusters.
In [16]:
n_clusters = 12
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
predicted = gmm.fit_predict(syn_normalized)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.show()
For the larger clusters, the boundaries remain fairly well defined, although they are not very apparent for the small clusters.
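The visual impression of separation could be quantified, for example with a silhouette score over the 2D PCA projection (a sketch; silhouette_score is assumed to be available in this sklearn version, and a subsample may be needed if the full computation is slow):
In [ ]:
# sketch: quantify cluster separation in the 2D PCA projection
from sklearn.metrics import silhouette_score
print "silhouette score (12 clusters, 2D PCA): ", silhouette_score(transform[:, :2], predicted)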
In [ ]: