In [1]:
import numpy as np
import matplotlib.pyplot as plt
import urllib2
import sklearn.mixture as mixture
import sklearn.preprocessing as preprocess
%matplotlib inline
np.random.seed(1)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:]
In [2]:
# Independence Assumption
# Since our dataset is so large, taking the sample covariance matrix of the entire set and
# plotting a heat map would give little information. Instead we repeatedly sample subsets
# of the data, compute the covariance matrix of each, and then investigate the elementwise
# average of all the covariance matrices.
sample_size = 100
total_samples = 1000
sample_covariances = np.empty((total_samples, sample_size, sample_size))
for i in range(total_samples):
    # Randomly sample rows from the dataset without replacement
    a = np.random.permutation(np.arange(csv.shape[0]))[:sample_size]
    csv_rand_sample = csv[a]
    # Covariance across the sampled rows (rows as variables),
    # so each matrix is sample_size x sample_size
    sample_covariances[i,:,:] = np.cov(csv_rand_sample)
# Average across random samples
covar = np.mean(sample_covariances, axis=0)
print covar.shape
plt.figure(figsize=(7,7))
plt.imshow(covar)
plt.title('Covariance of Synapse Density dataset')
plt.colorbar()
plt.show()
# Split the averaged covariance into its diagonal and off-diagonal ("hollow") parts
diag = covar.diagonal()*np.eye(covar.shape[0])
hollow = covar - diag
# slogdet returns (sign, log|det|); we keep the log magnitude
d_det = np.linalg.slogdet(diag)[1]
h_det = np.linalg.slogdet(hollow)[1]
print d_det
print h_det
plt.figure(figsize=(11,8))
plt.subplot(121)
plt.imshow(diag)
plt.clim([0, np.max(covar)])
plt.title('Log of determinant of on-diagonal: ' + str(d_det))
plt.subplot(122)
plt.imshow(hollow)
plt.clim([0, np.max(covar)])
plt.title('Log of determinant of off-diagonal: ' + str(h_det))
plt.show()
print "Ratio of the logs of on-diagonal over off-diagonal determinants: " + str(d_det/h_det)
From the figures and ratio above, the majority of the variance is concentrated in the diagonal elements, suggesting that the data points are likely independent.
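As a quick, informal check on that claim, the sketch below compares the total squared weight on the diagonal of the averaged covariance matrix against the off-diagonal weight; it simply reuses the diag and hollow matrices computed above.
In [ ]:
# Informal check: fraction of squared covariance mass on the diagonal
diag_mass = np.sum(diag ** 2)
off_mass = np.sum(hollow ** 2)
print "Fraction of squared covariance on the diagonal: " + str(diag_mass / (diag_mass + off_mass))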
In [3]:
max_clusters = 15
bic = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    print "Fitting and evaluating model with " + str(idx) + " clusters."
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag')
    gmm.fit(csv)
    bic = np.append(bic, gmm.bic(csv))
print bic
plt.figure(figsize=(7,7))
plt.plot(i, 1.0/bic)
plt.title('Inverse BIC by number of clusters')
plt.ylabel('1/BIC score')
plt.xlabel('number of clusters')
plt.show()
From the above we observe that the elbow of the BIC curve lies at 4 clusters, so our data is likely not identically distributed; our initial assumption of identical distribution was therefore false.
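As a minimal follow-up, the sketch below fits the 4-component model suggested by the elbow and inspects its mixing weights and cluster sizes. The parameters mirror those used in the loop above; this is illustrative, not a definitive model selection.
In [ ]:
# Fit the 4-component model picked out by the BIC elbow
gmm4 = mixture.GMM(n_components=4, n_iter=1000, covariance_type='diag')
gmm4.fit(csv)
labels = gmm4.predict(csv)
print "Mixing weights: " + str(gmm4.weights_)
print "Points per cluster: " + str(np.bincount(labels))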
Assuming that unmasked is conditioned on position (x, y, z) and the number of synapses at that position, our random forest regression performs extremely well, with an R^2 value of 0.89 (+/- 0.01). The fact that this regression achieves such a high R^2 on our dataset supports the assumed relationship between position, synapse count, and unmasked value.
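The regression itself is not shown above; the following is a minimal sketch of how it could be set up, assuming the CSV columns are ordered (x, y, z, unmasked, synapses). Adjust the column indices if the layout differs, and note that the hyperparameters here (100 trees, depth 5) are illustrative assumptions, not the exact settings behind the reported score.
In [ ]:
import sklearn.ensemble as ensemble
import sklearn.cross_validation as cross_validation
# ASSUMPTION: columns are ordered (x, y, z, unmasked, synapses)
X = np.column_stack((csv[:, :3], csv[:, 4]))  # position + synapse count as features
y = csv[:, 3]                                 # unmasked value as the target
# Illustrative hyperparameters, not necessarily the ones originally used
rf = ensemble.RandomForestRegressor(n_estimators=100, max_depth=5)
scores = cross_validation.cross_val_score(rf, X, y, scoring='r2', cv=10)
print "R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)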