We then randomly choose a small grid, and use the $X$ and $Y$ position to predict whether this grid belongs to the high- or low-density group.
$N$ is the number of synapses (observations)
Classification methods: Nearest Neighbors, Linear SVM, Random Forest, Linear Discriminant Analysis, and Quadratic Discriminant Analysis.
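In the simulation below, each observation is the mean of $25$ per-bin draws (one per cell of the 5x5 grid). Under the null hypothesis every bin is normal with mean $200$ and standard deviation $40\sqrt{25}$, so the grid mean has standard deviation $40\sqrt{25}/\sqrt{25} = 40$; under the alternative the per-bin mean shifts to $250$ with the same spread. These are the nullMean, nullStd, altMean, and altStd values set in the code.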
In [1]:
# Import packages
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.stats import norm
import seaborn as sns
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
np.random.seed(12345678) # for reproducibility, set random seed
In [2]:
names = ["Nearest Neighbors", "Linear SVM", "Random Forest",
"Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.001), # decreased C to improve computation time
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]
allSampleSize = np.array((16, 20, 32, 40, 64, 80, 100, 120, 200, 320, 400, 800, 1000))
gridSize = 25  # for a 5x5 grid: 25 total spaces
nullMean = 200
nullStd = 40 * np.sqrt(gridSize)
altMean = 250
altStd = nullStd
trainNullPrior = .5
accuracyNull = np.zeros((len(allSampleSize), len(classifiers), 2))
accuracyAlt = np.zeros((len(allSampleSize), len(classifiers), 2))
for idxSize, size in enumerate(allSampleSize):
    labels = np.random.binomial(1, trainNullPrior, size)
    trainingNullSample = np.mean(np.random.normal(nullMean, nullStd, [np.sum(labels==0), gridSize]), axis=1)
    trainingAltSample = np.mean(np.random.normal(altMean, altStd, [np.sum(labels==1), gridSize]), axis=1)
    trainingAltNullSample = np.mean(np.random.normal(nullMean, altStd, [np.sum(labels==1), gridSize]), axis=1)
    # Under the null, both classes are drawn from the null distribution (the labels
    # carry no signal); under the alternative, class 1 is drawn from the shifted distribution.
    featureNull = np.zeros(size)
    featureAlt = np.zeros(size)
    featureNull[labels==0] = trainingNullSample
    featureAlt[labels==0] = trainingNullSample
    featureNull[labels==1] = trainingAltNullSample
    featureAlt[labels==1] = trainingAltSample
    featureNull = featureNull.reshape(-1, 1)
    featureAlt = featureAlt.reshape(-1, 1)
    loo = LeaveOneOut(size)
    for idxCla, cla in enumerate(classifiers):
        scoresNull = cross_validation.cross_val_score(cla, featureNull, labels, cv=loo)
        print("Accuracy of %s with %s: %0.2f (+/- %0.2f)" % (names[idxCla], 'Null', scoresNull.mean(), ss.sem(scoresNull)))
        scoresAlt = cross_validation.cross_val_score(cla, featureAlt, labels, cv=loo)
        print("Accuracy of %s with %s: %0.2f (+/- %0.2f)" % (names[idxCla], 'Alt', scoresAlt.mean(), ss.sem(scoresAlt)))
        accuracyNull[idxSize, idxCla, ] = [scoresNull.mean(), ss.sem(scoresNull)]
        accuracyAlt[idxSize, idxCla, ] = [scoresAlt.mean(), ss.sem(scoresAlt)]
In [18]:
#Plot
plt.figure()
plt.errorbar(allSampleSize, accuracyAlt[:,0,0], yerr=accuracyAlt[:,0,1], label=names[0])
plt.errorbar(allSampleSize, accuracyAlt[:,1,0], yerr=accuracyAlt[:,1,1], color='green', label=names[1])
plt.errorbar(allSampleSize, accuracyAlt[:,2,0], yerr=accuracyAlt[:,2,1], color='red', label=names[2])
plt.errorbar(allSampleSize, accuracyAlt[:,3,0], yerr=accuracyAlt[:,3,1], color='black', label=names[3])
plt.errorbar(allSampleSize, accuracyAlt[:,4,0], yerr=accuracyAlt[:,4,1], color='brown', label=names[4])
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('accuracy')
plt.title('Accuracy vs. sample size under Alt Hypothesis')
plt.axhline(1, color='red', linestyle='--')
plt.legend(loc='lower right')
plt.xlim([10,1500])
plt.show()
plt.figure()
plt.errorbar(allSampleSize, accuracyNull[:,0,0], yerr=accuracyNull[:,0,1], label=names[0])
plt.errorbar(allSampleSize, accuracyNull[:,1,0], yerr=accuracyNull[:,1,1], color='green', label=names[1])
plt.errorbar(allSampleSize, accuracyNull[:,2,0], yerr=accuracyNull[:,2,1], color='red', label=names[2])
plt.errorbar(allSampleSize, accuracyNull[:,3,0], yerr=accuracyNull[:,3,1], color='black', label=names[3])
plt.errorbar(allSampleSize, accuracyNull[:,4,0], yerr=accuracyNull[:,4,1], color='brown', label=names[4])
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('accuracy')
plt.title('Accuracy vs. sample size under Null Hypothesis')
plt.axhline(1, color='red', linestyle='--')
plt.legend(loc='lower right')
plt.xlim([10,1500])
plt.show()
In [4]:
# Read in data
df = pd.read_csv('../output.csv')
nvox = 64*64*48 # assume number of voxels per bin
# Scale synapse counts up as if every voxel in the bin were unmasked
df['weighted'] = df['synapses']/df['unmasked']*nvox
xvals = df['cx'].unique()
yvals = df['cy'].unique()
zvals = df['cz'].unique()
# print xvals
# print yvals
# print zvals
plt.hist(df['cx'], weights=df['unmasked']/(nvox*len(yvals)*len(zvals)),
bins=len(xvals), edgecolor='none')
plt.xlabel('X coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.title('Original data')
plt.show()
plt.hist(df['cy'], weights=df['unmasked']/(nvox*len(xvals)*len(zvals)),
bins=len(yvals), edgecolor='none')
plt.xlabel('Y coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.title('Original data')
plt.show()
In [5]:
# Get rid of the blank edges, Z-layer by Z-layer.
df['edge'] = 0
for z in zvals:
    this_z = df[df['cz']==z]
    # X direction: fraction of unmasked voxels per X bin, averaged over Y
    xhist, bin_edges = np.histogram(this_z['cx'], weights=this_z['unmasked']/(nvox*len(yvals)), bins=len(xvals))
    left = np.argmax(xhist>0.5)                    # first X bin that is mostly unmasked
    right = len(xvals)-np.argmax(xhist[::-1]>0.5)  # one past the last X bin that is mostly unmasked
    df.loc[(df['cz']==z) & ((df['cx']<xvals[left]) | (df['cx']>=xvals[right])),'edge'] = 1
    # Y direction
    yhist, bin_edges = np.histogram(this_z['cy'], weights=this_z['unmasked']/(nvox*len(xvals)), bins=len(yvals))
    top = np.argmax(yhist>0.5)
    bottom = len(yvals)-np.argmax(yhist[::-1]>0.5)
    df.loc[(df['cz']==z) & ((df['cy']<yvals[top]) | (df['cy']>=yvals[bottom])),'edge'] = 1
print("We labeled %d bins as edges." % np.sum(df['edge']==1))
In [6]:
# Example: removing edges along X dimension for one Z-plane
plt.figure()
plt.bar(xvals, xhist, width=40, edgecolor='none')
plt.title('Original data')
plt.xlabel('X coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.show()
plt.figure()
plt.bar(xvals[left:right], xhist[left:right], width=40, edgecolor='none')
plt.title('Data with edges removed')
plt.xlabel('X coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.show()
In [7]:
# Example: removing edges along Y dimension for one Z-plane
plt.figure()
plt.bar(yvals, yhist, width=40, edgecolor='none')
plt.title('Original data')
plt.xlabel('Y coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.show()
plt.figure()
plt.bar(yvals[top:bottom], yhist[top:bottom], width=40, edgecolor='none')
plt.title('Data with edges removed')
plt.xlabel('Y coordinate')
plt.ylabel('Fraction unmasked voxels')
plt.show()
In [8]:
# Example Z slice before and after removing edges
Zslice = df[df['cz']==z].pivot_table(index='cy', columns='cx', values=['unmasked','edge'], aggfunc=np.sum)
plt.figure()
sns.heatmap(Zslice['unmasked'], xticklabels=20, yticklabels=10,
cbar_kws={'label': 'Unmasked voxels'});
plt.title('Example Z slice');
plt.figure()
# Mask out the bins labeled as edges before plotting
sns.heatmap(Zslice['unmasked'].where(Zslice['edge']==0), xticklabels=20, yticklabels=10,
            cbar_kws={'label': 'Unmasked voxels'});
plt.title('Example Z slice with edges removed');
In [9]:
# Copy new dataset without edges
df2 = df[df['edge']==0].copy()
df2 = df2.drop('edge', axis=1)
print(df2.head())
In [10]:
import pickle
# Saving random state so that we can generate the same grids later
rnd_state = np.random.get_state()
with open('random_state.pickle', 'w') as f:
    pickle.dump(rnd_state, f)
# Generating dataset of 5x5 grids
grids = []
grid_x = []
grid_y = []
for z in zvals:
    # new xvals and yvals vary by Z layer
    nxvals = df2[df2['cz']==z]['cx'].unique()
    nyvals = df2[df2['cz']==z]['cy'].unique()
    for i in range(100):
        xi = np.random.randint(2, len(nxvals)-2)
        yi = np.random.randint(2, len(nyvals)-2)
        newgrid = df[
            (df['cx']>=nxvals[xi-2]) & (df['cx']<=nxvals[xi+2]) &
            (df['cy']>=nyvals[yi-2]) & (df['cy']<=nyvals[yi+2]) &
            (df['cz'] == z)
        ]
        grids.append(newgrid)
        grid_x.append(nxvals[xi])
        grid_y.append(nyvals[yi])
        # plot examples for one Z layer
        if z==55 and i<3:
            exampleXY = pd.pivot_table(grids[i], index='cy', columns='cx', values='weighted', aggfunc=np.sum)
            plt.figure()
            sns.heatmap(exampleXY, cbar_kws={'label': 'Weighted number of synapses'});
            plt.title('Example X-Y grid');
In [11]:
# Mean density by Z layer
grid_means = np.array([g['weighted'].mean() for g in grids])
grid_z = np.array([g['cz'].mean() for g in grids])
z_means = np.array([df2[df2['cz']==z]['weighted'].mean() for z in zvals])
plt.figure()
plt.boxplot([grid_means[grid_z==z] for z in zvals], vert=False, positions=zvals, widths=60)
Z_pts = plt.scatter(z_means, zvals, label='entire Z layer', s=60, color='r', marker='x', linewidths=2)
plt.ylim(0,1200)
plt.xlabel('Synapse Density')
plt.ylabel('Z layer')
plt.title('Distribution of 5x5 grid synapse densities')
plt.legend()
plt.show()
In [12]:
# Grid density vs. the mean density of its Z layer
grid_z_mean = np.array([df2[df2['cz']==g['cz'].mean()]['weighted'].mean() for g in grids])
plt.figure()
grid_pts = plt.scatter(grid_means, grid_z_mean, label='5x5 grid')
plt.xlabel('Synapse Density')
plt.ylabel('Z layer mean density')
plt.show()
In [13]:
# Split into high density vs. low density layers
from sklearn.cluster import KMeans
est = KMeans(n_clusters=2)
est.fit(z_means.reshape(len(z_means),1))
labels = est.labels_
# Save grid data
with open('Z_labels.pickle', 'w') as f:
    pickle.dump([zvals, labels], f)
plt.figure()
pts0 = plt.scatter(z_means[labels==0], zvals[labels==0], s=60, facecolor='steelblue')
pts1 = plt.scatter(z_means[labels==1], zvals[labels==1], s=60, facecolor='darkseagreen')
pts0.set_label('Group 0')
pts1.set_label('Group 1')
plt.xlabel('Synapse Density')
plt.ylabel('Z layer')
plt.title('K-Means clustering into two groups')
plt.legend()
plt.show()
In [14]:
# Plot distributions of our dataset
grid_labels = np.array([ labels[ np.argmax( zvals==g['cz'].mean() ) ] for g in grids ])
g0 = grid_means[np.where(grid_labels==0)]
g1 = grid_means[np.where(grid_labels==1)]
n0 = len(g0)
n1 = len(g1)
p0 = float(n0)/(n0+n1)
p1 = float(n1)/(n0+n1)
plt.figure()
ax = sns.distplot(g0, kde=False, fit=norm, label='Group 0', hist_kws={'edgecolor': 'none'})
sns.distplot(g1, kde=False, fit=norm, label='Group 1', hist_kws={'edgecolor': 'none'})
# weight fit by priors
fit0 = (ax.lines[0].get_data()[0], p0*ax.lines[0].get_data()[1])
ax.lines[0].set_data(fit0)
fit1 = (ax.lines[1].get_data()[0], p1*ax.lines[1].get_data()[1])
ax.lines[1].set_data(fit1)
# weight histograms by priors
# (hist() labels only the first patch of each group, so keep applying the last
# seen group's prior to the unlabeled patches that follow it)
scale = -1
for r in ax.patches:
    if r.get_label() == 'Group 0':
        scale = p0
    elif r.get_label() == 'Group 1':
        scale = p1
    if scale != -1:
        h = r.get_height()
        r.set_height(scale*h)
ax.set_ylim(0,0.009)
plt.xlabel('Synapse Density')
plt.ylabel('Probability')
plt.title('PDF')
plt.legend()
plt.show()
In [15]:
# Run classification algorithms on dataset
names = ["Nearest Neighbors", "Linear SVM", "Random Forest",
"Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.001), # decreased C to improve computation time
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]
features = grid_means.reshape(-1,1)
y = grid_labels
accuracy=np.zeros((len(classifiers),2))
for idx, cla in enumerate(classifiers):
    loo = LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(cla, features, y, cv=loo)
    accuracy[idx,] = [scores.mean(), ss.sem(scores)]
    print("Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], scores.mean(), ss.sem(scores)))
The five classifiers tested had accuracies between 74% and 78%, which is better than chance. However, these numbers are at or only slightly above the maximum prior probability of 73%, meaning the classifiers do little better than always choosing the class with the larger prior, assuming we trust the priors. Taking the observed synapse density into account adds little information, which is not surprising given the large overlap in observed densities between the two classes. If we wanted to distinguish between similar populations of Z layers but with different priors from another dataset, our accuracy would decrease accordingly.
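For reference, the maximum-prior baseline quoted above can be computed directly from the class counts; a minimal sketch using only the grid_labels array already defined in this notebook:

# Baseline: always predict the more common of the two groups
counts = np.bincount(grid_labels)
baseline = counts.max() / float(counts.sum())
print("Max-prior baseline accuracy: %0.2f" % baseline)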
In [16]:
# Save grid data
with open('grid_data.pickle', 'w') as f:
    pickle.dump([grid_means, grid_x, grid_y, grid_z], f)