The clusterThresh section of our pipeline takes in a binary volume containing clusters, thresholds the clusters by volume, converts each surviving cluster into our object type Cluster, and returns a list of these Clusters.
Inputs:
volume: the input volume
upperFence: the upper fence (exclusive) for volume thresholding
lowerFence: the lower fence (exclusive) for volume thresholding
Outputs: a list of objects of type Cluster
1. Find connected components
2. Convert each slice of the labeled image to a sparse matrix
3. for each label between 1 and the number of objects post-labeling (inclusive)
    find the indices where the matrix equals that label
    if the number of member voxels falls between the volume fences, convert those indices to the Cluster class and add this Cluster to the clusterList
4. return the clusterList
Imports
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
sys.path.insert(0,'../code/functions/')
import tiffIO as tIO
import connectLib as cLib
import plosLib as pLib
import time
import scipy.ndimage as ndimage
import numpy as np
from scipy import sparse
from cluster import *
import mouseVis as mv
import plotly.graph_objs as go
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook
The Actual Code:
In [1]:
def clusterThresh(volume, lowerFence=0, upperFence=250):
    # the connectivity structure matrix: a 3x3x3 block of ones (26-connectivity)
    s = [[[1 for k in xrange(3)] for j in xrange(3)] for i in xrange(3)]
    # find connected components
    labeled, nr_objects = ndimage.label(volume, s)
    # convert each z-slice of the labeled volume to a sparse matrix
    sparseLabeledIm = np.empty(len(labeled), dtype=object)
    for i in range(len(labeled)):
        sparseLabeledIm[i] = sparse.csr_matrix(labeled[i])
    clusterList = []
    # converting to clusterList
    for label in range(1, nr_objects + 1):
        memberList = []
        for z in range(len(sparseLabeledIm)):
            # (y, x) indices of this label within the slice
            memberListWithoutZ = np.argwhere(sparseLabeledIm[z] == label)
            memberListWithZ = [[z] + list(tup) for tup in memberListWithoutZ]
            memberList.extend(memberListWithZ)
        # volume thresholding
        if lowerFence < len(memberList) < upperFence:
            clusterList.append(Cluster(memberList))
    return clusterList
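As a quick sanity check, here is a minimal sketch on a hypothetical toy volume (not part of the pipeline): a 5x5x5 volume with a single 2x2x2 block should yield exactly one Cluster, with a volume of 8 assuming getVolume() reports the number of member voxels, as it is used throughout this notebook:
In [ ]:
toy = np.zeros((5, 5, 5))
toy[1:3, 1:3, 1:3] = 1  # one 2x2x2 foreground block
toyClusters = clusterThresh(toy, lowerFence=0, upperFence=250)
print(len(toyClusters))            # expect 1
print(toyClusters[0].getVolume())  # expect 8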
I will be using the cell below to analyze clusterThresh's performance:
In [2]:
def clusterAnalysis(rawData, lowerFence=20, upperFence=250, sliceVis=5, bins=50):
    clusterList = clusterThresh(rawData, lowerFence, upperFence)
    volumeList = []
    print "Number of clusters: " + str(len(clusterList))
    # build a binary image of the surviving clusters for visualization
    displayIm = np.zeros_like(rawData)
    for cluster in range(len(clusterList)):
        volumeList.append(clusterList[cluster].getVolume())
        for member in range(len(clusterList[cluster].members)):
            z, y, x = clusterList[cluster].members[member]
            displayIm[z][y][x] = 1
    print "Average Volume: " + str(np.mean(volumeList))
    shape = rawData.shape
    print "Cluster Density: " + str(1.0*np.sum(volumeList)/(shape[0]*shape[1]*shape[2]))
    plt.imshow(displayIm[sliceVis], cmap='gray')
    plt.title('Slice at z=' + str(sliceVis))
    plt.axis('off')
    plt.show()
    mv.generateVoxHist(volumeList, figName='Volume Distribution', bins=bins, axisStart=lowerFence, axisEnd=upperFence, xTitle='Volume', yTitle="Number of Clusters")
    # mean foreground density of each slice along every axis
    zRelationship = np.zeros(len(displayIm))
    for z in range(len(displayIm)):
        zRelationship[z] = np.mean(displayIm[z])
    generatePlotlyLineForSliceDensities(zRelationship, figName="Z-Slice Densities")
    yRelationship = np.zeros(len(displayIm[0]))
    for y in range(len(displayIm[0])):
        yRelationship[y] = np.mean(displayIm[:, y])
    generatePlotlyLineForSliceDensities(yRelationship, figName="Y-Slice Densities")
    xRelationship = np.zeros(len(displayIm[0][0]))
    for x in range(len(displayIm[0][0])):
        xRelationship[x] = np.mean(displayIm[:, :, x])
    generatePlotlyLineForSliceDensities(xRelationship, figName="X-Slice Densities")
In [3]:
def generatePlotlyLineForSliceDensities(densities, figName="untitled"):
    x = [i for i in range(len(densities))]
    data = [go.Scatter(
        y=densities,
        x=x,
        mode='lines'
    )]
    layout = go.Layout(
        title=figName,
        xaxis=dict(
            title='Slice'
        ),
        yaxis=dict(
            title='Density'
        )
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
clusterThresh would work well under the conditions that the input volume contains separable, non-overlapping, sparse clusters and that the input volume is in binary form (i.e., the background voxels are all 0 and the foreground voxels are all positive integers). clusterThresh would work poorly if the volume is not binary (i.e., the background voxels have any value besides 0) or if the clusters are dense or neighbor each other in any way.
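A minimal demonstration of the non-binary failure mode (a hypothetical 1-D example, using the ndimage import from above): ndimage.label treats every nonzero voxel as foreground, so values of 2 sitting in a background of 1s collapse into a single component:
In [ ]:
# any nonzero voxel counts as foreground to ndimage.label
labeled, nr_objects = ndimage.label([1, 1, 2, 2, 1])
print(nr_objects)  # 1 component, even though only the 2s represent clusters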
The Good Data Set:
Description: The good data set is a 100 x 1000 x 1000 binary volume containing 1875 cube-shaped clusters (20 voxels on a side) of value 1 against a background of 0.
In [5]:
import numpy as np
import matplotlib.pyplot as plt
clusterGrid = np.zeros((100, 1000, 1000))
for i in range(40):
    for j in range(40):
        for k in range(40):
            clusterGrid[20*(2*j): 20*(2*j + 1), 20*(2*i): 20*(2*i + 1), 20*(2*k): 20*(2*k + 1)] = 1
plt.imshow(clusterGrid[5])
plt.axis('off')
plt.title('Slice at z=5')
plt.show()
Prediction: I predict that this volume will be perfectly segmented into 1875 clusters (3 cluster layers fit along z and 25 along each of y and x: 3 * 25 * 25 = 1875).
The Difficult Data Set:
Description: The difficult data set is the good data set plus 1: a 100 x 1000 x 1000 volume containing 1875 cube-shaped clusters (20 voxels on a side) of value 2, while every other voxel is 1. In other words, the image is not binarized.
Plot: I will plot the data at z=5 because that slice cuts through the first layer of clusters.
In [2]:
clusterGrid = clusterGrid + 1
plt.imshow(clusterGrid[5])
plt.axis('off')
plt.title('Slice at z=5')
plt.show()
I will now run the same experiment on a smaller 100 x 100 x 100 version of the good data set, which contains 27 clusters.
In [5]:
simEasyGrid = np.zeros((100, 100, 100))
for i in range(4):
    for j in range(4):
        for k in range(4):
            simEasyGrid[20*(2*j): 20*(2*j + 1), 20*(2*i): 20*(2*i + 1), 20*(2*k): 20*(2*k + 1)] = 1
Predicting what good data will look like: I believe the good data will look like a grid of 27 cubes, 9 in each slice that contains clusters.
In [6]:
plt.imshow(simEasyGrid[5])
plt.axis('off')
plt.show()
Visualization relative to prediction: As predicted, the good data looks like a grid of cubes, 9 in each slice that contains clusters.
The Difficult Data Set:
In [7]:
simDiffGrid = simEasyGrid + 1
Predicting what difficult data will look like: I believe the difficult data will look identical to the good data, a grid of 27 cubes with 9 in each slice that contains clusters, since adding 1 to every voxel does not change the plotted structure.
In [8]:
plt.imshow(simDiffGrid[5])
plt.axis('off')
plt.show()
In [12]:
clusterAnalysis(simEasyGrid, lowerFence=0, upperFence=10000, sliceVis = 41, bins=100)
Results of Good Data Relative to Predictions: As expected, the volume was segmented into 27 separate clusters very quickly.
Repeating the Good Data Simulation:
In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
labeledNumClusters = []
volumesList = []
times = []
for trial in range(10):
    start_time = time.time()
    clusterList = clusterThresh(simEasyGrid, lowerFence=0, upperFence=100000)
    times.append(time.time() - start_time)
    labeledNumClusters.append(len(clusterList))
    volumes = []
    for i in range(len(clusterList)):
        volumes.append(clusterList[i].getVolume())
    volumesList.append(np.mean(volumes))
pylab.hist(labeledNumClusters)
pylab.xlabel('Number of Clusters')
pylab.ylabel('Number of Trials')
pylab.show()
print 'Average Number of Components on Easy Simulation Data:\n\tExpected: 27\tActual: ' + str(np.mean(labeledNumClusters))
pylab.hist(volumesList)
pylab.xlabel('Average Volume')
pylab.ylabel('Number of Trials')
pylab.show()
print 'Average Volume on Easy Simulation Data:\n\tExpected: 8000\tActual: ' + str(np.mean(volumesList))
pylab.hist(times)
pylab.xlabel('Time Taken to Execute')
pylab.ylabel('Number of Trials')
plt.show()
print 'Average Time Taken to Execute: ' + str(np.mean(times))
Difficult Data Prediction: I predict the difficult data will be segmented into 1 big cluster.
In [15]:
clusterAnalysis(simDiffGrid, lowerFence = 0, upperFence=1000000, sliceVis = 41)
Results of Difficult Data Result Relative to Prediction: As expected, the volume was segmented into one big component.
Repeating the Difficult Data Simulation:
In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
labeledNumClusters = []
volumesList = []
times = []
for i in range(10):
start_time = time.time()
clusterList = clusterThresh(simDiffGrid, lowerFence = 0, upperFence = 8000)
times.append((time.time() - start_time))
labeledNumClusters.append(len(clusterList))
volumes = []
for i in range(len(clusterList)):
volumes.append(clusterList[i].getVolume())
volumesList.append(np.mean(volumes))
pylab.hist(labeledNumClusters)
pylab.xlabel('Number of Clusters')
pylab.ylabel('Number of Trials')
pylab.show()
print 'Average Number of Components on Difficult Simulation Data:\n\tExpected: 0\tActual: ' + str(np.mean(labeledNumClusters))
pylab.hist(volumesList, normed=1)
pylab.xlabel('Average Volume')
pylab.ylabel('Number of Trials')
pylab.show()
print 'Average Volume on Difficult Simulation Data:\n\tExpected: 8000\tActual: ' + str(np.mean(volumesList))
pylab.hist(times, normed=1)
pylab.xlabel('Time Taken to Execute')
pylab.ylabel('Number of Trials')
plt.show()
print 'Average Time Taken to Execute: ' + str(np.mean(times))
Summary of Performances: Connected components performed extremely well on the easy simulation, correctly detecting 27 components very quickly in every trial. As expected, it also performed poorly on the difficult simulation, merging the entire volume into a single component in every trial.
Description: Validation testing will be performed on a 100x100x100 volume with a voxel intensity distribution approximately the same as that of the true image volumes (i.e., 98% background, 2% synapse). The synapse voxels will be grouped together in clusters as they would be in the true data. Based on research into the true size of synapses, these synthetic synapse clusters will be given a volume of ~1 micron^3, or about 139 voxels (assuming the synthetic data here and the real-world data have identical resolutions). After the data goes through the algorithm, I will gauge performance based on the following:
- number of clusters (should be about 500)
- volumetric density of the data (should be about 2%)
Plotting Raw Synthetic Data:
In [17]:
from random import randrange as rand
from mpl_toolkits.mplot3d import axes3d, Axes3D

def generatePointSet():
    center = (rand(0, 99), rand(0, 99), rand(0, 99))
    toPopulate = []
    for z in range(-1, 5):
        for y in range(-1, 5):
            for x in range(-1, 5):
                curPoint = (center[0]+z, center[1]+y, center[2]+x)
                # only populate valid points
                valid = True
                for dim in range(3):
                    if curPoint[dim] < 0 or curPoint[dim] >= 100:
                        valid = False
                if valid:
                    toPopulate.append(curPoint)
    return set(toPopulate)

def generateTestVolume():
    # create a test volume
    volume = np.zeros((100, 100, 100))
    myPointSet = set()
    for _ in range(rand(500, 800)):
        potentialPointSet = generatePointSet()
        # be sure there is no overlap
        while len(myPointSet.intersection(potentialPointSet)) > 0:
            potentialPointSet = generatePointSet()
        for elem in potentialPointSet:
            myPointSet.add(elem)
    # populate the true volume
    for elem in myPointSet:
        volume[elem[0], elem[1], elem[2]] = 60000
    # introduce noise into a copy; note that the clean volume, not the noisy
    # one, is returned, since clusterThresh expects a thresholded input
    noiseVolume = np.copy(volume)
    for z in range(noiseVolume.shape[0]):
        for y in range(noiseVolume.shape[1]):
            for x in range(noiseVolume.shape[2]):
                if not (z, y, x) in myPointSet:
                    noiseVolume[z][y][x] = rand(0, 10000)
    return volume

foreground = generateTestVolume()

# displaying the random clusters
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
z, y, x = foreground.nonzero()
ax.scatter(x, y, z, zdir='z', c='r')
plt.title('Random Foreground Clusters')
plt.show()
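Before running the algorithm, a quick check (a minimal sketch; it assumes only the foreground volume built above) of how close the synthetic volume comes to the stated 2% foreground target:
In [ ]:
# fraction of nonzero (synapse) voxels in the synthetic volume
density = 1.0 * np.count_nonzero(foreground) / foreground.size
print('foreground fraction: ' + str(density))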
Expectation for Synthetic Data: I expect that connected components will detect roughly 500 to 800 clusters, since the generator places between 500 and 800 non-overlapping point sets (adjacent sets can still merge into a single component).
Running Algorithm on Synthetic Data:
In [18]:
clusterAnalysis(foreground, lowerFence = 0, upperFence = 1000, sliceVis = 0, bins=20)
In [19]:
clusterAnalysis(foreground, lowerFence = 40, upperFence = 500, sliceVis = 0, bins=20)
I will first be testing to ensure that the connected components step functions on a subset of 5 slices, measuring the time it takes to ensure it is fast. I will also test the volume thresholding, reporting the average volumes before and after thresholding takes place, and plotting a slice to ensure that it is working properly. I will then repeat these tests on the entire volume.
In [25]:
dataSubset = tIO.unzipChannels(tIO.loadTiff('../data/SEP-GluA1-KI_tp1.tif'))[0][125:130]
plt.imshow(dataSubset[0], cmap="gray")
plt.axis('off')
plt.title('Raw Data Slice at z=0')
plt.show()
In [26]:
%%time
postAdaptive = cLib.adaptiveThreshold(dataSubset, 64, 64)
In [27]:
%%time
postKnn = cLib.knn_filter(postAdaptive, 2)
In [28]:
plt.imshow(postKnn[0])
plt.show()
In [29]:
%%time
clusterAnalysis(postKnn, lowerFence = 10, upperFence = 180, sliceVis = 0, bins = 100)