In [52]:
# Import packages
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm as CM
from matplotlib import mlab as ML
import pandas as pd
import numpy as np
import itertools
from scipy import stats
from scipy.stats import poisson
from scipy.stats import lognorm
from decimal import *
# Read in data
df = pd.read_csv('../output.csv')
In [53]:
import seaborn as sns
fs = 18
lfs = 14
x = df['synapses']
y = df['unmasked']/1e4
with sns.axes_style('white'):
    g = sns.jointplot(x=x, y=y, kind='hex', color='r',
                      xlim=(x.min()-0.02*np.ptp(x), x.max()+0.02*np.ptp(x)),
                      ylim=(y.min()-0.02*np.ptp(y), y.max()+0.02*np.ptp(y)),
                      joint_kws={'gridsize': 40}, marginal_kws={'bins': 40}, stat_func=None);
plt.gcf().axes[1].set_title("JointPlot of Synapse Count vs. Unmasked Count", fontsize = fs)
plt.gcf().axes[0].set_xlabel("Synapse Count", fontsize = fs)
plt.gcf().axes[0].set_ylabel("Unmasked Count (1e4)", fontsize=fs)
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/jointPlotSynUm.png', format='png', dpi=300)
plt.show()
Weight each bin's synapse count by its fraction of unmasked voxels (each bin spans 64x64x48 voxels):
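Concretely, the weighting rescales each bin's raw count to what it would be if every voxel were unmasked: $\text{weighted} = \text{synapses} \times \frac{64 \cdot 64 \cdot 48}{\text{unmasked}}$. For example, a bin with 30 synapses and half of its 196,608 voxels unmasked gets a weighted count of 60.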
In [54]:
nvox = 64*64*48
df['weighted'] = df['synapses']/df['unmasked']*nvox
syn_wt = df['weighted']
print "There are", df['weighted'].isnull().sum(), "blank bins"
plt.hist(syn_wt, bins = 40, range = (syn_wt.min(), syn_wt.max()),
         color = 'pink');
plt.xlabel('Weighted number of synapses');
plt.ylabel('Count');
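The blank bins come from division by zero in the weighting above; a quick check (a sketch, assuming df as loaded above) is to count the fully masked bins, which should correspond to the NaN weights printed:

# NaN weights arise when a bin has no unmasked voxels at all
print (df['unmasked'] == 0).sum(), "bins have no unmasked voxels"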
Only keep bins with more than 50% unmasked voxels:
In [55]:
fs = 18
lfs = 14
dfthr = df[df['unmasked']>nvox*0.5] # Thresholded data frame
syn_wt_thresh = dfthr['weighted']
plt.figure()
plt.hist(syn_wt_thresh, bins = 40, range = (syn_wt_thresh.min(), 500),  # truncate the sparse upper tail at 500
         color = 'pink');
plt.title('Distribution of Weighted Synapses per bin', fontsize = fs)
plt.xlabel('Weighted number of synapses for thresholded bins',fontsize = fs);
plt.ylabel('Count',fontsize = fs);
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/histW.png', format='png', dpi=300)
plt.show()
Look at different Z-slices:
In [56]:
byZ = pd.pivot_table(dfthr, index=['cx','cy'], columns='cz', values='weighted', aggfunc=np.sum);
byZ.hist(bins=40, figsize=(12,12));
To test whether the weighted synapse counts follow a Poisson distribution, we use the chi-squared goodness-of-fit statistic, where $O_i$ and $E_i$ are the observed and expected counts in bin $i$:

$\chi^2 = \sum_i{\frac{(O_i-E_i)^2}{E_i}}$

First we simulate the power of this test: under the null, samples are drawn from the Poisson itself; under the alternative, from a geometric distribution.
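As a quick sanity check (a toy sketch with made-up counts, not the dataset), the formula matches scipy.stats.chisquare:

obs = np.array([18., 25., 30., 27.])  # toy observed counts
exp = np.array([20., 25., 28., 27.])  # toy expected counts (same total)
chi2_manual = np.sum((obs - exp)**2 / exp)
chi2_scipy, p = stats.chisquare(obs, f_exp=exp)
print chi2_manual, chi2_scipy, p  # the manual and scipy statistics agree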
In [57]:
alpha = 0.05
S = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100, 200, 300, 400, 500, 1000, 2000] # sample sizes
ite = 100 # Monte Carlo iterations per sample size
pow_null = []
for j in S:
    m = 10  # number of bins
    mu = 3  # lambda of the Poisson distribution
    n = j   # sample size
    p_value = []
    boundary = np.linspace(0, 10, m + 1)
    cumu_prob = poisson.cdf(boundary, mu = mu)
    # Expected bin probabilities under Poisson(mu); the final append folds the
    # remaining upper tail into the last bin
    exp_prob = []
    for i in range(0, m):
        if i == 0:
            exp_prob = np.append(exp_prob, cumu_prob[i])
        if i > 0:
            exp_prob = np.append(exp_prob, cumu_prob[i] - cumu_prob[i - 1])
        if i == (m - 1):
            exp_prob = np.append(exp_prob, 1 - sum(exp_prob))
    exp_freq = exp_prob * n
    expected_values = exp_freq
    for iteration in range(0, ite):
        sample = np.random.poisson(mu, n)
        # Observed counts per bin, via the cumulative counts at each boundary
        bin_content = []
        for i in range(0, m + 1):
            bin_content = np.append(bin_content, sum(sample <= boundary[i]))
        obe_freq = []
        for i in range(0, m + 1):
            if i == 0:
                obe_freq = np.append(obe_freq, bin_content[i])
            if i > 0:
                obe_freq = np.append(obe_freq, bin_content[i] - bin_content[i - 1])
        observed_values = obe_freq
        test_value = stats.chisquare(observed_values,
                                     f_exp = expected_values)[1]
        p_value = np.append(p_value, test_value)
    # Power = fraction of iterations in which the test rejects at level alpha
    pow_null = np.append(pow_null, (Decimal(sum(p_value < alpha))) / Decimal(len(range(0, ite))))
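For reference, the same Monte Carlo power estimate can be written more compactly. This is a sketch, not a drop-in replacement: power_vs_poisson is our own hypothetical helper, it replaces the index bookkeeping above with np.diff on the Poisson CDF, and it folds samples above the last edge into an explicit tail bin so observed and expected totals both equal n:

def power_vs_poisson(sampler, n, mu=3, m=10, ite=100, alpha=0.05):
    # Fraction of simulations in which the chi-squared GoF test rejects Poisson(mu)
    edges = np.linspace(0, 10, m + 1)
    cdf = poisson.cdf(edges, mu=mu)
    # Bin probabilities: P(X <= 0), P(0 < X <= 1), ..., P(9 < X <= 10), P(X > 10)
    probs = np.concatenate((cdf[:1], np.diff(cdf), [1 - cdf[-1]]))
    rejections = 0
    for _ in range(ite):
        sample = sampler(n)
        counts = np.array([np.sum(sample <= e) for e in edges])
        obs = np.concatenate((counts[:1], np.diff(counts), [n - counts[-1]]))
        rejections += stats.chisquare(obs, f_exp=n * probs)[1] < alpha
    return rejections / float(ite)

# e.g. under the null the rejection rate should hover near alpha:
# print power_vs_poisson(lambda size: np.random.poisson(3, size), 100)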
In [58]:
alpha = 0.05
S = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100, 200, 300, 400, 500, 1000, 2000] # sample sizes
ite = 100
pow_alt = []
for j in S:
    m = 10  # number of bins
    mu = 3  # lambda of the Poisson null
    n = j   # sample size
    p_value = []
    boundary = np.linspace(0, 10, m + 1)
    cumu_prob = poisson.cdf(boundary, mu = mu)
    # Expected bin probabilities under the Poisson null, exactly as above
    exp_prob = []
    for i in range(0, m):
        if i == 0:
            exp_prob = np.append(exp_prob, cumu_prob[i])
        if i > 0:
            exp_prob = np.append(exp_prob, cumu_prob[i] - cumu_prob[i - 1])
        if i == (m - 1):
            exp_prob = np.append(exp_prob, 1 - sum(exp_prob))
    exp_freq = exp_prob * n
    for iteration in range(0, ite):
        # Samples now come from the geometric alternative, not the Poisson null
        sample = np.random.geometric(p = 0.5, size = n)
        bin_content = []
        for i in range(0, m + 1):
            bin_content = np.append(bin_content, sum(sample <= boundary[i]))
        obe_freq = []
        for i in range(0, m + 1):
            if i == 0:
                obe_freq = np.append(obe_freq, bin_content[i])
            if i > 0:
                obe_freq = np.append(obe_freq, bin_content[i] - bin_content[i - 1])
        observed_values = obe_freq
        expected_values = exp_freq
        test_value = stats.chisquare(observed_values,
                                     f_exp = expected_values)[1]
        p_value = np.append(p_value, test_value)
    pow_alt = np.append(pow_alt, (Decimal(sum(p_value < alpha))) / Decimal(len(range(0, ite))))
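For intuition about why the alternative's power climbs toward 1: Poisson(3) and the geometric alternative place very different mass on the binned support (np.random.geometric, like scipy's geom, starts its support at 1, so its mean at p = 0.5 is 2, not 3). A quick comparison, using scipy.stats.geom as an assumed stand-in for the sampler above:

from scipy.stats import geom
k = np.arange(0, 11)
print np.round(poisson.pmf(k, mu=3), 3)  # null: Poisson(3), mean 3
print np.round(geom.pmf(k, p=0.5), 3)    # alternative: Geometric(0.5), mean 2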
In [59]:
plt.figure()
plt.scatter(S, pow_null, label = "Null model")
plt.scatter(S, pow_alt, color = "green", label = "Alt model")
plt.xscale("log")
plt.xlabel("Number of samples",fontsize = fs)
plt.ylabel("Power",fontsize = fs)
plt.title("Power of Poisson Distribution test",fontsize = fs)
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.axhline(alpha, color = "red", linestyle = '--', label = "alpha")
plt.legend(loc = 5)
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/poissSim.png', format='png', dpi=300)
plt.show()
Histogram the weighted, thresholded synapse counts. A stepSize of 10 was chosen manually after inspecting several step sizes: to run a chi-squared test, each bin should hold at least 5 events, and 10 was roughly the smallest step that yielded at least 5 events in the middle of the distribution.
In [60]:
stepSize = 10
boundary = np.arange(0,np.ceil(syn_wt_thresh.max()/stepSize)*stepSize,stepSize);
(observedBin,binEdges,dummy) = plt.hist(syn_wt_thresh, bins = boundary, color = 'pink');
print binEdges
The observedBin counts include bins at the tails with fewer than 5 events. We sum the tail bins together to yield aggregate end bins with at least 5 events, modify binEdges accordingly, and use it to compute expectedBin.
In [61]:
# np.where(observedBin < 5) shows the under-filled bins sit at the two ends,
# so we hardcode the merge points: bins 0..firstIdx and bins lastIdx..end
stepSize = 10
boundary = np.arange(0, np.ceil(syn_wt_thresh.max()/stepSize)*stepSize, stepSize);
(observedBin, binEdges, dummy) = plt.hist(syn_wt_thresh, bins = boundary, color = 'pink');
cumObs = np.cumsum(observedBin)
cumObsRev = np.cumsum(observedBin[::-1])[::-1]
firstIdx = 2
lastIdx = 48
# Merge the leading bins (0..firstIdx) and trailing bins (lastIdx..end) into single bins
observedBinNoTail = np.concatenate((cumObs[firstIdx:firstIdx+1], observedBin[firstIdx+1:lastIdx], cumObsRev[lastIdx:lastIdx+1]))
# Rebuild binEdges to match the merged bins; the last edge extends to infinity
binEdges = np.concatenate((binEdges[0:1], binEdges[firstIdx+1:lastIdx+1], np.array([np.inf])))
# Bin centers for plotting (the open-ended last bin is given width stepSize)
binPlot = np.append(np.diff(binEdges[0:len(binEdges)-1]), stepSize)
binPlot = binPlot/2 + binEdges[0:len(binEdges)-1]
print binPlot
N = np.size(syn_wt_thresh)
print "Number of samples: ", N
muObs = np.mean(syn_wt_thresh)
# Expected counts under a Poisson whose mean matches the observed mean
cumu_prob = poisson.cdf(binEdges, mu = muObs)
expectedBin = N*np.diff(cumu_prob)
print expectedBin
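The hardcoded firstIdx and lastIdx work for this dataset but are brittle. A small helper could find them automatically by growing each end bin until it holds at least 5 events; merge_tails below is a hypothetical sketch (our own name, not a library function) that should recover indices compatible with the slicing above:

def merge_tails(counts, min_count=5):
    # Smallest first such that bins 0..first sum to at least min_count
    first = 0
    while np.sum(counts[:first + 1]) < min_count:
        first += 1
    # Largest last such that bins last..end sum to at least min_count
    last = len(counts) - 1
    while np.sum(counts[last:]) < min_count:
        last -= 1
    return first, last

# e.g. firstIdx, lastIdx = merge_tails(observedBin)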
Plot observedBinNoTail against expectedBin, run the chi-squared test, and obtain the p-value:
In [62]:
plt.figure()
plt.scatter(binPlot, observedBinNoTail/1e3, label = "Observed")
plt.scatter(binPlot, expectedBin/1e3, color = "green", label = "Expected")
plt.legend(loc = 5)
plt.title("Observed and Expected events",fontsize = fs)
plt.xlabel("Weighted Synapse Density",fontsize = fs)
plt.ylabel("Count (1e3)",fontsize = fs)
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/chiObsExp.png', format='png', dpi=300)
plt.show()
# ddof=1 because one parameter (the mean) was estimated from the data
(chi2,p) = stats.chisquare(observedBinNoTail, f_exp = expectedBin, ddof = 1)
print "chi-squared statistic: ", chi2
print "p-value: ", p
The chi-squared statistic is enormous, which yields a p-value near 0 and rejects the null hypothesis. The number of samples is on the order of 10^4; according to the power simulation above (Step 5), a sample of this size should yield a power of 1.0 if the true distribution were, for example, geometric rather than Poisson. Graphically it is also clear that the distribution is not Poisson. The immediate problem is that the Poisson distribution has a single parameter (the mean) and no second parameter to control spread: we matched the expected mean to the observed mean, but the spreads of the expected and observed distributions are clearly different. We should retry this with a Gaussian or log-normal distribution in future work.
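As a starting point for that follow-up, here is a minimal sketch of a log-normal fit using the lognorm import at the top of the notebook (the parameters are whatever scipy estimates; nothing here is a validated result):

# Log-normal support is strictly positive, so drop zero-count bins first
pos = syn_wt_thresh[syn_wt_thresh > 0]
shape, loc, scale = lognorm.fit(pos, floc=0)  # pin loc at 0; fit shape and scale
xs = np.linspace(pos.min(), pos.max(), 200)
plt.hist(pos, bins=40, normed=True, color='pink')
plt.plot(xs, lognorm.pdf(xs, shape, loc=loc, scale=scale), 'k-', label='log-normal fit')
plt.xlabel('Weighted number of synapses', fontsize=fs)
plt.ylabel('Density', fontsize=fs)
plt.legend()
plt.show()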