Preliminary setup


In [28]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2

np.random.seed(1)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
       '/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)

# chop the data based on x and y coordinate bounds,
# and drop any rows with no unmasked voxels
x_bounds = (409, 3529)
y_bounds = (1564, 3124)

def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:
        return False
    if row[3] == 0:  # unmasked count is zero
        return False

    return True

indices_in_bound, = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_thresholded = csv[indices_in_bound]
n = data_thresholded.shape[0]
true_data = data_thresholded

1) Write code to generate simulated data using a probability distribution model of our data.

The model will be as follows: for each block of space (that is, each row of data), the number of synapses follows a binomial distribution with parameters p=synapses/unmasked and n=unmasked. Data generated by this model will therefore have the same coordinates and unmasked values as the true data, but the synapse counts will differ. Since the number of 'trials' (unmasked voxels) is so high, it may be worth approximating the binomial distributions with Gaussians or Poissons, since drawing binomial random variates should be considerably more computationally expensive than drawing Gaussian or Poisson ones. The Poisson approximation seems especially fitting here since all of the probabilities are extremely small (even the largest is below 1%).
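
As a quick sanity check of that claim, here is a minimal sketch (the counts u and s below are made up, but of the same magnitude as a typical row): Bin(n, p) with n = unmasked and p = synapses/unmasked has mean np and variance np(1-p), which for p << 1 is nearly Pois(np).

from scipy.stats import binom, poisson

u, s = 130000, 160           # hypothetical unmasked-voxel and synapse counts for one bin
p = float(s) / u             # well under 1%
print binom.mean(u, p), binom.var(u, p)    # mean np equals s, variance np(1-p) is just under s
print poisson.mean(s), poisson.var(s)      # mean and variance both equal to s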


In [2]:
# simulates data under a model derived from true_data;
# the function accepts any scipy.stats distribution plus its parameters,
# since binomial, Poisson, and Gaussian are all plausible choices
def run_simulation(true_data, distrib, **kwargs):
    sim_data = np.copy(true_data)
    kwargs['size'] = true_data[:, 4].shape  # one draw per row
    new_synapse_vals = distrib.rvs(**kwargs)
    sim_data[:, 4] = new_synapse_vals  # replace the synapse counts with simulated ones
    return sim_data

# to make sure function works and that poisson is reasonable...
from scipy.stats import poisson

# when approximating Bin(n, p) with a Poisson, lambda = np
# here n = unmasked voxels and p = synapses/unmasked, so np is just the synapse count (column 4)
lambdas = true_data[:, 4]

sim_data = run_simulation(true_data, poisson, mu=lambdas)
print sim_data
print true_data
print np.average(sim_data[:, 4]), np.average(true_data[:, 4])
print np.median(sim_data[:, 4]), np.median(true_data[:, 4])
print np.max(sim_data[:, 4]), np.max(true_data[:, 4])


[[  4.09000000e+02   1.56400000e+03   5.50000000e+01   1.33574000e+05
    2.01000000e+02]
 [  4.09000000e+02   1.56400000e+03   1.66000000e+02   1.39751000e+05
    2.06000000e+02]
 [  4.09000000e+02   1.56400000e+03   2.77000000e+02   1.47200000e+05
    2.23000000e+02]
 ..., 
 [  3.52900000e+03   3.12400000e+03   9.43000000e+02   1.30854000e+05
    1.18000000e+02]
 [  3.52900000e+03   3.12400000e+03   1.05400000e+03   1.49887000e+05
    8.80000000e+01]
 [  3.52900000e+03   3.12400000e+03   1.16500000e+03   1.41682000e+05
    1.42000000e+02]]
[[  4.09000000e+02   1.56400000e+03   5.50000000e+01   1.33574000e+05
    2.04000000e+02]
 [  4.09000000e+02   1.56400000e+03   1.66000000e+02   1.39751000e+05
    2.24000000e+02]
 [  4.09000000e+02   1.56400000e+03   2.77000000e+02   1.47200000e+05
    2.39000000e+02]
 ..., 
 [  3.52900000e+03   3.12400000e+03   9.43000000e+02   1.30854000e+05
    1.21000000e+02]
 [  3.52900000e+03   3.12400000e+03   1.05400000e+03   1.49887000e+05
    9.00000000e+01]
 [  3.52900000e+03   3.12400000e+03   1.16500000e+03   1.41682000e+05
    1.33000000e+02]]
163.243145743 163.250693751
173.0 174.0
526.0 507.0

As expected, the Poisson-simulated data seems pretty accurate. Out of curiosity, let's also try Gaussians and binomials.


In [3]:
from scipy.stats import norm, binom

# per-bin binomial parameters: n = unmasked, p = synapses/unmasked
n = true_data[:, 3]
p = true_data[:, 4] / true_data[:, 3]
binom_var = lambdas * (1 - p)  # np(1-p), the binomial variance, used for the normal approximation
sim_data = run_simulation(true_data, norm, loc=lambdas, scale=np.sqrt(binom_var))
print sim_data
print np.average(sim_data[:, 4]), np.average(true_data[:, 4])
print np.median(sim_data[:, 4]), np.median(true_data[:, 4])
print np.max(sim_data[:, 4]), np.max(true_data[:, 4])


[[  4.09000000e+02   1.56400000e+03   5.50000000e+01   1.33574000e+05
    2.05232563e+02]
 [  4.09000000e+02   1.56400000e+03   1.66000000e+02   1.39751000e+05
    2.41676147e+02]
 [  4.09000000e+02   1.56400000e+03   2.77000000e+02   1.47200000e+05
    2.46048274e+02]
 ..., 
 [  3.52900000e+03   3.12400000e+03   9.43000000e+02   1.30854000e+05
    1.15735328e+02]
 [  3.52900000e+03   3.12400000e+03   1.05400000e+03   1.49887000e+05
    1.12193563e+02]
 [  3.52900000e+03   3.12400000e+03   1.16500000e+03   1.41682000e+05
    1.32270440e+02]]
163.328825321 163.250693751
173.221963518 174.0
482.496698943 507.0

In [4]:
sim_data = run_simulation(true_data, binom, n=n.astype('int'), p=p)
print sim_data
print np.average(sim_data[:, 4]), np.average(true_data[:, 4])
print np.median(sim_data[:, 4]), np.median(true_data[:, 4])
print np.max(sim_data[:, 4]), np.max(true_data[:, 4])


[[  4.09000000e+02   1.56400000e+03   5.50000000e+01   1.33574000e+05
    2.12000000e+02]
 [  4.09000000e+02   1.56400000e+03   1.66000000e+02   1.39751000e+05
    2.00000000e+02]
 [  4.09000000e+02   1.56400000e+03   2.77000000e+02   1.47200000e+05
    2.42000000e+02]
 ..., 
 [  3.52900000e+03   3.12400000e+03   9.43000000e+02   1.30854000e+05
    1.24000000e+02]
 [  3.52900000e+03   3.12400000e+03   1.05400000e+03   1.49887000e+05
    9.10000000e+01]
 [  3.52900000e+03   3.12400000e+03   1.16500000e+03   1.41682000e+05
    1.56000000e+02]]
163.208513709 163.250693751
173.0 174.0
480.0 507.0

Based on these preliminary simulations, all three distributions behave very similarly (as expected). For now I'm going to stick with Poisson distributions: they're mathematically nicer than the binomial, and unlike a Gaussian they're discrete. Since we are simulating discrete data, i.e. numbers of synapses, Poisson seems the most appropriate.

2) Run the simulation many times and then use the results to construct confidence intervals for the mean and median synaptic density.

From the original data, the mean synaptic density is 0.00115002980202 and the median is 0.00119726911912. Let the significance level, alpha, be .01.


In [6]:
observed_mean = 0.00115002980202
observed_median = 0.00119726911912
alpha = .01
n = 10000 # samples of simulated data to obtain
simulations = np.empty((n, true_data.shape[0], true_data.shape[1]))
sims_density = np.empty((n, true_data.shape[0]))

# this will take a long time to run
for i in xrange(n):
    sim_data = run_simulation(true_data, poisson, mu=lambdas)
    simulations[i,:,:]=sim_data
    density_vector = np.apply_along_axis(lambda x:x[4]/x[3], 1, sim_data)
    sims_density[i, :] = density_vector

print simulations[-1, :, :]
print sims_density[-1, :]


[[  4.09000000e+02   1.56400000e+03   5.50000000e+01   1.33574000e+05
    2.06000000e+02]
 [  4.09000000e+02   1.56400000e+03   1.66000000e+02   1.39751000e+05
    2.11000000e+02]
 [  4.09000000e+02   1.56400000e+03   2.77000000e+02   1.47200000e+05
    2.59000000e+02]
 ..., 
 [  3.52900000e+03   3.12400000e+03   9.43000000e+02   1.30854000e+05
    1.04000000e+02]
 [  3.52900000e+03   3.12400000e+03   1.05400000e+03   1.49887000e+05
    1.28000000e+02]
 [  3.52900000e+03   3.12400000e+03   1.16500000e+03   1.41682000e+05
    1.36000000e+02]]
[ 0.00154222  0.00150983  0.00175951 ...,  0.00079478  0.00085398
  0.0009599 ]

In [7]:
avg_density_per_sim = np.empty((n))
median_per_sim = np.empty((n))
for i, dens_vec in enumerate(sims_density[:,]):
    avg_density_per_sim[i] = np.average(dens_vec)
    median_per_sim[i] = np.median(dens_vec)
    
deltas_mean = avg_density_per_sim - np.ones(avg_density_per_sim.shape)*observed_mean
deltas_mean = np.sort(deltas_mean)
deltas_median = median_per_sim - np.ones(median_per_sim.shape)*observed_median
deltas_median = np.sort(deltas_median)

Now we have a sorted list of delta values. At significance level alpha, the relevant critical values are the deltas that cut off alpha/2 of the mass in each tail; with 10,000 simulations and alpha=.01 that would be the 50th smallest and 50th largest deltas (since (10,000)(alpha/2) = 50). The code below uses the 5th from each end instead, which corresponds to alpha=.001 and therefore a slightly wider (99.9%) interval.
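
For reference, a minimal sketch of the general construction (the name delta_ci is purely illustrative; it assumes the deltas are already sorted ascending):

def delta_ci(observed, sorted_deltas, alpha):
    # cut off alpha/2 of the simulated deltas in each tail
    k = max(1, int(len(sorted_deltas) * alpha / 2.0))
    low_delta, high_delta = sorted_deltas[k - 1], sorted_deltas[-k]
    return (observed - high_delta, observed - low_delta)

# e.g. delta_ci(observed_mean, deltas_mean, alpha)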


In [8]:
critical_995 = deltas_mean[4]
critical_005 = deltas_mean[-5]
ci_mean = (observed_mean-critical_005, observed_mean-critical_995)
critical_995 = deltas_median[4]
critical_005 = deltas_median[-5]
ci_median = (observed_median-critical_005, observed_median-critical_995)
print "observed mean:", observed_mean 
print "confidence interval: ", ci_mean
print "observed median:", observed_median 
print "confidence interval: ", ci_median


observed mean: 0.00115002980202
confidence interval:  (0.0011483607857273254, 0.0011517002511928856)
observed median: 0.00119726911912
confidence interval:  (0.0011987123211926844, 0.0012061442708446745)

The confidence interval for the mean looks good, but note that the observed median falls outside its confidence interval. This means that essentially all of the delta values for the median were negative; in other words, in virtually every simulation the median density came out below the one observed from the real data. Let's quickly check this.


In [9]:
print np.any(median_per_sim < observed_median)


True
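
Strictly speaking, np.any only confirms that at least one simulated median fell below the observed value; the stronger check (a one-line sketch, not run here) would be:

print np.all(median_per_sim < observed_median)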

This suggests that our theoretical distribution has less left-skewness than the real data. Recall the 'spike' in the histogram of synaptic density for the real data: that spike sits above the mean, so the real data has a clearly observable left skew. Perhaps our theoretical model is unable to capture that 'spike'?

3) Use data from the previous problem to investigate variance.

First let's consider the standard deviation of the density estimated from the true data: it is 0.000406563246763. Let's compute the same quantity for each simulated sample.


In [10]:
std_per_sim = np.empty(sims_density.shape[0])  # one std dev per simulation
for i, sim in enumerate(sims_density):
    std_per_sim[i] = np.std(sim)
print np.average(std_per_sim)
print np.median(std_per_sim)
print np.min(std_per_sim), np.max(std_per_sim)


0.000417030480024
0.00041702437564
0.000415448836794 0.000418803711971

While the standard deviations of the simulated data are close to the observed one, they are all slightly larger. One possible explanation is the use of Poisson distributions: a Poisson RV has variance equal to $\lambda=np$, while a binomial RV has variance $np(1-p)$. Since $p$ is quite small the difference is slight, but the Poisson approximation does theoretically have more variance.
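
To put a rough number on "slight" (p_typical is an illustrative name; the value is just the average density already computed from true_data):

p_typical = np.average(true_data[:, 4] / true_data[:, 3])  # roughly 0.00115
print np.sqrt(1 - p_typical)  # binomial-to-Poisson std dev ratio, about 0.9994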

We can also use this data to investigate the variance of the observed mean. One way to estimate the std dev of the observed mean is as follows:
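
$$ \widehat{\mathrm{SE}}(\bar{x}) = \frac{s}{\sqrt{N}}, $$

where $s$ is the sample standard deviation of the per-bin densities and $N$ is the number of bins.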


In [11]:
sample_std_dev = 0.000406563246763 # the std deviation computed from true data
std_dev_of_mean = sample_std_dev/np.sqrt(true_data.shape[0])
print std_dev_of_mean


2.14170586912e-06

Under our theoretical model the variances are known, so an estimate is not actually needed. Assuming the bins are independent,

$$ \mathrm{Var}(\bar{x}) = \mathrm{Var}\!\left(\frac{1}{N}\sum_i x_i\right) = \frac{1}{N^2}\sum_i \mathrm{Var}(x_i). $$

We know the theoretical variance of each random variable, since each $S_i \sim \textrm{Pois}(s_i)$, where $S_i$ is the simulated synapse count for bin $i$ and $s_i$ the observed count. Thus the variance of the density $S_i/u_i$ is $s_i/u_i^2$, where $u_i$ is the unmasked count.
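
Putting these together, the theoretical variance of the mean density $\bar{d}$ is

$$ \mathrm{Var}(\bar{d}) = \frac{1}{N^2}\sum_i \frac{s_i}{u_i^2}, $$

which is exactly what the next cell computes.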


In [12]:
var_theoretical = true_data[:, 4]/np.square(true_data[:, 3])
var_mean_theoretical = np.sum(var_theoretical)/np.square(true_data.shape[0])
print var_mean_theoretical
print np.sqrt(var_mean_theoretical)


2.39097797972e-13
4.8897627547e-07

4) Constructing a simpler model

The originally proposed model is quite complicated, since we treat each bin as a separate distribution. Now let's try to build a simpler, yet hopefully still accurate, model. We'll try k-means clustering on the data, but scale the densities up so that they have more influence on the clustering.
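
As an aside, another way to control the density's influence would be to standardize the features and up-weight the density column. A minimal sketch, reusing true_data from above (the sklearn preprocessing step and the weight of 10 are not part of the analysis below, just an alternative to a hand-picked scale factor):

from sklearn.preprocessing import StandardScaler

density = true_data[:, 4] / true_data[:, 3]
features = np.column_stack((true_data[:, :3], density))  # x, y, z, density
features = StandardScaler().fit_transform(features)      # zero mean, unit variance per column
features[:, 3] *= 10.0                                    # hypothetical weight to let density dominate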


In [13]:
from sklearn.cluster import KMeans

scale_factor = 10**9  # blow the densities up so they dominate the distance metric
data_scaled = np.copy(true_data)
data_scaled[:, 3] = (data_scaled[:, 4] / data_scaled[:, 3]) * scale_factor  # replace unmasked with scaled density
data_scaled = data_scaled[:, [0, 1, 2, 3]]  # keep x, y, z, scaled density

kmeans = KMeans(4)
labels = kmeans.fit_predict(data_scaled)
clusters = [[] for i in range(4)]

for data, label in zip(data_scaled[:, ], labels):
    clusters[label].append(data)
    
for cluster in clusters:
    cluster = np.array(cluster)
    cluster[:, -1] = cluster[:, -1]/scale_factor
    print cluster.shape
    print np.mean(cluster[:, -1])
    print np.std(cluster[:, -1])
    print np.min(cluster[:, -1]), np.max(cluster[:, -1])


(14391, 4)
0.0012840360023
0.000102783294679
0.00110561914672 0.00147214697915
(9833, 4)
0.000927575497373
0.000124359741773
0.000658359952208 0.00110549235492
(4541, 4)
0.000389640585865
0.000200732403866
0.0 0.000658224674022
(7271, 4)
0.00166052868523
0.000167487105946
0.00147236682968 0.00338020281217

Looking at the min and max density values for each cluster, we can see that the clustering formed a fairly clean partition of the density range. The third cluster occupies the lowest range, from 0 to about 0.00066; the second picks up roughly where the third ends (0.00066) and runs to about 0.0011; the first covers the next band, from about 0.0011 to 0.0015; and the fourth occupies the upper range, from about 0.0015 to 0.0034.

Let's try using 4 Poissons to model the data, based on this clustering.


In [29]:
clusters = [[] for i in range(4)]
for data, label in zip(true_data[:, ], labels):
    clusters[label].append(data)

# estimate lambda for each cluster by averaging synapses
syn_avgs = [] 
for cluster in clusters:
    cluster = np.array(cluster)
    syn_avgs.append(np.average(cluster[:, -1]))

lambdas = np.empty((true_data.shape[0]))
for i, label in enumerate(labels):
    lambdas[i] = syn_avgs[label]
    
sim_data = run_simulation(true_data, poisson, mu=lambdas)
print np.average(sim_data[:, 4]), np.average(true_data[:, 4])
print np.median(sim_data[:, 4]), np.median(true_data[:, 4])
print np.max(sim_data[:, 4]), np.max(true_data[:, 4])
print np.std(sim_data[:, 4]), np.std(true_data[:, 4])


163.158230658 163.250693751
178.0 174.0
306.0 507.0
64.0516808538 68.9942883318

The mean, median, and std dev of the simulated data are quite close to the true data, although the max value is not. This isn't necessarily terrible, though, because the max value in the true data is likely an outlier.

5) Tests Comparing Models and True Data

Now we'll compare the two models against the true data (and against each other) using a chi-squared test.
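
For reference, the statistic scipy computes is

$$ \chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i}, $$

where here $O_i$ are the true synapse counts and $E_i$ the simulated ones. Small expected counts make the statistic blow up, which is presumably why the cell below restricts to rows with at least the average number of unmasked voxels.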


In [30]:
from scipy.stats import chisquare

#true_data[:, 4] += 5
#sim_data[:, 4] += 5
# sim_data1[:, 4] += 5
#print chisquare(true_data[:, 4], sim_data[:, 4])
#print chisquare(true_data[:, 4], sim_data1[:, 4])

# restrict to rows with at least the average number of unmasked voxels
a = np.where(true_data[:, 3] >= np.average(true_data[:, 3]))
true_data = true_data[a]
sim_data = sim_data[a]    # simulation from the simplified (cluster-based) model
sim_data1 = sim_data1[a]  # simulation from the original per-bin model (carried over from an earlier run; not defined in this cell)
print chisquare(true_data[:, 4], sim_data[:, 4])


Power_divergenceResult(statistic=129385.88694048965, pvalue=0.0)

Note that the complex model is extremely close to the true data (high p-value on the chi-squared test), while the simplified one is not: the test above, which compares the true counts against the simplified model's simulation, gives a p-value of essentially zero.

