In [1]:
# This line tells matplotlib to include plots here
%matplotlib inline
import numpy as np # We'll need numpy later
from scipy.stats import kstest, ttest_ind, ks_2samp, zscore
import matplotlib.pyplot as plt # This lets us access the pyplot functions
Let us assume that the true distribution of a process is described by a normal distribution with $\mu=5$ and $\sigma=1$. You have a measurement technique that allows you to sample n points from this distribution. In Matlab this is a random number generator that draws from the desired normal distribution via the call normrnd(mu, sigma, [1, n]). Sample from this normal distribution for n = 1 to 50 (i.e., n = 1:50). Create a plot of the standard deviation of the calculated mean for each n when you repeat the sampling 1000 times (i.e., you will repeat your n observations 1000 times and calculate the sample mean for each of the 1000 trials).
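Since this notebook uses Python rather than Matlab, note that the NumPy equivalent of normrnd is np.random.normal; a minimal sketch of a single draw of n = 10 points (mu and sigma are the assignment's values):
In [ ]:
# NumPy equivalent of Matlab's normrnd(mu, sigma, [1, n])
sample = np.random.normal(loc=5.0, scale=1.0, size=(1, 10))
print(sample)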
In [2]:
# Initial code here
numRepeats = 1000
mu, sigma = 5.0, 1.0
n = 50
sampleMean = np.empty((n, numRepeats))
nVec = np.arange(1, n + 1)
for i in range(numRepeats):
    for j in range(n):
        # Mean of j+1 draws from N(mu, sigma)
        sampleMean[j, i] = np.mean(np.random.normal(loc=mu, scale=sigma, size=(j + 1, )))
In [3]:
# Answer to 1a here
plt.plot(nVec, np.std(sampleMean, axis=1), label='stddev sample mean')
plt.plot(nVec, 1./np.sqrt(nVec), 'r', label='1/sqrt(n)')
plt.legend();
plt.title('Stdev of mean estimate v. n, 1000 trials');
plt.ylabel('Stdev');
plt.xlabel('n');
This shows that the standard deviation of the sample mean (also called the standard error) follows a $1/\sqrt{n}$ relationship.
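As a quick numerical check (a sketch reusing the sampleMean array computed above), we can compare the empirical standard error at a few values of n against the theoretical $\sigma/\sqrt{n}$:
In [ ]:
# Empirical vs. theoretical standard error of the mean
empirical = np.std(sampleMean, axis=1)
theoretical = sigma / np.sqrt(nVec)
for nn in (1, 10, 50):
    print(nn, empirical[nn - 1], theoretical[nn - 1])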
In [4]:
# Answer to 1b here
plt.boxplot(np.transpose(sampleMean));
plt.ylabel('Values');
plt.xlabel('n');
The box plot shows that the sample means converge on the true mean (5 in this case) as n increases. It displays the overall distribution of values in greater detail, while the standard line plot is easier to read when a single value is plotted over the x-axis.
In [5]:
# Answer to 1c here
sampleVec = sampleMean[4, :]  # Sampling distribution of the mean at n = 5
plt.hist(sampleVec, density=True)
plt.xlabel('Value')
plt.ylabel('Density')
# Normalize to zero mean and unit stdev, then test against the standard normal
P = kstest(zscore(sampleVec), 'norm')[1]
print(P)
The p-value is above our threshold, so we do not reject the null hypothesis that the sample means are normally distributed.
In [6]:
# Answer to 1d here
sampleVec = sampleMean[21, :]  # Sampling distribution of the mean at n = 22
plt.hist(sampleVec, density=True)
plt.xlabel('Value')
plt.ylabel('Density')
# Normalize to zero mean and unit stdev, then test against the standard normal
P = kstest(zscore(sampleVec), 'norm')[1]
print(P)
Nothing changes in this case. For both low and high n the sample mean is normally distributed.
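To make that comparison explicit, a small loop (a sketch reusing the zscore-and-kstest recipe from above; the particular values of n are arbitrary choices) prints the KS p-value at several sample sizes:
In [ ]:
# KS p-values for the normalized sample means at several values of n
for nn in (1, 5, 22, 50):
    p = kstest(zscore(sampleMean[nn - 1, :]), 'norm')[1]
    print('n = %d, p = %.3f' % (nn, p))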
In [7]:
# Answer 2a here
Wvec = np.random.weibull(1, size=(1000, ))
plt.hist(Wvec);
This doesn't look anything like a normal distribution; it has a much heavier right tail.
In [8]:
# Answer 2b here
# Note the transposed shape relative to part 1: rows are trials, columns are n
sampleMean = np.empty((1000, n))
for i in range(sampleMean.shape[0]):
    for j in range(n):
        sampleMean[i, j] = np.mean(np.random.weibull(1.0, size=(j + 1, )))
plt.boxplot(sampleMean);
In [9]:
nVec = np.arange(1, n+1)
plt.plot(nVec, np.std(sampleMean, axis=0), label='stddev sample mean')
plt.plot(nVec, 1./np.sqrt(nVec), 'r', label='1/sqrt(n)')
plt.legend();
plt.title('Stdev of mean estimate v. n, 1000 trials');
plt.ylabel('Stdev');
plt.xlabel('n');
This doesn't differ from the result in part 1: the standard error still follows $1/\sqrt{n}$ even though the underlying distribution is not normal.
In [10]:
sampleVec = sampleMean[:, 2]  # Sampling distribution of the mean at n = 3
plt.hist(sampleVec, density=True)
plt.xlabel('Value')
plt.ylabel('Density')
# Normalize to zero mean and unit stdev, then test against the standard normal
Pnorm = kstest(zscore(sampleVec), 'norm')[1]
Pweib = kstest(sampleVec, 'expon')[1]
# Scipy and numpy's weibull distributions are different, but a weibull(a=1)
# is the same as an exponential distribution.
print(Pnorm)
print(Pweib)
This distribution is closer to normal.
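As a sanity check on the comment above, we can test raw (unaveraged) draws from numpy's weibull with shape a = 1 directly against scipy's standard exponential; a large p-value is consistent with the two being the same distribution:
In [ ]:
# numpy's weibull(a=1) should match scipy's standard exponential
raw = np.random.weibull(1.0, size=(1000, ))
print(kstest(raw, 'expon')[1])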
In [11]:
sampleVec = sampleMean[:, 19]  # Sampling distribution of the mean at n = 20
plt.hist(sampleVec, density=True)
plt.xlabel('Value')
plt.ylabel('Density')
# Normalize to zero mean and unit stdev, then test against the standard normal
Pnorm = kstest(zscore(sampleVec), 'norm')[1]
Pweib = kstest(sampleVec, 'expon')[1]
print(Pnorm)
print(Pweib)
This also looks normally distributed.
In [12]:
# Answer to 2f
# The distribution changes shape, but the same outcome holds: the sampling
# distribution morphs to look normally distributed as n increases.
In [13]:
dOne = lambda n: np.random.normal(loc=1.0, scale=1.0, size=(n, ))
dTwo = lambda n: np.random.normal(loc=3.0, scale=1.0, size=(n, ))
Hint: It'd be helpful to define a function that does this for you at this point.
In [14]:
def falseNeg(n=3, nTrial=100, p=0.05):
    # Count trials where the t-test fails to reject even though the
    # two distributions truly differ (false negatives)
    compare = np.empty((nTrial, ))
    for ii, _ in enumerate(compare):
        compare[ii] = ttest_ind(dOne(n), dTwo(n), equal_var=False)[1]
    return sum(compare > p)
print(falseNeg())
In reality these are two different distributions, so each non-rejection is an occasion when the test missed a difference we should have detected (a false negative).
In [15]:
def falsePos(n=3, nTrial=100, p=0.05):
    # Count trials where the t-test rejects even though both samples come
    # from the same distribution (false positives)
    compare = np.empty((nTrial, ))
    for ii in range(len(compare)):
        compare[ii] = ttest_ind(dOne(n), dOne(n), equal_var=False)[1]
    return sum(compare < p)
print(falsePos())
These samples are drawn from the same distribution, so the null hypothesis is true and every rejection is a false positive.
In [16]:
print(falsePos(nTrial=1000))
print(falsePos(nTrial=10000))
The expected number of false positives is the p-value threshold times the number of trials, so it is proportional to the number of trials run.
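A quick check of that arithmetic: each true-null comparison rejects with probability p, so the expected count at the default p = 0.05 is 0.05 × nTrial.
In [ ]:
# Expected vs. observed false positive counts at p = 0.05
for nTrial in (100, 1000, 10000):
    print(nTrial, 'expected:', 0.05 * nTrial, 'observed:', falsePos(nTrial=nTrial))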
In [17]:
nVec = np.array(range(3, 31))
fPos = np.empty(nVec.shape)
fNeg = np.empty(nVec.shape)
for nn, nItem in enumerate(nVec):
    fPos[nn] = falsePos(n=nItem)
    fNeg[nn] = falseNeg(n=nItem)
plt.plot(nVec, fPos, label='false positives')
plt.plot(nVec, fNeg, label='false negatives')
plt.legend();
plt.xlabel('n');
The number of false positives does not depend on $n$, while the number of false negatives does.
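For comparison, the false negative rate can also be computed in closed form from the power of the two-sample t-test. A sketch assuming statsmodels is available (it is not imported above), with Cohen's d = (3 - 1)/1 = 2 for dOne versus dTwo:
In [ ]:
# Theoretical false negative counts: nTrial * (1 - power)
from statsmodels.stats.power import TTestIndPower  # assumes statsmodels is installed
for nn in (3, 10, 30):
    power = TTestIndPower().power(effect_size=2.0, nobs1=nn, alpha=0.05)
    print('n = %d, expected false negatives out of 100: %.1f' % (nn, 100 * (1 - power)))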
In [18]:
dThree = lambda n: np.random.normal(loc=3.0, scale=2.0, size=(n, ))
def falseNegB(n=3, nTrial=100, p=0.05):
    # Same as falseNeg, but against a wider second distribution (sigma = 2)
    compare = np.empty((nTrial, ))
    for ii, _ in enumerate(compare):
        compare[ii] = ttest_ind(dOne(n), dThree(n), equal_var=False)[1]
    return sum(compare > p)
print(falseNegB())
The number of false negatives increases with sigma.
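One way to see why: the effect size shrinks. Using the common convention of pooling the two variances, Cohen's d drops from 2 (dOne vs. dTwo) to about 1.26 (dOne vs. dThree):
In [ ]:
# Effect sizes (Cohen's d) for the two comparisons
dPooled = np.sqrt((1.0**2 + 2.0**2) / 2)  # pooled std of dOne vs. dThree
print((3.0 - 1.0) / 1.0)      # dOne vs. dTwo: d = 2
print((3.0 - 1.0) / dPooled)  # dOne vs. dThree: d ~ 1.26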
In [19]:
nVec = np.array(range(3, 31))
fPos = np.empty(nVec.shape)
fNeg = np.empty(nVec.shape)
for nn, nItem in enumerate(nVec):
    fPos[nn] = falsePos(n=nItem, p=0.01)
    fNeg[nn] = falseNeg(n=nItem, p=0.01)
plt.plot(nVec, fPos, label='false positives')
plt.plot(nVec, fNeg, label='false negatives')
plt.legend();
plt.xlabel('n');
Lowering the threshold to p = 0.01 decreases the number of false positives but increases the number of false negatives.
In this exercise we're going to explore some basic concepts of statistics and use them to build up to some more advanced ideas. To examine these ideas we're going to consider a classic of molecular biology: the Luria-Delbrück experiment.
In [20]:
repOne = np.loadtxt("data/wk2/expt_rep1.csv")
repTwo = np.loadtxt("data/wk2/expt_rep2.csv")
Fill in the function below, keeping track of normal and mutant cells. Then make a second function, CVofNRuns, that runs the experiment 3000 times. You can assume a culture size of 120000 cells and a mutation rate of 0.0001 per cell per generation. What does the distribution of outcomes look like?
In [21]:
# Runs the simulation many times, and looks at how the distribution of
# mutant counts spreads out (the Fano factor is variance/mean)
def simLuriaDelbruck(cultureSize, mutationRate):
    nCells, nMuts = 1, 0  # Start with 1 non-resistant cell
    for _ in range(int(np.floor(np.log2(cultureSize)))):  # number of generations
        nCells = 2 * nCells  # Double the number of cells, simulating division
        newMuts = np.random.poisson(nCells * mutationRate)  # de novo mutants
        nMuts = 2 * nMuts + newMuts  # Previous mutants divide, new ones add
        nCells = nCells - newMuts  # Non-resistant population goes down by newMuts
    return nMuts

def CVofNRuns(N, cultureSize, mutationRate):
    return np.fromiter((simLuriaDelbruck(cultureSize, mutationRate) for x in range(N)), dtype=int)

cvs = CVofNRuns(3000, 120000, 0.0001)
plt.hist(cvs, bins=30);
Hint: Each experiment varies slightly in the amount of time it was run. The absolute values of the numbers don't matter so much as their variation. You'll need to correct for this by dividing by the mean of the results.
In [22]:
ks_2samp(repOne/np.mean(repOne), repTwo/np.mean(repTwo))
Out[22]:
In [23]:
ks_2samp(repOne/np.mean(repOne), cvs/np.mean(cvs))
Out[23]:
We could loop across different values of the culture size and mutation rate to check that we always get the same outcome.
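A sketch of such a sweep (the particular grids of culture sizes and mutation rates below are illustrative choices, not part of the original assignment):
In [ ]:
# Sweep culture size and mutation rate; compare each simulation to replicate 1
for size in (60000, 120000, 240000):
    for rate in (0.00005, 0.0001, 0.0002):
        sim = CVofNRuns(3000, size, rate)
        p = ks_2samp(repOne / np.mean(repOne), sim / np.mean(sim))[1]
        print(size, rate, p)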