In [143]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline

In [144]:
n = 5000
samp1 = np.random.randn(n)
samp2 = np.random.randn(n)

plt.hist(samp1, bins=20, color='blue')
plt.hist(samp2, bins=20, color='green', alpha=0.5)
plt.show()



In [145]:
plt.boxplot([samp1, samp2], vert=False)

plt.show()



In [146]:
print np.median(samp1)
print np.median(samp2)


-0.0133045687188
-0.0153377573772

In [147]:
st.mannwhitneyu(samp1, samp2)


Out[147]:
(12472096.0, 0.42335752845465102)


In [148]:
n = 5000
normal_population = np.random.randn(n)

print "normal_population mean = {0}".format(np.mean(normal_population))
print "normal_population std = {0}".format(np.std(normal_population))

plt.hist(normal_population)
plt.show()


normal_population mean = -0.0093684599912
normal_population std = 1.000391475

In [149]:
W, p = st.shapiro(normal_population)

print "W = {0}".format(W)
print "p = {0}".format(p)
print "p < 0.05: " + str(p < 0.05)


W = 0.999568283558
p = 0.340437620878
p < 0.05: False

In [150]:
st.probplot(normal_population, plot=plt)
plt.title(r'Probability Plot: Normal Population')
plt.show()


Verify assumption that obtained simple random samples are independent by making sure $n \lt 0.05N$.


In [151]:
sample_sizes = [30, 50, 100, 200]

def xbar_trials(n):
    num_of_samples = 1000
    samples = []

    for i in np.arange(num_of_samples):
        samples.append(np.random.choice(normal_population, n, replace=False))

    xbar = []
    for sample in samples:
        xbar.append(np.mean(sample))

    print "n = {0}".format(n)
    print "xbar mean = {0}".format(np.mean(xbar))
    print "xbar std = {0}".format(np.std(xbar))
    print "standard error of the mean = {0}".format(np.true_divide(np.std(normal_population), np.sqrt(n)))

    plt.hist(xbar)
    plt.show()

for n in sample_sizes:
    xbar_trials(n)


n = 30
xbar mean = -0.0143859446718
xbar std = 0.186019423527
standard error of the mean = 0.182645659064
n = 50
xbar mean = -0.0156391368293
xbar std = 0.138470152089
standard error of the mean = 0.141476719162
n = 100
xbar mean = -0.00450353967756
xbar std = 0.0978223340057
standard error of the mean = 0.1000391475
n = 200
xbar mean = -0.0136456035503
xbar std = 0.0692339038936
standard error of the mean = 0.0707383595812

In [152]:
W, p = st.shapiro(xbar)

print "W = {0}".format(W)
print "p = {0}".format(p)
print "p < 0.05: " + str(p < 0.05)


W = 0.998783588409
p = 0.744193613529
p < 0.05: False

In [153]:
st.probplot(xbar, plot=plt)
plt.title(r'Probability Plot: $\bar{x}$')
plt.show()



In [154]:
U, p = st.mannwhitneyu(normal_population, xbar)

print "U = " + str(U)
print "p = " + str(p)


U = 2472651.0
p = 0.29221532731

In [155]:
st.pearsonr(np.random.choice(normal_population,1000),xbar)


Out[155]:
(0.036421701889348498, 0.2498570714717871)

In [156]:
st.pearsonr(xbar,xbar)


Out[156]:
(1.0, 0.0)

In [160]:
st.wilcoxon(xbar,xbar)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-160-5604cf31c69d> in <module>()
----> 1 st.wilcoxon(xbar,xbar)

/usr/local/lib/python2.7/site-packages/scipy/stats/morestats.pyc in wilcoxon(x, y, zero_method, correction)
   1976         r = r[d != 0]
   1977 
-> 1978     replist, repnum = find_repeats(r)
   1979     if repnum.size != 0:
   1980         # Correction for repeated elements.

/usr/local/lib/python2.7/site-packages/scipy/stats/stats.pyc in find_repeats(arr)
    257 
    258     """
--> 259     v1,v2, n = futil.dfreps(arr)
    260     return v1[:n],v2[:n]
    261 

ValueError: failed to create intent(cache|hide)|optional array-- must have defined dimensions but got (0,)

In [161]:
t1 = [10, 10, 80, 80, 160, 160, 160, 160, 160, 320, 320]
t2 = [320, 320, 320, 320, 640, 640, 640, 640, 640, 1280, 1280]
st.mannwhitneyu(t1, t2)


Out[161]:
(4.0, 8.4619136067419631e-05)

In [162]:
m1 = [7, 7, 33, 4, 20, 4, 59, 91, 5, 76, 287, 472, 52, 19, 128, 28, 103, 25, 68, 17, 109, 3]
m2 = [115, 412, 200, 55, 62, 253, 219, 225, 122, 245, 129, 168, 239, 71, 118, 130, 12]
st.mannwhitneyu(m1, m2)


Out[162]:
(68.0, 0.00039467323850771424)

In [165]:
st.ttest_ind(m1, m2)


Out[165]:
(-2.6524265768258233, 0.011701124844884635)

In [164]:
st.ttest_ind(m1, m2, equal_var=False)


Out[164]:
(-2.6941366224324623, 0.010628198674800927)