In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
from pandas import Series, DataFrame
import numpy.random as rnd
import scipy.stats as st
In [2]:
# Ten draws from the standard normal distribution (randn is a thin
# convenience wrapper around standard_normal).
rnd.standard_normal(10)
Out[2]:
In [3]:
def plot_cdf(data, plot_range=None, scale_to=None, **kwargs):
    """Plot a binned empirical cumulative distribution of ``data``.

    The sample is histogrammed into as many equal-width bins as it has
    values, spanning its minimum to its maximum. The running total of the
    bin counts is then drawn against the right-hand bin edges with
    ``plt.plot``.

    Parameters
    ----------
    data : array-like of numbers
        The sample. It is converted to a flat float64 array and must
        contain at least one value.
    plot_range : (xmin, xmax) pair, optional
        Horizontal extent of the curve. The curve is held at 0 from ``xmin``
        up to the first right-hand bin edge and held at its final height out
        to ``xmax``. Defaults to the minimum and maximum of ``data``.
    scale_to : number, optional
        When given, the cumulative counts are divided by the sample size and
        multiplied by this value (e.g. 1 for proportions, 100 for percent).
        When omitted, raw cumulative counts are plotted.
    **kwargs
        Forwarded unchanged to ``plt.plot``.

    Returns
    -------
    The value returned by ``plt.plot``.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    values = np.asarray(data, dtype=np.float64).ravel()
    num_points = values.size
    if num_points == 0:
        raise ValueError('plot_cdf() requires at least one data point')

    # np.histogram derives the bin edges from the sample's own min/max, so
    # there is no need to sort the data first.
    counts, bin_edges = np.histogram(values, bins=num_points)
    xvalues = bin_edges[1:]
    yvalues = np.cumsum(counts)

    if plot_range is None:
        xmin, xmax = values.min(), values.max()
    else:
        xmin, xmax = plot_range

    # Pad both ends: start at (xmin, 0), stay at 0 up to the first right-hand
    # bin edge, and carry the final cumulative value out to xmax.
    xvalues = np.concatenate([[xmin, xvalues[0]], xvalues, [xmax]])
    yvalues = np.concatenate([[0.0, 0.0], yvalues, [yvalues.max()]])

    if scale_to is not None:
        # yvalues is already float (padded with 0.0), so this is true division.
        yvalues = yvalues / num_points * scale_to

    return plt.plot(xvalues, yvalues, **kwargs)
In [4]:
# Empirical CDF of 50 standard-normal draws, scaled to 1 and drawn over
# the x-range [-3, 3].
plot_cdf(rnd.randn(50), plot_range=[-3, 3], scale_to=1.0, lw=2)
# Dashed horizontal guides at the 0.25 / 0.5 / 0.75 levels.
for level in (0.25, 0.5, 0.75):
    plt.axhline(y=level, color='black', ls='--', lw=2)
In [5]:
# Read the newline-separated float values stored in the text file s057.txt.
wingLens = np.fromfile('s057.txt', dtype=np.float64, sep='\n')
# Empirical CDF scaled to 100 (hence the 'percent' axis) over x in [30, 60].
plot_cdf(wingLens, plot_range=[30, 60], scale_to=100, lw=2)
plt.xlabel('housefly wing length (x.1mm)')
plt.ylabel('percent')
Out[5]:
In [6]:
# Fit a normal distribution to the wing lengths and freeze it.
mean, std = st.norm.fit(wingLens)
rvNorm = st.norm(mean, std)
# Evaluation grid covering three standard deviations either side of the mean.
xx = np.linspace(mean - 3 * std, mean + 3 * std)

plt.figure(figsize=(8, 3))

# Left panel: fitted normal CDF with the empirical CDF drawn over it.
plt.subplot(121)
plt.plot(xx, rvNorm.cdf(xx), linewidth=3)
plot_cdf(wingLens, scale_to=1, lw=2)
plt.title('cdf simulation')

# Right panel: probability plot of the sample against the normal distribution.
plt.subplot(122)
st.probplot(wingLens, dist='norm', plot=plt)
Out[6]:
In [7]:
# Summary figures for the modelled sample: size N, mean, and serr
# (presumably the standard error of the mean -- TODO confirm the source).
N, mean, serr = 4857, 63.8, 0.06
# Scale serr by sqrt(N) to obtain the spread used for the normal model.
std = np.sqrt(N) * serr
rvNorm = st.norm(mean, std)

# Evaluation grid: three standard deviations either side of the mean.
xmin = mean - 3 * std
xmax = mean + 3 * std
xx = np.linspace(xmin, xmax, num=200)

plt.figure(figsize=(8, 3))

# Left panel: cumulative distribution function.
plt.subplot(121)
plt.plot(xx, rvNorm.cdf(xx))
plt.title('cdf')

# Right panel: probability density function.
plt.subplot(122)
plt.plot(xx, rvNorm.pdf(xx))
plt.title('pdf')
Out[7]:
In [8]:
# Named bands as (label, lower bound, upper bound) on the scale modelled
# by rvNorm.
categories = [
    ('petite', 59, 63),
    ('average', 63, 68),
    ('tall', 68, 71),
]
# Probability that rvNorm falls inside each band: cdf(upper) - cdf(lower).
for cat, h1, h2 in categories:
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('%8s: %.3f' % (cat, rvNorm.cdf(h2) - rvNorm.cdf(h1)))
In [9]:
# Tail probabilities outside the 59..71 span under rvNorm.
too_short = rvNorm.cdf(59)
too_tall = 1 - rvNorm.cdf(71)
# Each print takes a single pre-formatted string so the output is the same
# on Python 2 and Python 3 (the bare print statement fails on Python 3, and
# print(a, b) would show a tuple on Python 2).
print('%s %s' % (too_short, too_tall))
# Lower and upper quartiles of the model.
print('%s %s' % (rvNorm.ppf(0.25), rvNorm.ppf(0.75)))
print(rvNorm.stats(moments='mvks'))  # mean, variance, skew, kurtosis
In [10]:
# Weibull (minimum) distribution: beta is passed as the shape parameter,
# eta as the scale.
beta = 1.5
eta = 1.0
rvWb = st.weibull_min(beta, scale=eta)

# Draw 200 random variates and build a 200-point grid spanning their range.
wbrvs = rvWb.rvs(size=200)
xx = np.linspace(wbrvs.min(), wbrvs.max(), num=200)

plt.figure(figsize=(8, 3))

# Left panel: cumulative distribution function.
plt.subplot(121)
plt.plot(xx, rvWb.cdf(xx))
plt.title('cdf')

# Right panel: probability density function.
plt.subplot(122)
plt.plot(xx, rvWb.pdf(xx))
plt.title('pdf')
Out[10]:
In [11]:
# Wrap the Weibull sample in a one-column frame and show a 30-bin histogram.
df = DataFrame({'wbrvs': wbrvs})
df.hist(bins=30)
Out[11]:
In [12]:
# Weibull CDF from the frozen distribution (thicker line) ...
plt.plot(xx, rvWb.cdf(xx), linewidth=3)
# ... with the empirical CDF of the drawn sample, scaled to 1, on top.
plot_cdf(wbrvs, plot_range=None, scale_to=1, lw=2)
plt.title('cdf simulation')
Out[12]:
In [13]:
# Binomial distribution with 20 trials and success probability 0.5.
rvBinom = st.binom(20, 0.5)
# Single-argument print(...) keeps the exact Python 2 output (label, space,
# value) while also running on Python 3, where the print statement fails.
print('%s %s' % ('P(wins 12 out of 20 in the game) = ', rvBinom.pmf(12)))
print('%s %s' % ('P(wins at most 7 out of 20 in the game) = ', rvBinom.cdf(7)))
In [19]:
# Support of the binomial distribution with 20 trials: k = 0, 1, ..., 20.
xx = np.arange(21)
plt.figure(figsize=(9, 4))

plt.subplot(1, 2, 1)
# A CDF is right-continuous: the value F(k) holds on [k, k+1). plt.step
# defaults to where='pre', which would draw F(k) over (k-1, k] and shift the
# whole staircase one unit to the left, so request 'post' explicitly.
plt.step(xx, rvBinom.cdf(xx), where='post')
plt.title('binomial cdf $N=20$, $p=0.5$')

plt.subplot(1, 2, 2)
# Centre each bar on its k explicitly; older matplotlib releases aligned
# bars on their left edge by default.
plt.bar(xx, rvBinom.pmf(xx), align='center')
plt.title('binomial pmf $N=20$, $p=0.5$')
Out[19]:
In [30]:
# The birth of the normal distribution: overlay the normal curve that has the
# same mean (n*p) and standard deviation (sqrt(n*p*(1-p))) as the binomial
# with n=20, p=0.5.
mean, std = 20*0.5, np.sqrt(20*0.5*0.5)
rvNorm = st.norm(mean, scale=std)
xn = np.linspace(mean-3*std, mean+3*std, 200)
plt.figure(figsize=(9, 4))

plt.subplot(1, 2, 1)
# where='post': a CDF holds the value F(k) on [k, k+1). The default 'pre'
# would shift the staircase one unit left of the normal CDF it is compared to.
plt.step(xx, rvBinom.cdf(xx), color='lightsteelblue', where='post')
plt.plot(xn, rvNorm.cdf(xn), color='red')

plt.subplot(1, 2, 2)
# align='center' keeps every bar centred on its k so the bars line up with
# the normal density; older matplotlib releases aligned bars on the left edge.
plt.bar(xx, rvBinom.pmf(xx), color='lightsteelblue', align='center')
plt.plot(xn, rvNorm.pdf(xn), color='red')
Out[30]:
In [32]:
# Draw 300 points from a two-dimensional normal with zero mean and the
# default covariance. NOTE: despite its name, rvBinorm holds the sampled
# points (a 300x2 array), not a frozen distribution object.
rvBinorm = st.multivariate_normal(mean=[0, 0]).rvs(300)
df = DataFrame(rvBinorm, columns=['Z1', 'Z2'])
# Bare last expression: the notebook renders the frame itself, and the cell
# no longer relies on the Python 2-only print statement.
df.head()
In [33]:
# Scatter plot of the two sampled coordinates, Z1 on x and Z2 on y.
df.plot(x='Z1', y='Z2', kind='scatter')
plt.title('multivariate gaussian distribution')
Out[33]:
In [ ]: