In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# a new addition to our standard imports
import scipy.stats as stats
In [2]:
# scipy.stats.norm.rvs when used like this is pretty much
# equivalent to numpy.random.normal which we've used previously
x = stats.norm.rvs(loc=0, scale=1, size=1000)
In [3]:
# when we call stats.norm like this we're creating
# a "frozen distribution" -- essentially an object
# representing the normal distribution with the given
# parameters. In this case we're specificing the so-called
# "standard normal" distribution -- i.e the normal distribution
# with mean = 0, std dev = 1
STDNORM = stats.norm(loc=0,scale=1)
# now we can get random values from our frozen distribution
y = STDNORM.rvs(size=1000)
In [4]:
# to draw the probability density function (pdf) for our normal
# distribution, we need a set of values of x where we'll evaluate
# the function. numpy.linspace generate the specified number of
# points (in this case 500) within the given interval (-6,6 in
# this example)
xs = np.linspace(-6,6,500)
# we evaluate our pdf at the specified x-values to get back
# the corresponding densities
fx = STDNORM.pdf(xs)
In [5]:
# create a histogram of our sample from the standard normal
plt.hist(y,alpha=0.5,normed=True, # note the used of normed=True to generate
bins=25,label='sample') # a density histogram
# plot the pdf for the standard normal
plt.plot(xs, fx, color='red',
linestyle='dashed', linewidth=2,
label='population')
# note the use of the label arguments above to create
# labels for the legend; by default the legend() fxn
# tries to find the best placement of the legend so as
# not to minimally interfere with the plotted data
plt.legend()
plt.xlabel("Random Variate")
plt.ylabel("Density")
pass
In [6]:
# let's generate the pdf for the SAT example
# we covered in lecture, which was assumed to be
# N(mu=1500,sigma=300)
SAT = stats.norm(loc=1500, scale=300)
xsat = np.linspace(600,2400,500)
fsat = SAT.pdf(xsat)
plt.plot(xsat,fsat,'black')
plt.xlabel("SAT scores")
plt.ylabel("Density")
pass
In [7]:
# In the example from the slides, Pam had a score of 1800
# on her SATs. We can use the scipy.stats.norm.cdf fxn to
# calculate her percentile. cdf returns the cumulative probability to
# the left of the given point. Strictly speaking, to turn
# this into a percentile we need to multiply by hundred
pamcdf = SAT.cdf(1800)
print("Pam's percentile is", pamcdf * 100)
In [8]:
# Let's redraw the PDF, illustrating how to draw a line at
# Pam's score (1800) and shade the area to the left of it
# draw entire pdf first
# note xsat and fsat were defined a couple cells above
plt.plot(xsat, fsat, color='black')
# plot vertical line at 1800 extending from 0 to pdf(1800)
plt.vlines(1800, 0, SAT.pdf(1800), linestyle='dashed', color='k')
# draw area under curve from 600 to 1800
xtoleft = np.linspace(600,1800,500)
ftoleft = SAT.pdf(xtoleft)
plt.fill_between(xtoleft, np.zeros_like(ftoleft), ftoleft, color='gray', alpha=0.75)
plt.xlabel("SAT scores")
plt.ylabel("Density")
pass
In [9]:
# Another example we looked at in class involved
# quality control at a ketchup factory. The volume
# of ketchup in bottles was ~ N(36 oz,0.11 oz). We first
# asked what the probability was that a sampled bottle
# had <= 35.8 oz of ketchup
kmu, kstd = 36, 0.11
ketchup = stats.norm(loc=kmu, scale=kstd)
ketchup.cdf(35.8)
Out[9]:
In [10]:
# then we asked what fraction of bottles had between
# 35.8 and 36.2 oz of ketchup. Here's one way to calculate
# this value. Note the introduction of the scipy.stats.norm.sf
# function (also called the "survival function" = 1 - cdf)
1 - ketchup.cdf(35.8) - ketchup.sf(36.2)
Out[10]:
In [11]:
# let's illustrate this last example with one more figure.
# We'll make this figure slightly fancier by changes its aspect
# ratio, customizing the axes, etc.
# set figure size -- (length, width) in inches
fig = plt.figure(figsize=(10,4))
xvol = np.linspace(kmu - 4*kstd, kmu + 4*kstd, 500)
fvol = ketchup.pdf(xvol)
plt.plot(xvol, fvol, color='black')
# shade area representing fraction of bottles that
# are expected to pass inspection (btw 35.8 and 36.2)
xpass = np.linspace(35.8,36.2,500)
fpass = ketchup.pdf(xpass)
plt.fill_between(xpass, np.zeros_like(fpass), fpass, color='gray', alpha=0.75)
plt.xlabel("Volume (oz)")
plt.ylabel("Density")
#---------------------------------------------------------
# Below here we're just customizing the look of the figure
# get rid of yticks
plt.yticks([])
# get current axes
ax = plt.gca()
# draw xsticks only at the bottom
ax.xaxis.set_ticks_position('bottom')
# remove left, right, and top "spines" surrounding the plot
ax.spines["left"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
pass
In [12]:
# if you find yourself drawing lots of figures like the one above
# you should write a couple of functions to encapsulate the key steps
def norm_plot(mu, sigma, nstds=4, npts=500, **kw):
"""Draw the probability density fxn for N(mu,sigma).
nstds = how many standard deviation to left/right to draw pdf
npts = number of pts over which to evaluate the pdf
**kw = any additional keywords are passed along to pyplot.plot
"""
distribution = stats.norm(loc=mu, scale=sigma)
xmin, xmax = mu-nstds*sigma, mu+nstds*sigma
x = np.linspace(xmin,xmax,npts)
y = distribution.pdf(x)
plt.plot(x, y, **kw) # notice how we pass any additional keywords to the plot fxn
# make it look nice
plt.ylim(0, max(y)*1.1)
plt.yticks([])
ax = plt.gca()
ax.xaxis.set_ticks_position('bottom')
ax.spines["left"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.ylabel("Density")
# return the "frozen" distribution and the axes representing our plot
return distribution, ax
def area_under_distn(distribution, xmin, xmax, npts=500, **kw):
"""Draw the area under the pdf of the given distribution from xmin, xmax.
distribution = a frozen distribution from scipy.stats
"""
x = np.linspace(xmin,xmax,npts)
y = distribution.pdf(x)
plt.fill_between(x, np.zeros_like(y), y, **kw)
In [13]:
# here's how we can use our two fxns defined above to quickly create
# some nice figures with a minimum of code. We'll illustrate it by drawing
# a figure showing the fraction of bottles that are expected NOT to pass
# inspection (the inverse of the previous figure above)
mu, sigma = 36, 0.11
plt.figure(figsize=(10,4))
distn, ax = norm_plot(mu, sigma, color='k')
area_under_distn(distn, mu-4*sigma, 35.8, color='firebrick', alpha=0.5)
area_under_distn(distn, 36.2, mu+4*sigma, color='firebrick', alpha=0.5)
# tweak the plot with labels and additional text
plt.xlabel("Volume (oz)")
# draw some text on the plot. We use the markup language
# LaTeX to draw a nicely formatted shorthand formula
plt.text(36.2, 2, "$X \sim N(36,0.11)$", fontsize=18)
pass
Color arguments to Matplotlib function support HTML named colors like 'firebrick'
above. You can also specify colors use Hexadecimal numbers (also common in web programming) or RGB values using a tuple of values in the range 0-1. For more details see the Matplotlib color documentation.
In [ ]: