In [16]:
import math
import numpy as np

import nsfg
import first
import thinkstats2
import thinkplot

from collections import defaultdict

probability mass function - maps each value to its probability. Alows you to compare two distributions independently from sample size.

probability - frequency expressed as a fraction of the sample size, n.

normalization - dividing frequencies by n.

given a Hist, we can make a dictionary that maps each value to its probability:

n = hist.Total()
d = {}
or x, freq in hist.Items():
    d[x] = freq/n

In [2]:
import thinkstats2
pmf = thinkstats2.Pmf([1,2,2,3,5])

#getting pmf values
print pmf.Items()
print pmf.Values()
print pmf.Prob(2)
print pmf[2]

#modifying pmf values
pmf.Incr(2, 0.2)
print pmf.Prob(2)

pmf.Mult(2, 0.5)
print pmf.Prob(2)

#if you modify, probabilities may no longer add up to 1
#to check:
print pmf.Total()

print pmf.Normalize()
print pmf.Total()

#Copy method is also available


[(1, 0.2), (2, 0.4), (3, 0.2), (5, 0.2)]
[1, 2, 3, 5]
0.4
0.4
0.6
0.3
0.9
0.9
1.0

To plot a PMF:

  • bargraph using thinkplot.Hist
  • as step function: thinkplot.Pmf--for use when large number of smooth values.

In [3]:
from probability import *
live, firsts, others = first.MakeFrames()

first_pmf = thinkstats2.Pmf(firsts.prglngth, label="firsts")
other_pmf = thinkstats2.Pmf(others.prglngth, label="others")
width = 0.45

#cols option makes grid of figures.
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(first_pmf, align='right', width=width)
thinkplot.Hist(other_pmf, align='left', width=width)
thinkplot.Config(xlabel='weeks',
                 ylabel='probability',
                 axis=[27,46,0,0.6])

#second call to preplot resets the color generator
thinkplot.PrePlot(2)
thinkplot.SubPlot(2)
thinkplot.Pmfs([first_pmf, other_pmf])
thinkplot.Config(xlabel='weeks',
                 ylabel='probability',
                 axis=[27,46,0,0.6])
thinkplot.Show()


nsfg.py:42: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan

Good idea to zoom in on the mode, where the biggest differences occur:


In [4]:
weeks = range(35, 46)
diffs = []
for week in weeks:
    p1 = first_pmf.Prob(week)
    p2 = other_pmf.Prob(week)
    #diff between two points in percentage points
    diff = 100 * (p1 - p2)
    diffs.append(diff)
    
thinkplot.Bar(weeks, diffs)
thinkplot.Config(title="Difference in PMFs",
               xlabel="weeks",
               ylabel="percentage points")
thinkplot.Show()


/Users/davidgoldberg/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py:475: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

Class Size Paradox


In [6]:
d = {7:8, 12:8, 17:14, 22:4, 27:6, 
     32:12, 37:8, 42:3, 47:2}
pmf = thinkstats2.Pmf(d, label='actual')
print ('mean', pmf.Mean())


('mean', 23.692307692307693)

For each class size, x, in the following funtion, we multiply the probability by x, the number of students who observe that class size. This gives a biased distribution


In [11]:
def BiasPmf(pmf, label):
    new_pmf = pmf.Copy(label=label)
    
    for x, p in pmf.Items():
        new_pmf.Mult(x, x)
        
    new_pmf.Normalize()
    return new_pmf

thinkplot.PrePlot(2)
biased_pmf = BiasPmf(pmf, label="observed")
thinkplot.Pmfs([pmf, biased_pmf])
thinkplot.Config(root='class_size1',
               xlabel='class size',
               ylabel='PMF',
               axis=[0, 52, 0, 0.27])
# thinkplot.Show()

print "actual mean", pmf.Mean()
print "biased mean", biased_pmf.Mean()


 actual mean 23.6923076923
biased mean 29.1233766234

Conclusion: the students are biased because the amount of students in a large class is large, so students who are taking multiple classes are likely taking at least one of these classes, which offsets their personal average class size from the actual.

Think of it this way: if you had one of each class size in range of class sizes from 1 to 10, the average size of the classes would be 5, but far more people would report being in a larger class than being in a smaller class.

this can be corrected, however...


In [14]:
def UnbiasPmf(pmf, label):
    new_pmf = pmf.Copy(label=label)
    
    for x, p in pmf.Items():
        new_pmf.Mult(x, 1.0 / x)
    new_pmf.Normalize()
    return new_pmf
print 'unbiased mean:', UnbiasPmf(biased_pmf, "unbiased").Mean()


 unbiased mean: 23.6923076923

DataFrame indexing:


In [19]:
import numpy as np
import pandas
array = np.random.randn(4,2)

df = pandas.DataFrame(array)
df


Out[19]:
0 1
0 -0.599691 0.435165
1 -0.694101 1.227066
2 -0.230580 0.579704
3 -0.129657 0.736377

In [21]:
columns = ['A','B']
df = pandas.DataFrame(array, columns=columns)
df


Out[21]:
A B
0 -0.599691 0.435165
1 -0.694101 1.227066
2 -0.230580 0.579704
3 -0.129657 0.736377

In [25]:
index = ['a','b','c','d']
df = pandas.DataFrame(array, columns=columns, index=index)
df


Out[25]:
A B
a -0.599691 0.435165
b -0.694101 1.227066
c -0.230580 0.579704
d -0.129657 0.736377

In [26]:
#to select a row by label, use loc, 
#which returns a series
df.loc['a']


Out[26]:
A   -0.599691
B    0.435165
Name: a, dtype: float64

In [27]:
#iloc finds a row by integer position of the row
df.iloc[0]


Out[27]:
A   -0.599691
B    0.435165
Name: a, dtype: float64

In [28]:
#loc can also take a list of labels
#in this case it returns a df
indices = ['a','c']
df.loc[indices]


Out[28]:
A B
a -0.599691 0.435165
c -0.230580 0.579704

In [32]:
#slicing
#NOTE: first slice method selects inclusively
print df['a':'c']
df[0:2]


          A         B
a -0.599691  0.435165
b -0.694101  1.227066
c -0.230580  0.579704
Out[32]:
A B
a -0.599691 0.435165
b -0.694101 1.227066

Exercise 3.2

PMFs can be used to calculate probability:
$$ \bar{x} = \sum_{i}p_ix_i $$

where $x_i$ are the unique values in the PMF and $p_i = PMF(x_i)$

Variance can also be calulated:

$$ S^2 = \sum_{i}p_i(x_i -\bar{x})^2 $$

Write functions PmfMean and PmfVar that take a Pmf object and compute the mean and variance.


In [37]:
def PmfMean(pmf):
    mean = 0
    for key, prob in pmf.Items():
        mean += key * prob
    return mean

def PmfVar(pmf):
    mean = PmfMean(pmf)
    var = 0
    for key, prob in pmf.Items():
        var += prob * (key - mean) ** 2
    return var

print "my Mean:", PmfMean(pmf)
print "answer mean:", pmf.Mean()
print "my Variance:", PmfVar(pmf)
print "answer variance:", pmf.Var()


 my Mean: 23.6923076923
answer mean: 23.6923076923
my Variance: 128.674556213
answer variance: 128.674556213

Exercise 3.3


In [6]:
df = nsfg.ReadFemPreg()
pregMap = nsfg.MakePregMap(df[df.outcome==1])

In [29]:
lengthDiffs = []
for caseid, pregList in pregMap.iteritems():
    first = df[df.index==pregList[0]].prglngth
    first = int(first)
    for idx in pregList[1:]:
        
        other = df[df.index==idx].prglngth
        other = int(other)
        diff = first - other
        lengthDiffs.append(diff)
diffHist = thinkstats2.Hist(lengthDiffs)
print diffHist


Hist({0: 2543, 1: 327, 2: 353, 3: 156, 4: 150, 5: 66, 6: 34, 7: 22, 8: 17, 9: 36, 10: 16, 11: 6, 12: 3, 13: 8, 14: 3, 15: 4, 16: 1, 17: 3, 19: 2, 20: 1, 22: 1, 25: 1, 26: 1, 31: 1, 35: 1, -1: 247, -39: 1, -16: 1, -15: 4, -14: 1, -13: 14, -12: 8, -11: 9, -10: 13, -9: 55, -8: 22, -7: 25, -6: 30, -5: 37, -4: 181, -3: 94, -2: 237})

In [41]:
diffPmf = thinkstats2.Pmf(lengthDiffs)
thinkplot.PrePlot(2, cols=2)
thinkplot.SubPlot(1)
thinkplot.Hist(diffHist, label='')
thinkplot.Config(title="Differences (weeks) between first baby and other babies \n born to same mother",
                 xlabel = 'first_preg_lngth - other_preg_lngth (weeks)',
                 ylabel = 'freq')

thinkplot.SubPlot(2)
thinkplot.Hist(diffPmf, label='')
thinkplot.Config(title="Differences (weeks) between first baby and other babies \n born to same mother",
                 xlabel = 'first_preg_lngth - other_preg_lngth (weeks)',
                 ylabel = 'freq')
thinkplot.Show()

Exercise 3.4


In [56]:
pwDiff = defaultdict(list)
for caseid, pregList in pregMap.iteritems():
    first = df[df.index==pregList[0]].prglngth
    first = int(first)
    for i,idx in enumerate(pregList[1:]):
        
        other = df[df.index==idx].prglngth
        other = int(other)
        diff = first - other
 
        pwDiff[i + 1].append(diff)
    
pmf_s = []
for i in range(1,6):
    diff_pmf = thinkstats2.Pmf(pwDiff[i + 1], label='diff to kid num %d' % i)
    pmf_s.append(diff_pmf)


[Pmf({0: 0.5283630470016207, 1: 0.06645056726094004, 2: 0.07779578606158832, 3: 0.03322528363047002, 4: 0.031604538087520256, 5: 0.013776337115072933, 6: 0.006482982171799027, 7: 0.00486223662884927, 8: 0.002431118314424635, 9: 0.006482982171799027, 10: 0.006482982171799027, 11: 0.002431118314424635, 13: 0.0016207455429497568, 14: 0.0008103727714748784, 15: 0.0008103727714748784, 16: 0.0008103727714748784, 17: 0.0008103727714748784, 22: 0.0008103727714748784, 26: 0.0008103727714748784, -2: 0.0486223662884927, -15: 0.0008103727714748784, -13: 0.004051863857374392, -11: 0.0016207455429497568, -10: 0.002431118314424635, -9: 0.01458670988654781, -8: 0.004051863857374392, -7: 0.005672609400324149, -6: 0.008914100486223663, -5: 0.007293354943273905, -4: 0.03889789303079416, -3: 0.02106969205834684, -1: 0.055105348460291734}), Pmf({0: 0.5724465558194775, 1: 0.04750593824228029, 2: 0.06888361045130642, 3: 0.052256532066508314, 4: 0.019002375296912115, 5: 0.011876484560570073, 6: 0.004750593824228029, 7: 0.007125890736342043, 8: 0.0023752969121140144, 9: 0.009501187648456057, 10: 0.004750593824228029, 11: 0.004750593824228029, 15: 0.0023752969121140144, 19: 0.0023752969121140144, 20: 0.0023752969121140144, -13: 0.004750593824228029, -1: 0.052256532066508314, -10: 0.004750593824228029, -9: 0.007125890736342043, -8: 0.009501187648456057, -7: 0.009501187648456057, -6: 0.004750593824228029, -5: 0.007125890736342043, -4: 0.040380047505938245, -3: 0.0166270783847981, -2: 0.030878859857482188}), Pmf({0: 0.5793650793650793, 1: 0.015873015873015872, 2: 0.047619047619047616, 3: 0.03968253968253968, 4: 0.031746031746031744, 5: 0.023809523809523808, 6: 0.023809523809523808, 7: 0.007936507936507936, 8: 0.023809523809523808, -12: 0.007936507936507936, -1: 0.06349206349206349, -9: 0.007936507936507936, -7: 0.007936507936507936, -6: 0.007936507936507936, -4: 0.05555555555555555, -3: 0.031746031746031744, -2: 0.023809523809523808}), Pmf({0: 0.7000000000000001, 1: 0.02, 2: 0.08, 7: 0.02, -12: 0.02, -2: 0.04, -9: 0.02, -5: 0.02, -4: 0.02, -3: 0.02, -1: 0.02, 31: 0.02}), Pmf({0: 0.65, 1: 0.05, 2: 0.15000000000000002, 3: 0.05, 7: 0.05, -4: 0.05})]

In [58]:
thinkplot.Pmfs(pmf_s)
thinkplot.Config(axis=[-10,10,0,1])
thinkplot.Show()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/Users/davidgoldberg/anaconda/lib/python2.7/site-packages/matplotlib/backend_bases.pyc in enter_notify_event(self, guiEvent, xy)
   1950         self._lastx, self._lasty = None, None
   1951 
-> 1952     def enter_notify_event(self, guiEvent=None, xy=None):
   1953         """
   1954         Backend derived classes should call this function when entering

KeyboardInterrupt: 

Exercise 3.4

Write a function called ObservedPmf that takes a Pmf representing the actual distribution of runners' speeds and the speed of the running observer and returns a new PMF representing the distribution of runner's speeds as seen by the observer.


In [63]:
import relay

def ObservedPmf(pmf, runnerSpeed, label):
    new_pmf = pmf.Copy(label=label)
    
    for x,p in pmf.Items():
        diff = abs(runnerSpeed - x)
        #if runner speed is very large wrt x, likely to pass that runner
        #else likely to be passed by that runnner
        #not likely to see those in between.
        new_pmf.Mult(x, diff)
    
    new_pmf.Normalize()
    return new_pmf

results = relay.ReadResults() 
speeds = relay.GetSpeeds(results)
speeds = relay.BinData(speeds, 3, 12, 100)
pmf = thinkstats2.Pmf(speeds, 'unbiased speeds')
thinkplot.PrePlot(2)

thinkplot.Pmf(pmf)

biased_pmf = ObservedPmf(pmf, 7.5, 'biased at 7.5 mph')

thinkplot.Pmf(biased_pmf)
thinkplot.Config(title='PMF of running speed',
                   xlabel='speed (mph)',
                   ylabel='probability')
thinkplot.Show()

In [ ]: