In [16]:
import math
import numpy as np
import nsfg
import first
import thinkstats2
import thinkplot
from collections import defaultdict
probability mass function (PMF) - maps each value to its probability. Allows you to compare two distributions independently of sample size.
probability - a frequency expressed as a fraction of the sample size, n.
normalization - dividing the frequencies by n, so that the probabilities add up to 1.
Given a Hist, we can make a dictionary that maps each value to its probability:
n = hist.Total()
d = {}
for x, freq in hist.Items():
    d[x] = freq / float(n)  #float() avoids integer division in Python 2
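A minimal runnable version of that snippet, using a small made-up sample (Hist comes from thinkstats2):
In [ ]:
hist = thinkstats2.Hist([1, 2, 2, 3, 5])
n = hist.Total()
d = {}
for x, freq in hist.Items():
    d[x] = freq / float(n)
print d    #each value maps to its probability, e.g. {1: 0.2, 2: 0.4, 3: 0.2, 5: 0.2}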
In [2]:
import thinkstats2
pmf = thinkstats2.Pmf([1,2,2,3,5])
#getting pmf values
print pmf.Items()
print pmf.Values()
print pmf.Prob(2)
print pmf[2]
#modifying pmf values
pmf.Incr(2, 0.2)
print pmf.Prob(2)
pmf.Mult(2, 0.5)
print pmf.Prob(2)
#if you modify, probabilities may no longer add up to 1
#to check:
print pmf.Total()
print pmf.Normalize()
print pmf.Total()
#Copy method is also available
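#for example (a quick sketch): Copy() returns an independent Pmf you can modify safely
pmf_copy = pmf.Copy(label='copy')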
To plot a PMF:
thinkplot.Hist plots it as a bar graph, best for a small number of discrete values.
thinkplot.Pmf plots it as a step function, for use when there is a large number of values and the PMF is smooth.
In [3]:
from probability import *
live, firsts, others = first.MakeFrames()
first_pmf = thinkstats2.Pmf(firsts.prglngth, label="firsts")
other_pmf = thinkstats2.Pmf(others.prglngth, label="others")
width = 0.45
#cols option makes grid of figures.
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(first_pmf, align='right', width=width)
thinkplot.Hist(other_pmf, align='left', width=width)
thinkplot.Config(xlabel='weeks',
                 ylabel='probability',
                 axis=[27, 46, 0, 0.6])
#second call to preplot resets the color generator
thinkplot.PrePlot(2)
thinkplot.SubPlot(2)
thinkplot.Pmfs([first_pmf, other_pmf])
thinkplot.Config(xlabel='weeks',
                 ylabel='probability',
                 axis=[27, 46, 0, 0.6])
thinkplot.Show()
Good idea to zoom in on the mode, where the biggest differences occur:
In [4]:
weeks = range(35, 46)
diffs = []
for week in weeks:
    p1 = first_pmf.Prob(week)
    p2 = other_pmf.Prob(week)
    #diff between the two PMFs, in percentage points
    diff = 100 * (p1 - p2)
    diffs.append(diff)
thinkplot.Bar(weeks, diffs)
thinkplot.Config(title="Difference in PMFs",
                 xlabel="weeks",
                 ylabel="percentage points")
thinkplot.Show()
In [6]:
d = {7: 8, 12: 8, 17: 14, 22: 4, 27: 6,
     32: 12, 37: 8, 42: 3, 47: 2}
pmf = thinkstats2.Pmf(d, label='actual')
print ('mean', pmf.Mean())
For each class size, x, the following function multiplies the probability by x, the number of students who observe that class size. This gives the biased distribution seen by the students.
In [11]:
def BiasPmf(pmf, label):
    new_pmf = pmf.Copy(label=label)
    for x, p in pmf.Items():
        new_pmf.Mult(x, x)
    new_pmf.Normalize()
    return new_pmf
thinkplot.PrePlot(2)
biased_pmf = BiasPmf(pmf, label="observed")
thinkplot.Pmfs([pmf, biased_pmf])
thinkplot.Config(root='class_size1',
                 xlabel='class size',
                 ylabel='PMF',
                 axis=[0, 52, 0, 0.27])
# thinkplot.Show()
print "actual mean", pmf.Mean()
print "biased mean", biased_pmf.Mean()
Conclusion: the students' view is biased because large classes contain more students, so a randomly chosen student is more likely to be in a large class; the average class size students report is therefore larger than the actual average.
Think of it this way: if there were one class of each size from 1 to 10, the average class size would be 5.5, but far more students would report being in a large class than in a small one.
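A quick sanity check of that 1-to-10 example (a sketch, not part of the original notebook), reusing BiasPmf from above:
In [ ]:
uniform = thinkstats2.Pmf(range(1, 11), label='actual')
print 'actual mean', uniform.Mean()                         #5.5
print 'observed mean', BiasPmf(uniform, 'observed').Mean()  #7.0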
this can be corrected, however...
In [14]:
def UnbiasPmf(pmf, label):
    new_pmf = pmf.Copy(label=label)
    for x, p in pmf.Items():
        new_pmf.Mult(x, 1.0 / x)
    new_pmf.Normalize()
    return new_pmf
print 'unbiased mean:', UnbiasPmf(biased_pmf, "unbiased").Mean()
In [19]:
import numpy as np
import pandas
array = np.random.randn(4,2)
df = pandas.DataFrame(array)
df
Out[19]:
In [21]:
columns = ['A','B']
df = pandas.DataFrame(array, columns=columns)
df
Out[21]:
In [25]:
index = ['a','b','c','d']
df = pandas.DataFrame(array, columns=columns, index=index)
df
Out[25]:
In [26]:
#to select a row by label, use loc,
#which returns a series
df.loc['a']
Out[26]:
In [27]:
#iloc finds a row by integer position of the row
df.iloc[0]
Out[27]:
In [28]:
#loc can also take a list of labels
#in this case it returns a df
indices = ['a','c']
df.loc[indices]
Out[28]:
In [32]:
#slicing
#NOTE: slicing by label selects the end point inclusively;
#slicing by integer position excludes it, as usual
print df['a':'c']
df[0:2]
Out[32]:
PMFs can be used to calculate the mean:
$$
\bar{x} = \sum_{i}p_ix_i
$$
where $x_i$ are the unique values in the PMF and $p_i = PMF(x_i)$
Variance can also be calculated:
$$
S^2 = \sum_{i}p_i(x_i - \bar{x})^2
$$
Write functions PmfMean and PmfVar that take a Pmf object and compute the mean and variance.
In [37]:
def PmfMean(pmf):
    mean = 0
    for key, prob in pmf.Items():
        mean += key * prob
    return mean

def PmfVar(pmf):
    mean = PmfMean(pmf)
    var = 0
    for key, prob in pmf.Items():
        var += prob * (key - mean) ** 2
    return var
print "my Mean:", PmfMean(pmf)
print "answer mean:", pmf.Mean()
print "my Variance:", PmfVar(pmf)
print "answer variance:", pmf.Var()
In [6]:
df = nsfg.ReadFemPreg()
pregMap = nsfg.MakePregMap(df[df.outcome==1])
In [29]:
lengthDiffs = []
for caseid, pregList in pregMap.iteritems():
    first = df[df.index == pregList[0]].prglngth
    first = int(first)
    for idx in pregList[1:]:
        other = df[df.index == idx].prglngth
        other = int(other)
        diff = first - other
        lengthDiffs.append(diff)
diffHist = thinkstats2.Hist(lengthDiffs)
print diffHist
In [41]:
diffPmf = thinkstats2.Pmf(lengthDiffs)
thinkplot.PrePlot(2, cols=2)
thinkplot.SubPlot(1)
thinkplot.Hist(diffHist, label='')
thinkplot.Config(title="Differences (weeks) between first baby and other babies \n born to same mother",
xlabel = 'first_preg_lngth - other_preg_lngth (weeks)',
ylabel = 'freq')
thinkplot.SubPlot(2)
thinkplot.Hist(diffPmf, label='')
thinkplot.Config(title="Differences (weeks) between first baby and other babies \n born to same mother",
xlabel = 'first_preg_lngth - other_preg_lngth (weeks)',
ylabel = 'freq')
thinkplot.Show()
In [56]:
pwDiff = defaultdict(list)
for caseid, pregList in pregMap.iteritems():
    first = df[df.index == pregList[0]].prglngth
    first = int(first)
    for i, idx in enumerate(pregList[1:]):
        other = df[df.index == idx].prglngth
        other = int(other)
        diff = first - other
        pwDiff[i + 1].append(diff)
pmf_s = []
for i in range(1, 6):
    #pwDiff[i] holds diffs between the first baby and the (i+1)-th baby
    diff_pmf = thinkstats2.Pmf(pwDiff[i], label='diff to kid num %d' % (i + 1))
    pmf_s.append(diff_pmf)
In [58]:
thinkplot.Pmfs(pmf_s)
thinkplot.Config(axis=[-10,10,0,1])
thinkplot.Show()
In [63]:
import relay
def ObservedPmf(pmf, runnerSpeed, label):
    new_pmf = pmf.Copy(label=label)
    for x, p in pmf.Items():
        diff = abs(runnerSpeed - x)
        #the chance of observing a runner is proportional to the speed difference:
        #much slower runners get passed, much faster runners pass you,
        #and runners near your own speed are rarely seen
        new_pmf.Mult(x, diff)
    new_pmf.Normalize()
    return new_pmf
results = relay.ReadResults()
speeds = relay.GetSpeeds(results)
speeds = relay.BinData(speeds, 3, 12, 100)
pmf = thinkstats2.Pmf(speeds, 'unbiased speeds')
thinkplot.PrePlot(2)
thinkplot.Pmf(pmf)
biased_pmf = ObservedPmf(pmf, 7.5, 'biased at 7.5 mph')
thinkplot.Pmf(biased_pmf)
thinkplot.Config(title='PMF of running speed',
                 xlabel='speed (mph)',
                 ylabel='probability')
thinkplot.Show()