In [1]:
from __future__ import print_function, division
%matplotlib inline
import pandas as pd
import thinkstats2
import thinkplot
In [2]:
# Data source (NYC DOE class-size report):
# http://schools.nyc.gov/AboutUs/schools/data/classsize/classsize_2014_11_14.htm
# skiprows=6 drops the first six spreadsheet rows before the header row
# (presumably title/preamble text -- verify against the workbook).
df = pd.read_excel('CityLevelDistributionSummaryPreliminary2015.xlsx', skiprows=6)
# Preview only the first rows rather than dumping the entire frame.
df.head()
Out[2]:
In [3]:
# Split the summary table into one sub-frame per value of 'GRADE LEVEL'.
grouped = df.groupby(by='GRADE LEVEL')
In [4]:
# List the grade-level keys available in the groupby so the right one
# can be picked in the next cell. Only the key is printed; the sub-frame
# itself is not inspected here.
for name, group in grouped:
    print(name)
In [5]:
# Rows whose 'GRADE LEVEL' key is the string '08'.
grade8 = grouped.get_group(name='08')
In [6]:
# The open-ended buckets '<15' and '>34' are replaced by the numeric
# stand-ins 14 and 35 so the whole column can be converted to int.
size = (
    grade8['CLASS SIZE']
    .replace({'<15': 14, '>34': 35})
    .astype(int)
)
In [7]:
# Number of classes reported for each class-size row of the '08' group.
classes = grade8.loc[:, 'NUMBER OF CLASSES']
In [8]:
# Map each class size to its number of classes and wrap the mapping in a Pmf.
pmf = thinkstats2.Pmf({s: n for s, n in zip(size, classes)})
In [9]:
# Plot the class-size PMF built from the '08' grade-level rows.
thinkplot.Pmf(pmf)
In [10]:
# Mean class size under the per-class distribution (size -> number of classes).
pmf.Mean()
Out[10]:
In [11]:
def BiasPmf(pmf, label):
    """Return a size-biased copy of a PMF.

    The probability of every value x is multiplied by x itself and the
    result is renormalized, so larger values gain weight in proportion
    to their size.

    pmf: Pmf-like object providing Copy(label=...) and Items(); the
         scaling is applied to the copy, not to this object
    label: label given to the new Pmf

    returns: the new, normalized Pmf
    """
    new_pmf = pmf.Copy(label=label)

    # Only the values matter here: each one is its own scaling factor.
    for x, _ in pmf.Items():
        new_pmf.Mult(x, x)

    new_pmf.Normalize()
    return new_pmf
In [12]:
# Size-biased version of the '08' class-size PMF.
biased = BiasPmf(pmf, 'biased')

# Overlay the two distributions. PrePlot(2), the axis labels and the
# explicit 'actual' label follow the two-series figures later in this
# notebook, so the curves can be told apart and the figure stands alone.
thinkplot.PrePlot(2)
thinkplot.Pmf(pmf, label='actual')
thinkplot.Pmf(biased)
thinkplot.Config(xlabel='class size', ylabel='PMF')
In [13]:
# Mean class size under the size-biased distribution returned by BiasPmf.
biased.Mean()
Out[13]:
In [14]:
# Purdue undergraduate class-size distribution, transcribed from
# https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
# sizes[i] pairs with counts[i]; the two lists are zipped into a Pmf below.
# NOTE(review): the sizes look like representative values for size ranges
# in the source table -- confirm against the page.
sizes = [
    1, 5, 15, 25,
    35, 45, 75, 125,
]
counts = [
    138, 635, 1788, 1979,
    796, 354, 487, 333,
]

# Settings shared by the Purdue figures: x-axis limits and the list of
# formats handed to thinkplot.Save.
xlim = [-5, 130]
formats = ['png', 'pdf']
In [15]:
# Build the Purdue class-size PMF from the (size, count) pairs, labelled
# 'actual'. This rebinds `pmf`, which previously held the NYC '08' PMF.
pmf = thinkstats2.Pmf(dict(zip(sizes, counts)), label='actual')
# NOTE(review): PrePlot(2) is requested although only one series is drawn
# in this cell -- presumably to keep colors in step with the two-series
# figures that follow; confirm.
thinkplot.PrePlot(2)
thinkplot.Hist(pmf)
thinkplot.Config(xlabel='class size', ylabel='PMF', xlim=xlim, loc='upper right')
# Presumably writes one 'purdue1' file per entry in `formats` -- verify
# against thinkplot.Save.
thinkplot.Save('purdue1', formats=formats)
In [16]:
# Size-biased version of the Purdue PMF. This rebinds `biased`, which
# previously held the NYC result.
biased = BiasPmf(pmf, 'biased')
thinkplot.PrePlot(2)
# The two histograms use opposite `align` values, presumably so the bars
# for the same class size sit side by side instead of overlapping -- confirm.
thinkplot.Hist(pmf, align='right')
thinkplot.Hist(biased, color='orange', align='left')
thinkplot.Config(xlabel='class size', ylabel='PMF', xlim=xlim, loc='upper right')
# Presumably writes one 'purdue2' file per entry in `formats`.
thinkplot.Save('purdue2', formats=formats)
In [17]:
# Same comparison as the previous figure, drawn as CDFs: each PMF is
# converted with MakeCdf() before plotting.
thinkplot.PrePlot(2)
thinkplot.Cdf(pmf.MakeCdf())
thinkplot.Cdf(biased.MakeCdf(), color='orange')
# The legend is placed at 'lower right' here; the PMF figures use 'upper right'.
thinkplot.Config(xlabel='class size', ylabel='CDF', xlim=xlim, loc='lower right')
# Presumably writes one 'purdue3' file per entry in `formats`.
thinkplot.Save('purdue3', formats=formats)
In [18]:
# Displayed as a tuple: (mean of the 'actual' Purdue PMF, mean of its
# size-biased counterpart).
pmf.Mean(), biased.Mean()
Out[18]:
In [18]:
In [18]: