In [1]:
from __future__ import print_function, division

%matplotlib inline

import pandas as pd
import thinkstats2
import thinkplot

In [2]:
# Data source (NYC schools class-size data):
# http://schools.nyc.gov/AboutUs/schools/data/classsize/classsize_2014_11_14.htm
# skiprows=6 skips the first six rows of the sheet -- presumably preamble
# above the column-header row (TODO confirm against the workbook).
df = pd.read_excel('CityLevelDistributionSummaryPreliminary2015.xlsx', skiprows=6)
# Display the frame to inspect the columns; note that CLASS SIZE mixes
# integers with open-ended text bins such as '<15'.
df


Out[2]:
GRADE LEVEL PROGRAM TYPE CORE SUBJECT (MS CORE and 09-12 ONLY) SERVICE CATEGORY (0K-08 ONLY) CLASS SIZE NUMBER OF CLASSES NUMBER OF STUDENTS PERCENT OF STUDENTS IN BOROUGH / PROGRAM / GRADE / SUBJECT
0 0K GEN ED / ICT / G&T - - <15 86 1002 0.014110
1 0K GEN ED / ICT / G&T - - 15 29 435 0.006126
2 0K GEN ED / ICT / G&T - - 16 35 560 0.007886
3 0K GEN ED / ICT / G&T - - 17 59 1003 0.014124
4 0K GEN ED / ICT / G&T - - 18 95 1710 0.024080
5 0K GEN ED / ICT / G&T - - 19 98 1862 0.026221
6 0K GEN ED / ICT / G&T - - 20 177 3540 0.049851
7 0K GEN ED / ICT / G&T - - 21 232 4872 0.068608
8 0K GEN ED / ICT / G&T - - 22 298 6556 0.092322
9 0K GEN ED / ICT / G&T - - 23 364 8372 0.117896
10 0K GEN ED / ICT / G&T - - 24 444 10656 0.150059
11 0K GEN ED / ICT / G&T - - 25 865 21625 0.304526
12 0K GEN ED / ICT / G&T - - 26 191 4966 0.069932
13 0K GEN ED / ICT / G&T - - 27 68 1836 0.025855
14 0K GEN ED / ICT / G&T - - 28 42 1176 0.016561
15 0K GEN ED / ICT / G&T - - 29 10 290 0.004084
16 0K GEN ED / ICT / G&T - - 30 12 360 0.005070
17 0K GEN ED / ICT / G&T - - 31 3 93 0.001310
18 0K GEN ED / ICT / G&T - - 32 1 32 0.000451
19 0K GEN ED / ICT / G&T - - 33 2 66 0.000929
20 01 GEN ED / ICT / G&T - - <15 46 582 0.008027
21 01 GEN ED / ICT / G&T - - 15 25 375 0.005172
22 01 GEN ED / ICT / G&T - - 16 37 592 0.008165
23 01 GEN ED / ICT / G&T - - 17 35 595 0.008206
24 01 GEN ED / ICT / G&T - - 18 41 738 0.010178
25 01 GEN ED / ICT / G&T - - 19 75 1425 0.019653
26 01 GEN ED / ICT / G&T - - 20 107 2140 0.029514
27 01 GEN ED / ICT / G&T - - 21 147 3087 0.042575
28 01 GEN ED / ICT / G&T - - 22 198 4356 0.060077
29 01 GEN ED / ICT / G&T - - 23 210 4830 0.066614
... ... ... ... ... ... ... ... ...
474 09-12 SPEC ED Math - 11 55 605 0.087719
475 09-12 SPEC ED Math - 12 52 624 0.090474
476 09-12 SPEC ED Math - 13 67 871 0.126287
477 09-12 SPEC ED Math - 14 80 1120 0.162389
478 09-12 SPEC ED Math - 15 101 1515 0.219661
479 09-12 SPEC ED Math - >15 41 685 0.099319
480 09-12 SPEC ED Science - <6 18 90 0.015169
481 09-12 SPEC ED Science - 6 33 198 0.033373
482 09-12 SPEC ED Science - 7 27 189 0.031856
483 09-12 SPEC ED Science - 8 20 160 0.026968
484 09-12 SPEC ED Science - 9 28 252 0.042474
485 09-12 SPEC ED Science - 10 41 410 0.069105
486 09-12 SPEC ED Science - 11 37 407 0.068599
487 09-12 SPEC ED Science - 12 41 492 0.082926
488 09-12 SPEC ED Science - 13 51 663 0.111748
489 09-12 SPEC ED Science - 14 64 896 0.151020
490 09-12 SPEC ED Science - 15 85 1275 0.214900
491 09-12 SPEC ED Science - >15 53 901 0.151862
492 09-12 SPEC ED Social Studies - <6 32 160 0.022679
493 09-12 SPEC ED Social Studies - 6 39 234 0.033168
494 09-12 SPEC ED Social Studies - 7 50 350 0.049610
495 09-12 SPEC ED Social Studies - 8 35 280 0.039688
496 09-12 SPEC ED Social Studies - 9 60 540 0.076541
497 09-12 SPEC ED Social Studies - 10 42 420 0.059532
498 09-12 SPEC ED Social Studies - 11 55 605 0.085755
499 09-12 SPEC ED Social Studies - 12 59 708 0.100354
500 09-12 SPEC ED Social Studies - 13 66 858 0.121616
501 09-12 SPEC ED Social Studies - 14 60 840 0.119064
502 09-12 SPEC ED Social Studies - 15 93 1395 0.197732
503 09-12 SPEC ED Social Studies - >15 40 665 0.094259

504 rows × 8 columns


In [3]:
# Bucket the rows by grade level so one grade's rows can be pulled out by key.
grouped = df.groupby('GRADE LEVEL')

In [4]:
# List the group keys to see which grade labels exist. The keys are strings
# such as '08' and '09-12'; only the key is used here, `group` is ignored.
for name, group in grouped:
    print(name)


01
02
03
04
05
06
07
08
09-12
0K
0K-09
MS Core

In [5]:
# Select the rows for grade 8; the key is the string '08', not the integer 8.
grade8 = grouped.get_group('08')

In [6]:
# The open-ended bins are text ('<15', '>34'); recode each as the integer just
# outside its bound so the whole column can be cast to int.
# NOTE(review): 14 and 35 are stand-ins -- the real sizes inside those bins are
# not available in this table, so the tails of the distribution are approximate.
size = grade8['CLASS SIZE'].replace(['<15', '>34'], [14, 35]).astype(int)

In [7]:
# Number of classes at each class size; taken from the same rows as `size`.
classes = grade8['NUMBER OF CLASSES']

In [8]:
# Map class size -> number of classes and wrap the mapping in a Pmf.
# NOTE(review): dict(zip(...)) keeps only the last count when a size repeats;
# this presumably relies on each size appearing once within grade 8 -- confirm.
pmf = thinkstats2.Pmf(dict(zip(size, classes)))

In [9]:
# Plot the distribution of grade-8 class sizes (weighted by number of classes).
thinkplot.Pmf(pmf)



In [10]:
# Mean grade-8 class size, with each size weighted by its number of classes.
pmf.Mean()


Out[10]:
27.406021505376348

In [11]:
def BiasPmf(pmf, label):
    """Return a size-biased version of `pmf`.

    The result is built on `pmf.Copy(label=label)`: every value's entry is
    multiplied by the value itself and the copy is then renormalized.

    pmf: object providing Copy(label=...) and Items()
    label: label passed to Copy for the new distribution

    returns: the relabelled, reweighted and normalized copy
    """
    biased_pmf = pmf.Copy(label=label)

    # Only the value matters for the weighting; the probability paired with
    # it in Items() is not read.
    for value, _ in pmf.Items():
        biased_pmf.Mult(value, value)

    biased_pmf.Normalize()
    return biased_pmf

In [12]:
# Build the size-biased grade-8 distribution, then plot the original and the
# biased version in the same cell for comparison.
biased = BiasPmf(pmf, 'biased')
thinkplot.Pmf(pmf)
thinkplot.Pmf(biased)



In [13]:
# Mean of the size-biased distribution, for comparison with pmf.Mean().
biased.Mean()


Out[13]:
28.088749038748254

In [14]:
# Data source (Purdue Data Digest, undergraduate class-size distribution):
# https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
# NOTE(review): each entry in `sizes` presumably stands in for a range of
# class sizes in the source table, and `counts` presumably holds the number
# of classes in that range; how the representative sizes were chosen is not
# recorded here -- confirm against the source.
sizes = [1, 5, 15, 25, 35, 45, 75, 125]
counts = [138, 635, 1788, 1979, 796, 354, 487, 333]
# Shared x-axis limits and output formats for the purdue1-3 figures.
xlim = [-5, 130]
formats=['png', 'pdf']

In [15]:
# Rebinds `pmf` (previously the grade-8 distribution) to the Purdue data.
pmf = thinkstats2.Pmf(dict(zip(sizes, counts)), label='actual')
# NOTE(review): PrePlot(2) although only one series is drawn here --
# presumably so its color matches the two-series figures; confirm.
thinkplot.PrePlot(2)
thinkplot.Hist(pmf)
thinkplot.Config(xlabel='class size', ylabel='PMF', xlim=xlim, loc='upper right')
# Writes purdue1.png and purdue1.pdf.
thinkplot.Save('purdue1', formats=formats)


Writing purdue1.png
Writing purdue1.pdf
<matplotlib.figure.Figure at 0x7f68eda74690>

In [16]:
# Rebinds `biased` (previously the grade-8 version) to the size-biased
# Purdue distribution.
biased = BiasPmf(pmf, 'biased')
thinkplot.PrePlot(2)
# NOTE(review): the opposite `align` values presumably place the two bar
# series side by side at each size instead of overlapping -- confirm.
thinkplot.Hist(pmf, align='right')
thinkplot.Hist(biased, color='orange', align='left')
thinkplot.Config(xlabel='class size', ylabel='PMF', xlim=xlim, loc='upper right')
# Writes purdue2.png and purdue2.pdf.
thinkplot.Save('purdue2', formats=formats)


Writing purdue2.png
Writing purdue2.pdf
<matplotlib.figure.Figure at 0x7f68ed11e350>

In [17]:
# Same comparison as CDFs: the actual distribution vs. the size-biased one.
thinkplot.PrePlot(2)
thinkplot.Cdf(pmf.MakeCdf())
thinkplot.Cdf(biased.MakeCdf(), color='orange')
thinkplot.Config(xlabel='class size', ylabel='CDF', xlim=xlim, loc='lower right')
# Writes purdue3.png and purdue3.pdf.
thinkplot.Save('purdue3', formats=formats)


Writing purdue3.png
Writing purdue3.pdf
<matplotlib.figure.Figure at 0x7f68edb981d0>

In [18]:
# Mean class size for the Purdue data: actual vs. size-biased.
pmf.Mean(), biased.Mean()


Out[18]:
(30.959754224270352, 56.01463671185027)

In [18]:


In [18]: