In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LinearRegression
%matplotlib inline
Let's load the data and give it a quick look.
In [3]:
df = pd.read_csv('data/apib12tx.csv')
In [4]:
df.describe()
Out[4]:
API12B
PCT_AA
PCT_AI
PCT_AS
PCT_FI
PCT_HI
PCT_PI
PCT_WH
PCT_MR
MEALS
...
ACS_K3
ACS_46
ACS_CORE
PCT_RESP
NOT_HSG
HSG
SOME_COL
COL_GRAD
GRAD_SCH
AVG_ED
count
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
...
6634.000000
7511.000000
4489.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10187.000000
10118.000000
mean
787.467262
6.504172
1.109257
7.154216
2.149602
49.908216
0.504957
29.520467
2.085108
59.667517
...
22.059090
25.723605
23.432168
85.550702
20.058506
24.698537
24.264651
18.515952
11.794935
2.771169
std
104.176630
10.539728
4.082059
12.321782
4.028955
29.159250
1.031627
26.159148
2.939391
29.600844
...
5.839773
6.902056
7.683286
18.622835
17.679325
12.319831
11.020448
12.452289
13.874110
0.754851
min
317.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
...
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1.000000
25%
739.000000
1.000000
0.000000
1.000000
0.000000
24.000000
0.000000
5.000000
0.000000
36.000000
...
20.000000
24.000000
20.000000
82.000000
5.000000
16.000000
16.000000
8.000000
3.000000
2.180000
50%
797.000000
3.000000
0.000000
3.000000
1.000000
48.000000
0.000000
22.000000
1.000000
64.000000
...
23.000000
28.000000
26.000000
92.000000
16.000000
26.000000
24.000000
16.000000
6.000000
2.670000
75%
857.000000
8.000000
1.000000
8.000000
3.000000
76.000000
1.000000
51.000000
3.000000
85.000000
...
26.000000
30.000000
29.000000
97.000000
32.000000
33.000000
32.000000
28.000000
16.000000
3.290000
max
998.000000
95.000000
92.000000
96.000000
65.000000
100.000000
18.000000
100.000000
23.000000
100.000000
...
43.000000
46.000000
45.000000
100.000000
100.000000
100.000000
100.000000
100.000000
89.000000
4.880000
8 rows × 25 columns
Let's start looking at how variables in our dataset relate to each other so we know what to expect when we start modeling.
In [5]:
df.corr()
Out[5]:
API12B
PCT_AA
PCT_AI
PCT_AS
PCT_FI
PCT_HI
PCT_PI
PCT_WH
PCT_MR
MEALS
...
ACS_K3
ACS_46
ACS_CORE
PCT_RESP
NOT_HSG
HSG
SOME_COL
COL_GRAD
GRAD_SCH
AVG_ED
API12B
1.000000
-0.223117
-0.126489
0.362361
0.153552
-0.418466
-0.060199
0.363066
0.210260
-0.528166
...
0.182052
0.254228
0.437987
0.223457
-0.535325
-0.519386
0.011033
0.591692
0.582731
0.652429
PCT_AA
-0.223117
1.000000
-0.067107
-0.049370
0.057254
-0.092423
0.219737
-0.282255
-0.003058
0.182797
...
-0.000674
-0.070089
-0.036594
-0.178875
0.003424
0.149530
0.134386
-0.097386
-0.151995
-0.098450
PCT_AI
-0.126489
-0.067107
1.000000
-0.088917
-0.080058
-0.187081
-0.029631
0.128484
0.054740
0.040794
...
-0.159973
-0.184140
-0.218363
-0.031626
-0.061002
0.113335
0.145105
-0.075846
-0.075958
-0.030800
PCT_AS
0.362361
-0.049370
-0.088917
1.000000
0.232352
-0.363255
0.071994
-0.077984
0.105744
-0.312039
...
0.126658
0.178142
0.251559
-0.004432
-0.271720
-0.252820
-0.213072
0.326789
0.389559
0.370071
PCT_FI
0.153552
0.057254
-0.080058
0.232352
1.000000
-0.142512
0.282081
-0.129342
0.095510
-0.141283
...
0.089452
0.120321
0.214681
0.068215
-0.204777
-0.083519
0.090039
0.263867
0.031945
0.165709
PCT_HI
-0.418466
-0.092423
-0.187081
-0.363255
-0.142512
1.000000
-0.077148
-0.784308
-0.458183
0.746475
...
0.115690
0.069356
0.048252
-0.084089
0.798176
0.520316
-0.268786
-0.685352
-0.602600
-0.797608
PCT_PI
-0.060199
0.219737
-0.029631
0.071994
0.282081
-0.077148
1.000000
-0.116163
0.036753
0.028549
...
0.075881
0.049283
0.019570
-0.029653
-0.052520
0.083561
0.092990
0.009552
-0.081522
-0.017244
PCT_WH
0.363066
-0.282255
0.128484
-0.077984
-0.129342
-0.784308
-0.116163
1.000000
0.329157
-0.695871
...
-0.176037
-0.131902
-0.126703
0.175599
-0.664536
-0.494585
0.286760
0.580454
0.522910
0.683077
PCT_MR
0.210260
-0.003058
0.054740
0.105744
0.095510
-0.458183
0.036753
0.329157
1.000000
-0.381300
...
0.007579
0.056607
-0.053738
0.132515
-0.408829
-0.295447
0.113340
0.378918
0.348351
0.432389
MEALS
-0.528166
0.182797
0.040794
-0.312039
-0.141283
0.746475
0.028549
-0.695871
-0.381300
1.000000
...
0.017383
-0.035894
-0.095275
-0.164411
0.734150
0.710693
-0.079904
-0.785937
-0.754539
-0.872949
P_GATE
0.300525
0.011117
-0.075187
0.233122
0.128334
-0.154795
0.004900
0.048766
0.067234
-0.201800
...
0.083439
0.143832
0.371522
0.001516
-0.181664
-0.192168
-0.047112
0.224105
0.234954
0.241601
P_MIGED
-0.153190
-0.150501
-0.020439
-0.134536
-0.106317
0.347309
-0.093803
-0.214935
-0.177547
0.263676
...
0.036802
-0.013519
-0.028966
0.056428
0.376072
0.124583
-0.157987
-0.273858
-0.201256
-0.316049
P_EL
-0.261477
-0.079102
-0.158655
-0.062005
-0.038954
0.724659
0.024503
-0.665903
-0.341988
0.636838
...
0.138346
0.109007
0.004258
-0.095159
0.707431
0.413443
-0.398401
-0.536341
-0.447442
-0.654460
P_RFEP
-0.200273
-0.055021
-0.161894
0.122811
0.055542
0.525937
-0.007727
-0.557736
-0.303721
0.397674
...
0.108101
0.111874
0.256516
-0.124148
0.532067
0.245830
-0.336218
-0.356630
-0.292245
-0.457698
P_DI
-0.153766
0.070958
0.048868
-0.066227
0.004943
-0.028171
0.029540
0.023305
0.006032
0.049876
...
-0.301005
-0.287175
-0.207121
-0.165832
-0.020500
0.063822
0.024405
-0.026593
-0.062990
-0.031134
ACS_K3
0.182052
-0.000674
-0.159973
0.126658
0.089452
0.115690
0.075881
-0.176037
0.007579
0.017383
...
1.000000
0.793651
0.699465
0.138815
0.024870
-0.014883
-0.033463
0.005454
0.005000
-0.005821
ACS_46
0.254228
-0.070089
-0.184140
0.178142
0.120321
0.069356
0.049283
-0.131902
0.056607
-0.035894
...
0.793651
1.000000
0.797873
0.159163
-0.031117
-0.045236
-0.051904
0.073514
0.058967
0.056500
ACS_CORE
0.437987
-0.036594
-0.218363
0.251559
0.214681
0.048252
0.019570
-0.126703
-0.053738
-0.095275
...
0.699465
0.797873
1.000000
0.123847
-0.083922
-0.166419
-0.071366
0.202447
0.164005
0.160656
PCT_RESP
0.223457
-0.178875
-0.031626
-0.004432
0.068215
-0.084089
-0.029653
0.175599
0.132515
-0.164411
...
0.138815
0.159163
0.123847
1.000000
-0.119829
-0.130072
0.173556
0.170700
0.200911
0.204639
NOT_HSG
-0.535325
0.003424
-0.061002
-0.271720
-0.204777
0.798176
-0.052520
-0.664536
-0.408829
0.734150
...
0.024870
-0.031117
-0.083922
-0.119829
1.000000
0.430125
-0.441862
-0.746153
-0.580582
-0.879174
HSG
-0.519386
0.149530
0.113335
-0.252820
-0.083519
0.520316
0.083561
-0.494585
-0.295447
0.710693
...
-0.014883
-0.045236
-0.166419
-0.130072
0.430125
1.000000
0.012790
-0.691139
-0.727006
-0.754308
SOME_COL
0.011033
0.134386
0.145105
-0.213072
0.090039
-0.268786
0.092990
0.286760
0.113340
-0.079904
...
-0.033463
-0.051904
-0.071366
0.173556
-0.441862
0.012790
1.000000
0.092047
-0.216354
0.147892
COL_GRAD
0.591692
-0.097386
-0.075846
0.326789
0.263867
-0.685352
0.009552
0.580454
0.378918
-0.785937
...
0.005454
0.073514
0.202447
0.170700
-0.746153
-0.691139
0.092047
1.000000
0.667201
0.884819
GRAD_SCH
0.582731
-0.151995
-0.075958
0.389559
0.031945
-0.602600
-0.081522
0.522910
0.348351
-0.754539
...
0.005000
0.058967
0.164005
0.200911
-0.580582
-0.727006
-0.216354
0.667201
1.000000
0.874638
AVG_ED
0.652429
-0.098450
-0.030800
0.370071
0.165709
-0.797608
-0.017244
0.683077
0.432389
-0.872949
...
-0.005821
0.056500
0.160656
0.204639
-0.879174
-0.754308
0.147892
0.884819
0.874638
1.000000
25 rows × 25 columns
The percentage of students enrolled in free/reduced-price lunch programs is often used as a proxy for poverty.
In [6]:
df.plot(kind="scatter", x="MEALS", y="API12B")
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x113e73a90>
Conversely, the education level of a student's parents is often a good predictor of how well a student will do in school.
In [7]:
df.plot(kind="scatter", x="AVG_ED", y="API12B")
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1144dded0>
Like we did last week, we'll use scikit-learn to run basic single-variable regressions. Let's start by looking at California's Academic Performance index as it relates to the percentage of students, per school, enrolled in free/reduced-price lunch programs.
In [8]:
data = np.asarray(df[['API12B','MEALS']])
data = Imputer().fit_transform(data)
x, y = data[:, 1:], data[:, 0]
In [9]:
lr = LinearRegression()
lr.fit(x, y)
Out[9]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [10]:
# plot the linear regression line on the scatter plot
lr.coef_
Out[10]:
array([-1.85881873])
In [11]:
lr.score(x, y)
Out[11]:
0.27895982917584949
In [12]:
plt.scatter(x, y, color='blue')
plt.plot(x, lr.predict(x), color='red', linewidth=1)
Out[12]:
[<matplotlib.lines.Line2D at 0x114bbf310>]
In our naive universe where we're only paying attention to two variables -- academic performance and free/reduced lunch -- we can clearly see that some percentage of schools is overperforming the performance that would be expected of them, taking poverty out of the equation.
A handful, in particular, seem to be dramatically overperforming. Let's look at them:
In [32]:
df[(df['MEALS'] >= 80) & (df['API12B'] >= 90)]
Out[32]:
CHARTER
SNAME
DNAME
CNAME
API12B
ST_RANK
PCT_AA
PCT_AI
PCT_AS
PCT_FI
...
ACS_K3
ACS_46
ACS_CORE
PCT_RESP
NOT_HSG
HSG
SOME_COL
COL_GRAD
GRAD_SCH
AVG_ED
15
NaN
Chin (John Yehall) Elementary
San Francisco Unified
San Francisco
989
10
1
0
88
1
...
21
27
NaN
96
17
47
11
22
3
2.47
51
D
American Indian Public Charter School II
Oakland Unified
Alameda
978
10
5
1
84
0
...
NaN
2
NaN
14
40
14
26
21
0
2.28
161
D
KIPP Raices Academy
Los Angeles Unified
Los Angeles
961
10
2
0
0
0
...
NaN
NaN
NaN
93
14
36
35
8
7
2.59
207
NaN
Lincoln Elementary
Oakland Unified
Alameda
957
10
3
0
87
2
...
24
30
NaN
46
19
37
18
17
8
2.59
221
D
Oakland Charter High
Oakland Unified
Alameda
956
10
5
0
29
0
...
NaN
NaN
5
70
44
35
15
6
0
1.83
445
NaN
Richardson Prep Hi
San Bernardino City Unified
San Bernardino
939
10
11
1
2
1
...
NaN
32
29
76
26
28
22
13
11
2.54
510
D
Synergy Charter Academy
Los Angeles Unified
Los Angeles
935
10
4
0
0
0
...
NaN
NaN
NaN
77
40
28
24
7
1
2.03
542
Y
Sixth Street Prep
Victor Elementary
San Bernardino
933
10
8
1
1
0
...
22
34
NaN
92
36
23
20
9
13
2.38
683
D
KIPP Los Angeles College Preparatory
Los Angeles Unified
Los Angeles
925
10
1
0
2
0
...
NaN
NaN
NaN
99
28
37
27
4
4
2.20
702
D
Rocketship Mateo Sheedy Elementary
Santa Clara County Office of E
Santa Clara
924
10
1
0
1
0
...
24
25
NaN
91
45
24
14
11
7
2.10
739
NaN
Solano Avenue Elementary
Los Angeles Unified
Los Angeles
922
10
4
1
50
2
...
19
17
NaN
89
19
26
28
20
7
2.70
788
NaN
Laurel Street Elementary
Compton Unified
Los Angeles
920
9
20
0
0
0
...
28
31
NaN
90
46
31
16
5
1
1.84
865
NaN
Middle College High
San Bernardino City Unified
San Bernardino
916
10
9
1
1
2
...
NaN
NaN
22
85
22
29
30
14
5
2.51
939
NaN
Design Science Early College High
Fresno Unified
Fresno
913
10
7
0
18
0
...
NaN
NaN
29
95
18
31
35
8
8
2.56
964
NaN
Frank Lanterman
Los Angeles Unified
Los Angeles
912
C
12
0
2
4
...
NaN
NaN
11
49
31
31
18
16
4
2.31
1054
D
Futuro College Preparatory Elementary
Los Angeles Unified
Los Angeles
908
9
0
0
0
0
...
32
NaN
NaN
98
28
29
31
8
5
2.32
1071
D
KIPP Heartwood Academy
Alum Rock Union Elementary
Santa Clara
907
9
2
0
15
2
...
NaN
34
31
91
39
26
19
12
4
2.16
1354
NaN
Dr. J. Michael McGrath Elementary
Newhall
Los Angeles
896
9
6
0
2
2
...
26
30
NaN
91
33
31
25
8
4
2.19
1376
D
Celerity Octavia Charter
Los Angeles Unified
Los Angeles
895
9
0
0
4
2
...
21
22
NaN
96
37
35
12
10
5
2.11
1407
D
Milagro Charter
Los Angeles Unified
Los Angeles
894
9
2
0
1
1
...
24
24
NaN
95
20
30
35
10
5
2.51
1408
NaN
Lemay Street Elementary
Los Angeles Unified
Los Angeles
894
9
7
0
2
2
...
16
17
NaN
92
30
36
19
13
3
2.23
1435
NaN
Millard McCollam Elementary
Alum Rock Union Elementary
Santa Clara
893
9
2
0
35
8
...
19
24
NaN
95
15
30
22
30
2
2.74
1439
D
Preuss School UCSD
San Diego Unified
San Diego
893
10
9
0
16
1
...
NaN
1
6
97
37
40
19
3
1
1.90
1473
D
Gabriella Charter
Los Angeles Unified
Los Angeles
892
9
4
0
4
3
...
22
25
25
95
35
26
24
11
4
2.22
1497
D
Celerity Nascent Charter
Los Angeles Unified
Los Angeles
891
9
57
0
0
0
...
23
29
19
96
31
52
13
4
1
1.92
1517
NaN
Montebello Gardens Elementary
Montebello Unified
Los Angeles
890
9
0
0
0
0
...
30
30
NaN
79
17
42
19
15
7
2.52
1586
D
Celerity Dyad Charter
Los Angeles Unified
Los Angeles
888
8
6
0
0
0
...
23
29
28
96
71
23
5
1
0
1.37
1618
D
SIATech
Vista Unified
San Diego
887
B
19
0
1
0
...
NaN
NaN
NaN
72
43
24
19
11
3
2.05
1646
NaN
Santa Fe Elementary
Baldwin Park Unified
Los Angeles
886
8
1
0
7
5
...
20
30
27
74
27
40
25
6
2
2.17
1671
D
Valor Academy Charter
Los Angeles Unified
Los Angeles
885
9
4
0
0
2
...
NaN
NaN
NaN
0
0
0
0
0
0
NaN
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10117
NaN
Central High
Los Angeles Unified
Los Angeles
427
B
11
2
0
1
...
NaN
NaN
16
61
48
30
12
6
3
1.85
10119
Y
Moreno Valley Community Learning Center
Moreno Valley Unified
Riverside
426
B
26
0
1
2
...
NaN
9
19
86
25
34
32
5
4
2.28
10121
NaN
Orange Grove High
Corona-Norco Unified
Riverside
425
B
3
1
1
0
...
NaN
NaN
12
89
40
27
28
3
2
2.00
10131
NaN
San Bernardino County Juvenile Detention
San Bernardino County Office o
San Bernardino
419
B
26
0
1
0
...
NaN
NaN
NaN
86
34
42
15
6
3
2.02
10133
NaN
Steps Community Day
Jurupa Unified
Riverside
418
B
2
0
0
0
...
NaN
NaN
8
88
45
32
16
8
0
1.87
10135
NaN
Alameda County Juvenile Hall/Court
Alameda County Office of Educa
Alameda
415
B
49
0
2
0
...
NaN
NaN
10
64
25
57
11
6
2
2.03
10136
NaN
Boyle Heights Continuation
Los Angeles Unified
Los Angeles
415
B
0
0
0
0
...
NaN
NaN
20
40
44
44
0
8
4
1.84
10141
NaN
Ramon Alternative Center
Palm Springs Unified
Riverside
414
B
9
0
0
1
...
NaN
NaN
9
90
28
45
21
5
1
2.06
10146
NaN
Community Day/Alternative Education/Spec
Anaheim Union High
Orange
405
B
4
0
2
5
...
NaN
NaN
18
23
37
26
32
5
0
2.05
10149
NaN
Kirby, Dorothy Camp
Los Angeles County Office of E
Los Angeles
403
1
22
0
0
0
...
NaN
NaN
NaN
57
16
39
13
26
6
2.68
10151
NaN
Mendocino County Community
Mendocino County Office of Edu
Mendocino
403
B
2
13
1
1
...
NaN
NaN
12
77
31
28
36
5
0
2.15
10155
NaN
John J. Cairns Continuation
Lindsay Unified
Tulare
400
B
0
0
0
0
...
NaN
NaN
17
93
63
23
14
0
0
1.52
10156
NaN
San Pasqual Vocational Academy
San Pasqual Valley Unified
Imperial
396
1
0
85
0
0
...
NaN
NaN
16
46
0
83
17
0
0
2.17
10157
NaN
Enterprise Secondary
Madera County Office of Educat
Madera
396
B
6
0
0
0
...
NaN
NaN
NaN
94
58
27
10
2
3
1.65
10158
NaN
Jack London Community Day
Los Angeles Unified
Los Angeles
396
1
5
0
1
0
...
NaN
NaN
14
64
69
21
10
0
0
1.40
10159
NaN
Valley Los Banos Community Day
Merced County Office of Educat
Merced
395
B
2
1
1
0
...
NaN
NaN
20
96
33
37
23
4
3
2.08
10160
NaN
Blue Heron
Konocti Unified
Lake
395
B
2
7
0
0
...
NaN
NaN
26
96
33
53
14
0
0
1.81
10161
NaN
NaN
Mendocino County Office of Edu
Mendocino
394
B
2
15
1
1
...
7
NaN
11
79
26
32
34
9
0
2.25
10162
NaN
North Region Community School of Greater
San Diego County Office of Edu
San Diego
392
1
5
0
0
0
...
NaN
NaN
NaN
100
49
29
11
8
2
1.85
10163
NaN
Tri-C Community Day
Los Angeles Unified
Los Angeles
387
B
33
1
1
0
...
NaN
NaN
13
51
50
36
12
1
1
1.68
10166
NaN
Stockton Intermediate
Stockton Unified
San Joaquin
381
1
24
9
2
0
...
NaN
9
10
88
38
41
16
3
2
1.90
10167
NaN
Valley Atwater Community Day
Merced County Office of Educat
Merced
381
B
9
1
3
0
...
NaN
NaN
17
76
50
32
14
1
3
1.74
10168
NaN
Evergreen Continuation
Los Angeles Unified
Los Angeles
378
B
6
1
0
0
...
NaN
NaN
13
52
43
35
13
9
0
1.87
10172
NaN
Crossroads Community Day
Lancaster Elementary
Los Angeles
368
B
52
0
0
0
...
7
8
10
98
36
33
26
5
0
2.00
10175
NaN
Mountain View High (Continuation)
Mojave Unified
Kern
358
B
63
0
0
0
...
NaN
NaN
23
95
28
39
22
6
6
2.22
10177
NaN
Lewis Opportunity
Santa Rosa City Schools
Sonoma
354
1
5
16
0
0
...
NaN
NaN
4
60
42
33
18
2
4
1.93
10181
NaN
Central Juvenile Hall
Los Angeles County Office of E
Los Angeles
339
1
29
1
1
0
...
NaN
NaN
NaN
56
26
42
9
22
1
2.28
10182
Y
Charter Alternatives Academy
Visalia Unified
Tulare
337
B
6
1
3
0
...
NaN
NaN
23
93
32
36
21
8
2
2.12
10183
NaN
NaN
Colusa County Office of Educat
Colusa
336
B
0
3
0
0
...
NaN
NaN
10
91
48
23
16
13
0
1.94
10184
NaN
Esperanza High
Cutler-Orosi Joint Unified
Tulare
330
1
0
0
0
0
...
4
NaN
13
79
64
30
3
3
0
1.45
3324 rows × 30 columns
Let's look specifically at Solano Avenue Elementary, which has an API of 922 and 80 percent of students being in the free/reduced lunch program. If you were to use the above regression to predict how well Solano would do, it would look like this:
In [33]:
lr.predict(80)
Out[33]:
array([ 749.6728627])
With an index of 922, clearly the school is overperforming what our simplified model expects.
Content source: datapolitan/lede_algorithms
Similar notebooks: