In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
Let's load the data and give it a quick look.
In [3]:
df = pd.read_csv('data/apib12tx.csv')
In [4]:
df.describe()
Out[4]:
             API12B        PCT_AA        PCT_AI        PCT_AS        PCT_FI        PCT_HI        PCT_PI        PCT_WH        PCT_MR         MEALS
count  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000
mean     787.467262      6.504172      1.109257      7.154216      2.149602     49.908216      0.504957     29.520467      2.085108     59.667517
std      104.176630     10.539728      4.082059     12.321782      4.028955     29.159250      1.031627     26.159148      2.939391     29.600844
min      317.000000      0.000000      0.000000      0.000000      0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
25%      739.000000      1.000000      0.000000      1.000000      0.000000     24.000000      0.000000      5.000000      0.000000     36.000000
50%      797.000000      3.000000      0.000000      3.000000      1.000000     48.000000      0.000000     22.000000      1.000000     64.000000
75%      857.000000      8.000000      1.000000      8.000000      3.000000     76.000000      1.000000     51.000000      3.000000     85.000000
max      998.000000     95.000000     92.000000     96.000000     65.000000    100.000000     18.000000    100.000000     23.000000    100.000000

       ...       ACS_K3       ACS_46     ACS_CORE      PCT_RESP       NOT_HSG           HSG      SOME_COL      COL_GRAD      GRAD_SCH        AVG_ED
count  ...  6634.000000  7511.000000  4489.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10187.000000  10118.000000
mean   ...    22.059090    25.723605    23.432168     85.550702     20.058506     24.698537     24.264651     18.515952     11.794935      2.771169
std    ...     5.839773     6.902056     7.683286     18.622835     17.679325     12.319831     11.020448     12.452289     13.874110      0.754851
min    ...     1.000000     1.000000     1.000000      0.000000      0.000000      0.000000      0.000000      0.000000      0.000000      1.000000
25%    ...    20.000000    24.000000    20.000000     82.000000      5.000000     16.000000     16.000000      8.000000      3.000000      2.180000
50%    ...    23.000000    28.000000    26.000000     92.000000     16.000000     26.000000     24.000000     16.000000      6.000000      2.670000
75%    ...    26.000000    30.000000    29.000000     97.000000     32.000000     33.000000     32.000000     28.000000     16.000000      3.290000
max    ...    43.000000    46.000000    45.000000    100.000000    100.000000    100.000000    100.000000    100.000000     89.000000      4.880000
8 rows × 25 columns
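Note the count row: ACS_K3, ACS_46, ACS_CORE and AVG_ED have fewer than 10,187 observations, so those columns contain missing values that will surface as NaN later on. A quick way to tally them (a sketch, not part of the original notebook; Series.sort_values assumes pandas 0.17+):

# count missing values per column, largest first
df.isnull().sum().sort_values(ascending=False).head()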
Let's start looking at how variables in our dataset relate to each other so we know what to expect when we start modeling.
In [5]:
df.corr()
Out[5]:
            API12B    PCT_AA    PCT_AI    PCT_AS    PCT_FI    PCT_HI    PCT_PI    PCT_WH    PCT_MR     MEALS
API12B    1.000000 -0.223117 -0.126489  0.362361  0.153552 -0.418466 -0.060199  0.363066  0.210260 -0.528166
PCT_AA   -0.223117  1.000000 -0.067107 -0.049370  0.057254 -0.092423  0.219737 -0.282255 -0.003058  0.182797
PCT_AI   -0.126489 -0.067107  1.000000 -0.088917 -0.080058 -0.187081 -0.029631  0.128484  0.054740  0.040794
PCT_AS    0.362361 -0.049370 -0.088917  1.000000  0.232352 -0.363255  0.071994 -0.077984  0.105744 -0.312039
PCT_FI    0.153552  0.057254 -0.080058  0.232352  1.000000 -0.142512  0.282081 -0.129342  0.095510 -0.141283
PCT_HI   -0.418466 -0.092423 -0.187081 -0.363255 -0.142512  1.000000 -0.077148 -0.784308 -0.458183  0.746475
PCT_PI   -0.060199  0.219737 -0.029631  0.071994  0.282081 -0.077148  1.000000 -0.116163  0.036753  0.028549
PCT_WH    0.363066 -0.282255  0.128484 -0.077984 -0.129342 -0.784308 -0.116163  1.000000  0.329157 -0.695871
PCT_MR    0.210260 -0.003058  0.054740  0.105744  0.095510 -0.458183  0.036753  0.329157  1.000000 -0.381300
MEALS    -0.528166  0.182797  0.040794 -0.312039 -0.141283  0.746475  0.028549 -0.695871 -0.381300  1.000000
P_GATE    0.300525  0.011117 -0.075187  0.233122  0.128334 -0.154795  0.004900  0.048766  0.067234 -0.201800
P_MIGED  -0.153190 -0.150501 -0.020439 -0.134536 -0.106317  0.347309 -0.093803 -0.214935 -0.177547  0.263676
P_EL     -0.261477 -0.079102 -0.158655 -0.062005 -0.038954  0.724659  0.024503 -0.665903 -0.341988  0.636838
P_RFEP   -0.200273 -0.055021 -0.161894  0.122811  0.055542  0.525937 -0.007727 -0.557736 -0.303721  0.397674
P_DI     -0.153766  0.070958  0.048868 -0.066227  0.004943 -0.028171  0.029540  0.023305  0.006032  0.049876
ACS_K3    0.182052 -0.000674 -0.159973  0.126658  0.089452  0.115690  0.075881 -0.176037  0.007579  0.017383
ACS_46    0.254228 -0.070089 -0.184140  0.178142  0.120321  0.069356  0.049283 -0.131902  0.056607 -0.035894
ACS_CORE  0.437987 -0.036594 -0.218363  0.251559  0.214681  0.048252  0.019570 -0.126703 -0.053738 -0.095275
PCT_RESP  0.223457 -0.178875 -0.031626 -0.004432  0.068215 -0.084089 -0.029653  0.175599  0.132515 -0.164411
NOT_HSG  -0.535325  0.003424 -0.061002 -0.271720 -0.204777  0.798176 -0.052520 -0.664536 -0.408829  0.734150
HSG      -0.519386  0.149530  0.113335 -0.252820 -0.083519  0.520316  0.083561 -0.494585 -0.295447  0.710693
SOME_COL  0.011033  0.134386  0.145105 -0.213072  0.090039 -0.268786  0.092990  0.286760  0.113340 -0.079904
COL_GRAD  0.591692 -0.097386 -0.075846  0.326789  0.263867 -0.685352  0.009552  0.580454  0.378918 -0.785937
GRAD_SCH  0.582731 -0.151995 -0.075958  0.389559  0.031945 -0.602600 -0.081522  0.522910  0.348351 -0.754539
AVG_ED    0.652429 -0.098450 -0.030800  0.370071  0.165709 -0.797608 -0.017244  0.683077  0.432389 -0.872949

          ...    ACS_K3    ACS_46  ACS_CORE  PCT_RESP   NOT_HSG       HSG  SOME_COL  COL_GRAD  GRAD_SCH    AVG_ED
API12B    ...  0.182052  0.254228  0.437987  0.223457 -0.535325 -0.519386  0.011033  0.591692  0.582731  0.652429
PCT_AA    ... -0.000674 -0.070089 -0.036594 -0.178875  0.003424  0.149530  0.134386 -0.097386 -0.151995 -0.098450
PCT_AI    ... -0.159973 -0.184140 -0.218363 -0.031626 -0.061002  0.113335  0.145105 -0.075846 -0.075958 -0.030800
PCT_AS    ...  0.126658  0.178142  0.251559 -0.004432 -0.271720 -0.252820 -0.213072  0.326789  0.389559  0.370071
PCT_FI    ...  0.089452  0.120321  0.214681  0.068215 -0.204777 -0.083519  0.090039  0.263867  0.031945  0.165709
PCT_HI    ...  0.115690  0.069356  0.048252 -0.084089  0.798176  0.520316 -0.268786 -0.685352 -0.602600 -0.797608
PCT_PI    ...  0.075881  0.049283  0.019570 -0.029653 -0.052520  0.083561  0.092990  0.009552 -0.081522 -0.017244
PCT_WH    ... -0.176037 -0.131902 -0.126703  0.175599 -0.664536 -0.494585  0.286760  0.580454  0.522910  0.683077
PCT_MR    ...  0.007579  0.056607 -0.053738  0.132515 -0.408829 -0.295447  0.113340  0.378918  0.348351  0.432389
MEALS     ...  0.017383 -0.035894 -0.095275 -0.164411  0.734150  0.710693 -0.079904 -0.785937 -0.754539 -0.872949
P_GATE    ...  0.083439  0.143832  0.371522  0.001516 -0.181664 -0.192168 -0.047112  0.224105  0.234954  0.241601
P_MIGED   ...  0.036802 -0.013519 -0.028966  0.056428  0.376072  0.124583 -0.157987 -0.273858 -0.201256 -0.316049
P_EL      ...  0.138346  0.109007  0.004258 -0.095159  0.707431  0.413443 -0.398401 -0.536341 -0.447442 -0.654460
P_RFEP    ...  0.108101  0.111874  0.256516 -0.124148  0.532067  0.245830 -0.336218 -0.356630 -0.292245 -0.457698
P_DI      ... -0.301005 -0.287175 -0.207121 -0.165832 -0.020500  0.063822  0.024405 -0.026593 -0.062990 -0.031134
ACS_K3    ...  1.000000  0.793651  0.699465  0.138815  0.024870 -0.014883 -0.033463  0.005454  0.005000 -0.005821
ACS_46    ...  0.793651  1.000000  0.797873  0.159163 -0.031117 -0.045236 -0.051904  0.073514  0.058967  0.056500
ACS_CORE  ...  0.699465  0.797873  1.000000  0.123847 -0.083922 -0.166419 -0.071366  0.202447  0.164005  0.160656
PCT_RESP  ...  0.138815  0.159163  0.123847  1.000000 -0.119829 -0.130072  0.173556  0.170700  0.200911  0.204639
NOT_HSG   ...  0.024870 -0.031117 -0.083922 -0.119829  1.000000  0.430125 -0.441862 -0.746153 -0.580582 -0.879174
HSG       ... -0.014883 -0.045236 -0.166419 -0.130072  0.430125  1.000000  0.012790 -0.691139 -0.727006 -0.754308
SOME_COL  ... -0.033463 -0.051904 -0.071366  0.173556 -0.441862  0.012790  1.000000  0.092047 -0.216354  0.147892
COL_GRAD  ...  0.005454  0.073514  0.202447  0.170700 -0.746153 -0.691139  0.092047  1.000000  0.667201  0.884819
GRAD_SCH  ...  0.005000  0.058967  0.164005  0.200911 -0.580582 -0.727006 -0.216354  0.667201  1.000000  0.874638
AVG_ED    ... -0.005821  0.056500  0.160656  0.204639 -0.879174 -0.754308  0.147892  0.884819  0.874638  1.000000
25 rows × 25 columns
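A 25-by-25 matrix is hard to scan. Since API12B is the outcome we care about, one convenient shortcut (a sketch, not part of the original notebook; Series.sort_values assumes pandas 0.17+) is to pull out just that column and sort it:

# every variable's correlation with the API score, most negative first
df.corr()['API12B'].sort_values()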
The percentage of students enrolled in free/reduced-price lunch programs is often used as a proxy for poverty.
In [6]:
df.plot(kind="scatter", x="MEALS", y="API12B")
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x103f3c6d0>
Conversely, the education level of a student's parents is often a good predictor of how well a student will do in school.
In [7]:
df.plot(kind="scatter", x="AVG_ED", y="API12B")
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1084ec1d0>
As we did last week, we'll use scikit-learn to run basic single-variable regressions. Let's start by looking at California's Academic Performance Index (API) as it relates to the percentage of students at each school enrolled in free/reduced-price lunch programs.
In [15]:
# column 1 (MEALS) becomes the feature x; column 0 (API12B) is the target y
data = np.asarray(df[['API12B', 'MEALS']])
x, y = data[:, 1:], data[:, 0]
In [16]:
lr = LinearRegression()
lr.fit(x, y)
Out[16]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [19]:
# the slope of the fitted line: API points lost per percentage point of MEALS
lr.coef_
Out[19]:
array([-1.85881873])
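The slope says the model docks about 1.86 API points for each additional percentage point of students in the lunch program. The line's other parameter lives in lr.intercept_; working backwards from the prediction for MEALS = 80 shown near the end of this notebook (about 749.7), it comes out to roughly 898.4, so the fitted line is approximately:

predicted API ≈ 898.4 - 1.86 × MEALS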
In [20]:
# R-squared: MEALS alone explains about 28% of the variance in API12B
lr.score(x, y)
Out[20]:
0.27895982917584949
In [21]:
# plot the linear regression line on the scatter plot
plt.scatter(x, y, color='blue')
plt.plot(x, lr.predict(x), color='red', linewidth=1)
Out[21]:
[<matplotlib.lines.Line2D at 0x108c7d990>]
In our naive universe where we're only paying attention to two variables -- academic performance and free/reduced-price lunch -- we can clearly see that some schools are performing well above what the model would predict for their level of poverty.
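Rather than eyeballing the scatter, a more systematic way to surface those schools is to rank them by residual, i.e. actual API minus the API the line predicts (a sketch, not part of the original notebook; sort_values assumes pandas 0.17+):

# how far each school sits above (or below) the regression line
resid = df[['SNAME', 'MEALS', 'API12B']].copy()
resid['PREDICTED'] = lr.predict(resid[['MEALS']].values)
resid['RESIDUAL'] = resid['API12B'] - resid['PREDICTED']
resid.sort_values('RESIDUAL', ascending=False).head(10)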
A handful, in particular, seem to be dramatically overperforming. Let's look at them:
In [22]:
df[(df['MEALS'] >= 80) & (df['API12B'] >= 900)]
Out[22]:
     CHARTER                                      SNAME                            DNAME           CNAME  API12B ST_RANK
15       NaN              Chin (John Yehall) Elementary            San Francisco Unified   San Francisco     989      10
51         D   American Indian Public Charter School II                  Oakland Unified         Alameda     978      10
161        D                        KIPP Raices Academy              Los Angeles Unified     Los Angeles     961      10
207      NaN                         Lincoln Elementary                  Oakland Unified         Alameda     957      10
221        D                       Oakland Charter High                  Oakland Unified         Alameda     956      10
445      NaN                         Richardson Prep Hi      San Bernardino City Unified  San Bernardino     939      10
510        D                    Synergy Charter Academy              Los Angeles Unified     Los Angeles     935      10
542        Y                          Sixth Street Prep                Victor Elementary  San Bernardino     933      10
683        D       KIPP Los Angeles College Preparatory              Los Angeles Unified     Los Angeles     925      10
702        D         Rocketship Mateo Sheedy Elementary   Santa Clara County Office of E     Santa Clara     924      10
739      NaN                   Solano Avenue Elementary              Los Angeles Unified     Los Angeles     922      10
788      NaN                   Laurel Street Elementary                  Compton Unified     Los Angeles     920       9
865      NaN                        Middle College High      San Bernardino City Unified  San Bernardino     916      10
939      NaN          Design Science Early College High                   Fresno Unified          Fresno     913      10
964      NaN                            Frank Lanterman              Los Angeles Unified     Los Angeles     912       C
1054       D      Futuro College Preparatory Elementary              Los Angeles Unified     Los Angeles     908       9
1071       D                     KIPP Heartwood Academy       Alum Rock Union Elementary     Santa Clara     907       9

      PCT_AA  PCT_AI  PCT_AS  PCT_FI  ...
15         1       0      88       1  ...
51         5       1      84       0  ...
161        2       0       0       0  ...
207        3       0      87       2  ...
221        5       0      29       0  ...
445       11       1       2       1  ...
510        4       0       0       0  ...
542        8       1       1       0  ...
683        1       0       2       0  ...
702        1       0       1       0  ...
739        4       1      50       2  ...
788       20       0       0       0  ...
865        9       1       1       2  ...
939        7       0      18       0  ...
964       12       0       2       4  ...
1054       0       0       0       0  ...
1071       2       0      15       2  ...

      ACS_K3  ACS_46  ACS_CORE  PCT_RESP  NOT_HSG  HSG  SOME_COL  COL_GRAD  GRAD_SCH  AVG_ED
15        21      27       NaN        96       17   47        11        22         3    2.47
51       NaN       2       NaN        14       40   14        26        21         0    2.28
161      NaN     NaN       NaN        93       14   36        35         8         7    2.59
207       24      30       NaN        46       19   37        18        17         8    2.59
221      NaN     NaN         5        70       44   35        15         6         0    1.83
445      NaN      32        29        76       26   28        22        13        11    2.54
510      NaN     NaN       NaN        77       40   28        24         7         1    2.03
542       22      34       NaN        92       36   23        20         9        13    2.38
683      NaN     NaN       NaN        99       28   37        27         4         4    2.20
702       24      25       NaN        91       45   24        14        11         7    2.10
739       19      17       NaN        89       19   26        28        20         7    2.70
788       28      31       NaN        90       46   31        16         5         1    1.84
865      NaN     NaN        22        85       22   29        30        14         5    2.51
939      NaN     NaN        29        95       18   31        35         8         8    2.56
964      NaN     NaN        11        49       31   31        18        16         4    2.31
1054      32     NaN       NaN        98       28   29        31         8         5    2.32
1071     NaN      34        31        91       39   26        19        12         4    2.16
17 rows × 30 columns
Let's look specifically at Solano Avenue Elementary, which has an API of 922 even with 80 percent of its students in the free/reduced-price lunch program. If we use the regression above to predict how well Solano would do, it looks like this:
In [14]:
# predict expects a 2-D array of samples in newer scikit-learn versions
lr.predict([[80]])
Out[14]:
array([ 749.6728627])
The model predicts an API of about 750 for a school with 80 percent of its students in the lunch program. With an actual index of 922, the school is clearly overperforming what our simplified model expects.
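To put a number on that gap (a quick check, not part of the original notebook):

# Solano's actual API minus the model's prediction at MEALS = 80
922 - lr.predict([[80]])[0]   # about 172 points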
Content source: datapolitan/lede_algorithms