nyc-schools_D

This script imports the *.csv file produced by nyc-schools_C.ipynb and builds regression and classification models aiming to predict school outcome from local Census variables.



In [83]:

    
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from copy import deepcopy
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from collections import Counter, defaultdict
%matplotlib inline

bp_data = '/Users/bryanfry/projects/proj_nyc-schools/data_files'

Functions to evaluate classification models



In [2]:

    
# Function to binarize a variable -- returns True if between given floor and ceiling, 
# otherwise returns False.
def binarize (d, pos_floor, pos_ceiling):
    d = np.array (d)
    r = ( (d > pos_floor) & (d < pos_ceiling))
    return r

#################################################

# Given a list of dictionaries, sums values with matching keys and then takes avg.
def dict_mean (dict_list):
    n = len (dict_list)
    r = defaultdict(float)
    for d in dict_list:
        for k in d.keys():
            if k in r.keys():
                r[k] = r[k] + float(d[k])
            else:
                r[k] = float (d[k])
    for k in r.keys():
        r[k] = r[k] / n
    return dict (r)

#################################################

# EVALUATE A BINARY CLASSIFICATION MODEL MODEL

# POS_FLOOR and POS_CEILING define a range of positive values.
# So if we want 4th and 5th quintiles, these should be 3.9 and 5.1

# This function executes a specified number of train/test splits on the features
# and labels, and returns average results for accuracy, precision, and recall.
def eval_model (model, name, x, y, pos_floor, pos_ceiling, n_split, \
                test_size = 0.3, random_state = None, verbose = True):
    y = binarize (y, pos_floor, pos_ceiling)

    acc_list, rec_list, prec_list, ctr_test_list, ctr_pred_list =[],[],[],[],[]
    
    for i in range (0, n_split):
        #DBG
        #print x
        #print y
        #print test_size
        #print random_state
        
        x_train, x_test, y_train, y_test = \
            train_test_split (x, y, test_size = test_size, random_state = random_state)

        model.fit (x_train, y_train)
        y_pred = model.predict(x_test)

        acc_list.append (accuracy_score (y_test, y_pred))
        rec_list.append (recall_score(y_test, y_pred))
        prec_list.append ((y_test, y_pred))

        ctr_test_list.append (Counter (y_test))
        ctr_pred_list.append (Counter (y_pred))
    
    acc, prec, rec = np.mean (acc_list), np.mean (rec_list), np.mean (prec_list)
    ctr_test = dict_mean (ctr_test_list)
    ctr_pred = dict_mean (ctr_pred_list)
    
    if verbose == True:
        print '*** ' + name + ' ***   ' 
        print 'ACTUAL LABEL COUNTS: ' + str (ctr_test)
        print 'PREDICTED LABEL COUNTS: ' + str (ctr_pred)
        print 'Accuracy = ' + str(np.around(acc, 3)) + \
        ' / Precision = ' + str (np.around(prec,3)) + \
        ' / Recall = ' + str (np.around (rec,3)) + '\n'    
    
    return acc, prec, rec, ctr_test, ctr_pred

#################################################

# LOOP ON MULTIPLE MODELS

def run_multiple_models (model_list, name_list, target_name, x, y, pos_floor, pos_ceiling, \
                         n_split = 500, test_size = 0.3, normalize = True):
    print '**** RUNNING SERIES OF CLASSIFICATION MODELS ****\n'
    print 'Test Fraction = ' + str (test_size) + '   # Test/Train Splits = ' + str (n_split)
    print 'Target = ' + target_name
    print 'Features = ' + '  '.join (x.columns.tolist()) + '\n'
    
    # Loop on models
    for model, name in zip (model_list, name_list):
        if normalize == True:
            x = preprocessing.normalize(x, norm='l2')
        acc, rec, prec, ctr_test, ctr_pred = eval_model (model, name, x, y, pos_floor, \
                                                         pos_ceiling, n_split, test_size)

Function to normalize model features

Fits each feature in the feature dataframe to a Guassian distribution.



In [25]:

    
def scale_features (df):    
    r = pd.DataFrame()
    for c in df.columns:
        r[c] = (df[c] - df[c].mean()) / df[c].std()
    return r

Functions for visualization



In [228]:

    
def plot_correlation_matrix (df):
    corr = df.corr()  # Compute correlation matrix
    for i in range (len (corr)): corr.ix [i,i] = 0 # Set diagonal to 0, improves visualization
    f, ax = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr, mask = None, cmap=None, vmax=.3,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax, annot = True);
    return None

def plot_scatter_and_fit (x, y, title, x_label, y_label, x_rng, y_rng):
    x = np.reshape (x, (len(x),1))  # Since only one x-column, need to reshape in into 2d Nx1 array
    model = LinearRegression ()
    model.fit (x, y)
    pred = model.predict (x)

    f, ax = plt.subplots(figsize=(11, 9))
    plt.plot (x,pred, color = 'Plum', linewidth = 10, zorder=1)
    plt.scatter (x, y, color = 'DodgerBlue', s = 120, alpha = 0.5, zorder=2)
    plt.title (title, fontsize = 30)
    plt.xlabel(x_label, fontsize = 24)
    plt.ylabel(y_label, fontsize = 24)
    plt.tick_params(axis='both', which='major', labelsize=24)
    plt.tick_params(axis='both', which='minor', labelsize=24)
    plt.ylim (  (np.floor (y.min()), np.ceil(y.max()) ) )
    plt.xlim (  (np.floor (x.min()), np.ceil(x.max()) ) )



In [229]:

    
np.floor (4.3)
np.ceil (4.3)









    Out[229]:





5.0

MAIN

We will be attempting to build models using features derived from the average census variables of the N closest census tracts to each school. Use the file for N=20.

The target variables of interest (from NYC Open Data on education) are the dropout rate and 'advanced regents' graduation rate. Both of these are expressed as percentages of the total graduating cohort. To simplify interpretation, we will change dropout rate to a graduation rate (100 - dropput). This change means that for both target variables, higher values will be interpretted as "good" outcomes.

We will also created a new dataframe where all the features and target values are normalized to best fit a Guassian distribution.



In [246]:

    
# Load Data, create 'graduated' column, and drop empties
df_features, df_grad, df_adv_ref = load_and_normalize_data ( \
                                   os.path.join (bp_data, 'df_C_sch_acs_NTract=10.csv'))



In [247]:

    
df = pd.read_csv (fp_in)
df = df.drop (df.columns[0], axis=1)
df ['GRADUATED_%'] = 100.0 - df['DROPPED_OUT_%']
df = df.dropna()

# Create normalized features and targets
feature_list = ['RENT_INCOME_RATIO', 'FRAC_NONCITIZEN', 'MEDIAN_INCOME',\
                'MEDIAN_AGE', 'FRAC_MINORITY', 'FRAC_MOVED']
df_features = scale_features (df[feature_list])
df_grad = scale_features (pd.DataFrame (df['GRADUATED_%']))
df_advreg = scale_features (pd.DataFrame (df['ADV_REGENTS_%_COHORT']))

Start by examing correlation between features

We will focus on 6 engineered features, derived from the Census data in nyc-school_C.ipynb. Begin by plotting the correlation matrix; very highly correlated features are redundent should probably not be included in the model. NOTE: elements on the diagonal are set to 0 to ease the visualization; they of course actually have values of 1



In [248]:

    
plot_correlation_matrix (df_features)

The strongest correlation is -0.75 between MEDIAN_INCOME and RENT_INCOME_RATIO (not surprising as rent is expected to be a larger fraction of income in poorer neighboorhoods). The second largest correlation, more surprisingly, is 0.7 between MEDIAN_INCOME and FRAC_MOVED (this is the fraction of resisdents who reported moving in the last year). Since median income appears to be highly correlated with two other features, I will remove it from the feature set, and plot the updated correlation matrix.



In [249]:

    
feature_list = ['RENT_INCOME_RATIO', 'FRAC_NONCITIZEN',\
                'MEDIAN_AGE', 'FRAC_MINORITY', 'FRAC_MOVED']
df_features = df_features [feature_list]
plot_correlation_matrix (df_features)

Use statsmodels.OLS (ordinary least squares fit) to examine possible linear relationships between school outcomes and features.

First we will produce linear OLS model using the five remaining features to predict graudation rate.



In [250]:

    
# Add a constant term (all 1's) to the features.  This allow statsmodels.OLS to compute a non-zero
# intercept value.

df_features = sm.tools.tools.add_constant (df_features, prepend = False)
model = sm.OLS (df_grad, df_features)
res = model.fit()
res.summary()









    Out[250]:





OLS Regression Results

  Dep. Variable:        GRADUATED_%      R-squared:             0.058


  Model:                    OLS          Adj. R-squared:        0.040


  Method:              Least Squares     F-statistic:           3.150


  Date:              Mon, 23 Jan 2017    Prob (F-statistic):   0.00888


  Time:                  16:34:08        Log-Likelihood:      -363.44


  No. Observations:          262         AIC:                   738.9


  Df Residuals:              256         BIC:                   760.3


  Df Model:                    5                                     


  Covariance Type:       nonrobust                                   




                       coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  RENT_INCOME_RATIO     -0.2164      0.112     -1.932   0.054     -0.437     0.004


  FRAC_NONCITIZEN       -0.0133      0.077     -0.174   0.862     -0.164     0.138


  MEDIAN_AGE            -0.0170      0.082     -0.206   0.837     -0.179     0.145


  FRAC_MINORITY          0.0953      0.068      1.408   0.160     -0.038     0.228


  FRAC_MOVED             0.0710      0.085      0.839   0.402     -0.096     0.238


  const              -1.091e-14      0.061   -1.8e-13   1.000     -0.119     0.119




  Omnibus:        13.069    Durbin-Watson:         1.899


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     12.481


  Skew:           -0.481    Prob(JB):            0.00195


  Kurtosis:        2.534    Cond. No.               3.61

Of the five features, only RENT_INCOME_RATIO has a p-value < 0.05. The negative value for the coeffient (-0.207) indicates that higher RENT_INCOME_RATIO correlates with lower graduation rates; this is an intuitive result. The next smallest p-value is 0.159 for FRAC_MINORITY (0.159), and the positive corefficient indicates that schools in neighboorhoods with larger minority populations exhibit HIGHER graduation rates, although the magnitude of the effect is not statistically significant.

We will compute linear OLS model using the same five features to predict graduation with advanced regents



In [251]:

    
model = sm.OLS (df_advreg, df_features)
res = model.fit()
res.summary()









    Out[251]:





OLS Regression Results

  Dep. Variable:     ADV_REGENTS_%_COHORT    R-squared:             0.060


  Model:                      OLS            Adj. R-squared:        0.042


  Method:                Least Squares       F-statistic:           3.277


  Date:                Mon, 23 Jan 2017      Prob (F-statistic):   0.00691


  Time:                    16:34:09          Log-Likelihood:      -363.13


  No. Observations:            262           AIC:                   738.3


  Df Residuals:                256           BIC:                   759.7


  Df Model:                      5                                       


  Covariance Type:         nonrobust                                     




                       coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  RENT_INCOME_RATIO     -0.0304      0.112     -0.272   0.786     -0.251     0.190


  FRAC_NONCITIZEN       -0.0349      0.077     -0.456   0.649     -0.186     0.116


  MEDIAN_AGE             0.1865      0.082      2.264   0.024      0.024     0.349


  FRAC_MINORITY         -0.0886      0.068     -1.312   0.191     -0.222     0.044


  FRAC_MOVED            -0.0933      0.085     -1.104   0.271     -0.260     0.073


  const               1.596e-16      0.060   2.64e-15   1.000     -0.119     0.119




  Omnibus:        133.806    Durbin-Watson:         2.062 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    536.112 


  Skew:            2.219     Prob(JB):           3.84e-117


  Kurtosis:        8.424     Cond. No.               3.61

The only statistically significant feature is MEDIAN_AGE. The positive coefficient (0.1807) indicates that school located in neighboorhood with more old people produce more advanced regents graduates. Interesting.

NOTE: The data shown here using the input file where census variables for the 10 nearest census tracts are averaged for each school. Running these models with averages for 5, 20, or 50 nearest tracts produce slightly different results of course, but do not change the conclusions regarding feature significance!

Plot linear regression models for graduation and advanced regents rates

We will use sklearn to train simple regression models for each target value, using only the features that have been shown above to be statistically significant. Make scatter plots and fits for the models.



In [252]:

    
x = df_features['RENT_INCOME_RATIO'].values
y = df_grad.values

plot_scatter_and_fit (x, y, '\nGraduation Rate vs. Rent / Income\n', '\nNormalized Rent / Income\n', \
                      'Normalized Graduation Rate\n', x_rng=None, y_rng=None)



In [253]:

    
x = df_features['MEDIAN_AGE'].values
y = df_advreg.values

plot_scatter_and_fit (x, y, '\nAdvanced Regents Graduation vs. Median Age\n', '\nMedian Age (Normalized)', \
                      'Advanced Regents Graduation (Norm.)\n', x_rng=None, y_rng=None)

Neither of these regression models is very convincing. Furthermore, it is clear that both the features and target are not close to being nromally distributed. We will put regression aside for now and try to develop classification models to predict whether schools will fall in the top 40% (top two quintiles) for overall graduation rate and advanced regents graduations.

Attempt to classify schools in top 2 quintiles (top 40%) in graduation rate

We are putting aside regression and moving on to binary classification models. The quintiles for graduation will be the 'inverse' of those of drop-out rate (already computed), so we take (6-x) to get graduation rate quintiles.

We will use several models to attempt to classify schools into the top 40% or bottom 40% for overall graduation rate. Each model will be run with 100 independent random train/test splits, with 30% of the schools relegated to the test set in each case. For each model, compute average accuracy, precision, and recall for the 100 splits. The following models are tested:

Logistic Regression
Linear Support Vector Classifier
k-Nearest Neighbors, with k=1, k=3, and k=5
Random Forest



In [259]:

    
y = 6.0 - df['Q_DROPPED_OUT_%']
x = df_features
floor, ceiling = 3.99, 5.01 # Define range of top 2 quintiles

# List of classification models to try, along with labels to identify the models.
model_list = [LogisticRegression(),SVC(), KNeighborsClassifier(n_neighbors = 5),KNeighborsClassifier(n_neighbors = 3),\
             KNeighborsClassifier(n_neighbors = 1), \
             RandomForestClassifier (n_estimators = 10)]
name_list= ['Logistic_Regression', 'SVC', 'k-Nearest Neighbors N=5',\
            'k-Nearest Neighbors N=3', 'k-Nearest Neighbors N=1', 'RANDOM_FOREST']

run_multiple_models (model_list, name_list, 'Top 40% Graduation Rate', x, y, floor, \
                     ceiling, n_split = 100, test_size = 0.3, normalize=True)









    



**** RUNNING SERIES OF CLASSIFICATION MODELS ****

Test Fraction = 0.3   # Test/Train Splits = 100
Target = Top 40% Graduation Rate
Features = RENT_INCOME_RATIO  FRAC_NONCITIZEN  MEDIAN_AGE  FRAC_MINORITY  FRAC_MOVED  const

*** Logistic_Regression ***   
ACTUAL LABEL COUNTS: {False: 47.68, True: 31.32}
PREDICTED LABEL COUNTS: {False: 64.71, True: 14.29}
Accuracy = 0.588 / Precision = 0.215 / Recall = 0.289

*** SVC ***   
ACTUAL LABEL COUNTS: {False: 47.12, True: 31.88}
PREDICTED LABEL COUNTS: {False: 75.51, True: 3.49}
Accuracy = 0.582 / Precision = 0.044 / Recall = 0.224

*** k-Nearest Neighbors N=5 ***   
ACTUAL LABEL COUNTS: {False: 47.39, True: 31.61}
PREDICTED LABEL COUNTS: {False: 51.68, True: 27.32}
Accuracy = 0.514 / Precision = 0.328 / Recall = 0.373

*** k-Nearest Neighbors N=3 ***   
ACTUAL LABEL COUNTS: {False: 46.97, True: 32.03}
PREDICTED LABEL COUNTS: {False: 51.82, True: 27.18}
Accuracy = 0.528 / Precision = 0.346 / Recall = 0.375

*** k-Nearest Neighbors N=1 ***   
ACTUAL LABEL COUNTS: {False: 47.25, True: 31.75}
PREDICTED LABEL COUNTS: {False: 48.44, True: 30.56}
Accuracy = 0.521 / Precision = 0.387 / Recall = 0.394

*** RANDOM_FOREST ***   
ACTUAL LABEL COUNTS: {False: 46.34, True: 32.66}
PREDICTED LABEL COUNTS: {False: 54.64, True: 24.36}
Accuracy = 0.533 / Precision = 0.312 / Recall = 0.361

Unfortunately, none of these models exhibit reliable classification. The best accuracy is 0.588 (we would expect 0.6 accuracy with random assignment, given the class imbalance). All of the models underpredict the number of schools in the top 40% of graduation rates, although this issue is particular pronounced with the Logistic Regression and SVC models.

Now to classify schools in top 2 quintiles (top 40%) for Advanced Regenets graduation rate.



In [260]:

    
y = 6.0 - df['Q_ADV_REGENTS_%_COHORT']
x = df_features
floor, ceiling = 3.99, 5.01 # Define range of top 2 quintiles

# List of classification models to try, along with labels to identify the models.
model_list = [LogisticRegression(),SVC(), KNeighborsClassifier(n_neighbors = 5),KNeighborsClassifier(n_neighbors = 3),\
             KNeighborsClassifier(n_neighbors = 1), \
             RandomForestClassifier (n_estimators = 10)]
name_list= ['Logistic_Regression', 'SVC', 'k-Nearest Neighbors N=5',\
            'k-Nearest Neighbors N=3', 'k-Nearest Neighbors N=1', 'RANDOM_FOREST']

run_multiple_models (model_list, name_list, 'Top 40% Graduation Rate', x, y, floor, \
                     ceiling, n_split = 100, test_size = 0.3, normalize=True)









    



**** RUNNING SERIES OF CLASSIFICATION MODELS ****

Test Fraction = 0.3   # Test/Train Splits = 100
Target = Top 40% Graduation Rate
Features = RENT_INCOME_RATIO  FRAC_NONCITIZEN  MEDIAN_AGE  FRAC_MINORITY  FRAC_MOVED  const

*** Logistic_Regression ***   
ACTUAL LABEL COUNTS: {False: 48.05, True: 30.95}
PREDICTED LABEL COUNTS: {False: 66.75, True: 12.25}
Accuracy = 0.596 / Precision = 0.191 / Recall = 0.273

*** SVC ***   
ACTUAL LABEL COUNTS: {False: 47.67, True: 31.33}
PREDICTED LABEL COUNTS: {False: 78.24, True: 0.76}
Accuracy = 0.601 / Precision = 0.011 / Recall = 0.203

*** k-Nearest Neighbors N=5 ***   
ACTUAL LABEL COUNTS: {False: 47.45, True: 31.55}
PREDICTED LABEL COUNTS: {False: 52.58, True: 26.42}
Accuracy = 0.537 / Precision = 0.345 / Recall = 0.367

*** k-Nearest Neighbors N=3 ***   
ACTUAL LABEL COUNTS: {False: 47.96, True: 31.04}
PREDICTED LABEL COUNTS: {False: 47.21, True: 31.79}
Accuracy = 0.533 / Precision = 0.422 / Recall = 0.398

*** k-Nearest Neighbors N=1 ***   
ACTUAL LABEL COUNTS: {False: 47.36, True: 31.64}
PREDICTED LABEL COUNTS: {False: 43.37, True: 35.63}
Accuracy = 0.535 / Precision = 0.486 / Recall = 0.426

*** RANDOM_FOREST ***   
ACTUAL LABEL COUNTS: {False: 47.57, True: 31.43}
PREDICTED LABEL COUNTS: {False: 49.89, True: 29.11}
Accuracy = 0.561 / Precision = 0.414 / Recall = 0.383

The model performance is similar to that for overall graduation rate. SVC shows the best accuracy at 0.6 correct classification, at the cost of assigning essentially all results to a single class!

Conclusion

The Census variables examined here are not useful predictors of NYC public school outcomes, based on the physical locations of the schools. The Census data were averages of responses gathered from 2009-2014, while the school outcomes were taken from the 2010 cohort (having attended high school from 2006-2010). Although neighborhoods can change significantly over the course of several years, I suspect this temporal distinction is not the main reason for the failure to predict outcomes. It is more likely that the cohorts in NYC school are simply not representative of the areas around those schools, because students oftent do not attend schools near their homes. This contrasts with many other municipalities, where public schools are zoned primarily by geographic proximity.

NOTE: This notebook illustrates school outcome prediction using the 'N=10' data file, where Census variables are averaged for the 10 closest tracts for each school. The analysis has also been performed using N=5, N=20, and N=50. There were no significant differences in outcome for these other averaging parameters.



In [ ]:

Dep. Variable:	GRADUATED_%	R-squared:	0.058
Model:	OLS	Adj. R-squared:	0.040
Method:	Least Squares	F-statistic:	3.150
Date:	Mon, 23 Jan 2017	Prob (F-statistic):	0.00888
Time:	16:34:08	Log-Likelihood:	-363.44
No. Observations:	262	AIC:	738.9
Df Residuals:	256	BIC:	760.3
Df Model:	5
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
RENT_INCOME_RATIO	-0.2164	0.112	-1.932	0.054	-0.437 0.004
FRAC_NONCITIZEN	-0.0133	0.077	-0.174	0.862	-0.164 0.138
MEDIAN_AGE	-0.0170	0.082	-0.206	0.837	-0.179 0.145
FRAC_MINORITY	0.0953	0.068	1.408	0.160	-0.038 0.228
FRAC_MOVED	0.0710	0.085	0.839	0.402	-0.096 0.238
const	-1.091e-14	0.061	-1.8e-13	1.000	-0.119 0.119

Omnibus:	13.069	Durbin-Watson:	1.899
Prob(Omnibus):	0.001	Jarque-Bera (JB):	12.481
Skew:	-0.481	Prob(JB):	0.00195
Kurtosis:	2.534	Cond. No.	3.61

Dep. Variable:	ADV_REGENTS_%_COHORT	R-squared:	0.060
Model:	OLS	Adj. R-squared:	0.042
Method:	Least Squares	F-statistic:	3.277
Date:	Mon, 23 Jan 2017	Prob (F-statistic):	0.00691
Time:	16:34:09	Log-Likelihood:	-363.13
No. Observations:	262	AIC:	738.3
Df Residuals:	256	BIC:	759.7
Df Model:	5
Covariance Type:	nonrobust

Omnibus:	133.806	Durbin-Watson:	2.062
Prob(Omnibus):	0.000	Jarque-Bera (JB):	536.112
Skew:	2.219	Prob(JB):	3.84e-117
Kurtosis:	8.424	Cond. No.	3.61