Based on feedback from our TF on 12/7, we attempted several forms of regression to see how well we could predict numerical graduation rate with school district characteristics alone, then with previous years graduation rates, then with school district characteristics and previous years graduation rates.
One opportunity in this space is that the U.S. Department of Education typically delays making graduation rate data available. For instance, it is the 2015-2016 school year, and the most current graduation rate data available is for the year 2009-2010. If we could build a model to predict graduation rate for the years of missing data, organizations that rely on graduation rate data to provide schools services could use this graduation rate approximation until newer graduation rate data becomes available.
For this notebook, we pulled 3 previous years of graduation rate data (2006-2007, 2007-2008, and 2008-2009). First we built regression models using school district data alone and predicting numerical graduation rate, then we built regression models using historic graduation rate data alone and predicting numerical graduation rate, then we built regression models using school district data and historic graduation rate data and predicting numerical graduation rate, and lastly we built a regression model using 2006-2007 school district data and fed it new 2009-2010 school district data to see how well it would predict 2009-2010 numerical graduation rate.
We compared the models using mean squared error, with the lower the mean squared error, the better.
In [24]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib import rcParams
import sklearn
import statsmodels.api as sm
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
In [3]:
grad0607 = pd.read_csv("data/rawdata/districts/prevyears/dr06p1b.txt", dtype=np.str, delim_whitespace=True)
grad0708 = pd.read_csv("data/rawdata/districts/prevyears/dr07p1b.txt", dtype=np.str, delim_whitespace=True)
grad0809 = pd.read_csv("data/rawdata/districts/prevyears/dr08p1a.txt", dtype=np.str, delim_whitespace=True)
In [4]:
grad0607 = grad0607.replace('-1', '')
grad0607 = grad0607.replace('-2', '')
grad0607 = grad0607.replace('-3', '')
grad0607 = grad0607.replace('-4', '')
grad0607 = grad0607.replace('-9', '')
grad0607 = grad0607.replace('-1.0', '')
grad0607 = grad0607.replace('-2.0', '')
grad0607 = grad0607.replace('-3.0', '')
grad0607 = grad0607.replace('-4.0', '')
grad0607 = grad0607.replace('-9.0', '')
grad0607.head()
Out[4]:
In [5]:
grad0708 = grad0708.replace('-1', '')
grad0708 = grad0708.replace('-2', '')
grad0708 = grad0708.replace('-3', '')
grad0708 = grad0708.replace('-4', '')
grad0708 = grad0708.replace('-9', '')
grad0708 = grad0708.replace('-1.0', '')
grad0708 = grad0708.replace('-2.0', '')
grad0708 = grad0708.replace('-3.0', '')
grad0708 = grad0708.replace('-4.0', '')
grad0708 = grad0708.replace('-9.0', '')
grad0708.head()
Out[5]:
In [6]:
grad0809 = grad0809.replace('-1', '')
grad0809 = grad0809.replace('-2', '')
grad0809 = grad0809.replace('-3', '')
grad0809 = grad0809.replace('-4', '')
grad0809 = grad0809.replace('-9', '')
grad0809 = grad0809.replace('-1.0', '')
grad0809 = grad0809.replace('-2.0', '')
grad0809 = grad0809.replace('-3.0', '')
grad0809 = grad0809.replace('-4.0', '')
grad0809 = grad0809.replace('-9.0', '')
grad0809.head()
Out[6]:
In [7]:
grad0809.dtypes
Out[7]:
In [9]:
grad0607=grad0607.replace([np.inf, -np.inf], np.nan)
grad0708=grad0708.replace([np.inf, -np.inf], np.nan)
grad0809=grad0809.replace([np.inf, -np.inf], np.nan)
grad0607['AFGR'] = grad0607['AFGR'].replace('', np.nan)
grad0607['AFGR'] = grad0607['AFGR'].astype(float)
grad0607['LEAID'] = grad0607['LEAID'].astype(int)
grad0607 = grad0607.rename(columns={'LEAID': 'agency_id_nces'})
grad0607 = grad0607.rename(columns={'AFGR': 'gradrate0607'})
grad0607 = grad0607.drop(grad0607.columns[[0, 1, 3, 4, 5, 6, 7, 9]], 1)
grad0708['afgr'] = grad0708['afgr'].replace('', np.nan)
grad0708['afgr'] = grad0708['afgr'].astype(float)
grad0708['leaid'] = grad0708['leaid'].astype(int)
grad0708 = grad0708.rename(columns={'leaid': 'agency_id_nces'})
grad0708 = grad0708.rename(columns={'afgr': 'gradrate0708'})
grad0708 = grad0708.drop(grad0708.columns[[0, 1, 3, 4, 5, 6, 7, 9]], 1)
grad0809['AFGR'] = grad0809['AFGR'].replace('', np.nan)
grad0809['AFGR'] = grad0809['AFGR'].astype(float)
grad0809['LEAID'] = grad0809['LEAID'].astype(int)
grad0809 = grad0809.rename(columns={'LEAID': 'agency_id_nces'})
grad0809 = grad0809.rename(columns={'AFGR': 'gradrate0809'})
grad0809 = grad0809.drop(grad0809.columns[[0, 1, 3, 4, 5, 6, 7, 9]], 1)
grad0809.head()
Out[9]:
In [12]:
filtered = pd.read_csv("data/finaldata/filtered.csv")
print filtered.shape
filtered = filtered.merge(grad0607, 'left', 'agency_id_nces', suffixes=('', '_DEL'))
filtered = filtered.merge(grad0708, 'left', 'agency_id_nces', suffixes=('', '_DEL'))
filtered = filtered.merge(grad0809, 'left', 'agency_id_nces', suffixes=('', '_DEL'))
print filtered.shape
filtered.head()
Out[12]:
In [13]:
print filtered['gradrate0607'].isnull().sum()
print filtered['gradrate0708'].isnull().sum()
print filtered['gradrate0809'].isnull().sum()
In [14]:
filtered.to_csv("data/finaldata/filtered_withprevyears.csv", index=False)
In [15]:
dftouse = filtered.copy(deep=True)
In [16]:
#Drop identification data
dftouse.drop('agency', axis=1, inplace=True)
dftouse.drop('state', axis=1, inplace=True)
dftouse.drop('state_abbr', axis=1, inplace=True)
dftouse.drop('agency_id_nces', axis=1, inplace=True)
dftouse.drop('county', axis=1, inplace=True)
dftouse.drop('county_number', axis=1, inplace=True)
dftouse.drop('report_years', axis=1, inplace=True)
dftouse.drop('no_report_years', axis=1, inplace=True)
dftouse.drop('address', axis=1, inplace=True)
dftouse.drop('city', axis=1, inplace=True)
dftouse.drop('add_state', axis=1, inplace=True)
dftouse.drop('zipcode', axis=1, inplace=True)
dftouse.drop('latitude', axis=1, inplace=True)
dftouse.drop('longitude', axis=1, inplace=True)
dftouse.drop('agency_id_state', axis=1, inplace=True)
dftouse.drop('congressional_code', axis=1, inplace=True)
dftouse.drop('census_id', axis=1, inplace=True)
dftouse.drop('offered_g_lowest', axis=1, inplace=True)
#Drop indicator columns that are all 1 or 0 - found through KDE
dftouse.drop('i_agency_type_sup_union_admin', axis=1, inplace=True)
dftouse.drop('i_agency_type_state_operated_institution', axis=1, inplace=True)
dftouse.drop('i_agency_type_other_education_agency', axis=1, inplace=True)
dftouse.drop('i_fin_sdlc_elem', axis=1, inplace=True)
dftouse.drop('i_fin_sdlc_nonop', axis=1, inplace=True)
dftouse.drop('i_fin_sdlc_ed_serv', axis=1, inplace=True)
dftouse.drop('i_lgo_10', axis=1, inplace=True)
dftouse.drop('i_lgo_11', axis=1, inplace=True)
dftouse.drop('i_lgo_12', axis=1, inplace=True)
dftouse.drop('i_lgo_1', axis=1, inplace=True)
dftouse.drop('i_lgo_2', axis=1, inplace=True)
dftouse.drop('i_lgo_3', axis=1, inplace=True)
dftouse.drop('i_lgo_4', axis=1, inplace=True)
dftouse.drop('i_lgo_5', axis=1, inplace=True)
dftouse.drop('i_lgo_6', axis=1, inplace=True)
dftouse.drop('i_lgo_7', axis=1, inplace=True)
dftouse.drop('i_lgo_8', axis=1, inplace=True)
dftouse.drop('i_lgo_9', axis=1, inplace=True)
dftouse.drop('i_lgo_U', axis=1, inplace=True)
#Drop columns related to graduation rate
dftouse.drop('fipst', axis=1, inplace=True)
dftouse.drop('totd912', axis=1, inplace=True)
dftouse.drop('ebs912', axis=1, inplace=True)
dftouse.drop('drp912', axis=1, inplace=True)
dftouse.drop('totdpl', axis=1, inplace=True)
dftouse.drop('afgeb', axis=1, inplace=True)
dftouse.drop('totohc', axis=1, inplace=True)
In [17]:
#Drop gender/race information for non 12th grade for now
dftouse.drop(['r_stud_reg_PK_AIAN_M','r_stud_reg_PK_AIAN_F','r_stud_reg_PK_AAP_M','r_stud_reg_PK_AAP_F','r_stud_reg_PK_H_M','r_stud_reg_PK_H_F','r_stud_reg_PK_B_M','r_stud_reg_PK_B_F','r_stud_reg_PK_W_M','r_stud_reg_PK_W_F','r_stud_reg_PK_HNPI_M','r_stud_reg_PK_HNPI_F','r_stud_reg_PK_Two_M','r_stud_reg_PK_Two_F','r_stud_reg_K_AIAN_M','r_stud_reg_K_AIAN_F','r_stud_reg_K_AAP_M','r_stud_reg_K_AAP_F','r_stud_reg_K_H_M','r_stud_reg_K_H_F','r_stud_reg_K_B_M','r_stud_reg_K_B_F','r_stud_reg_K_W_M','r_stud_reg_K_W_F','r_stud_reg_K_HNPI_M','r_stud_reg_K_HNPI_F','r_stud_reg_K_Two_M','r_stud_reg_K_Two_F','r_stud_reg_1_AIAN_M','r_stud_reg_1_AIAN_F','r_stud_reg_1_AAP_M','r_stud_reg_1_AAP_F','r_stud_reg_1_H_M','r_stud_reg_1_H_F','r_stud_reg_1_B_M','r_stud_reg_1_B_F','r_stud_reg_1_W_M','r_stud_reg_1_W_F','r_stud_reg_1_HNPI_M','r_stud_reg_1_HNPI_F','r_stud_reg_1_Two_M','r_stud_reg_1_Two_F','r_stud_reg_2_AIAN_M','r_stud_reg_2_AIAN_F','r_stud_reg_2_AAP_M','r_stud_reg_2_AAP_F','r_stud_reg_2_H_M','r_stud_reg_2_H_F','r_stud_reg_2_B_M','r_stud_reg_2_B_F','r_stud_reg_2_W_M','r_stud_reg_2_W_F','r_stud_reg_2_HNPI_M','r_stud_reg_2_HNPI_F','r_stud_reg_2_Two_M','r_stud_reg_2_Two_F','r_stud_reg_3_AIAN_M','r_stud_reg_3_AIAN_F','r_stud_reg_3_AAP_M','r_stud_reg_3_AAP_F','r_stud_reg_3_H_M','r_stud_reg_3_H_F','r_stud_reg_3_B_M','r_stud_reg_3_B_F','r_stud_reg_3_W_M','r_stud_reg_3_W_F','r_stud_reg_3_HNPI_M','r_stud_reg_3_HNPI_F','r_stud_reg_3_Two_M','r_stud_reg_3_Two_F','r_stud_reg_4_AIAN_M','r_stud_reg_4_AIAN_F','r_stud_reg_4_AAP_M','r_stud_reg_4_AAP_F','r_stud_reg_4_H_M','r_stud_reg_4_H_F','r_stud_reg_4_B_M','r_stud_reg_4_B_F','r_stud_reg_4_W_M','r_stud_reg_4_W_F','r_stud_reg_4_HNPI_M','r_stud_reg_4_HNPI_F','r_stud_reg_4_Two_M','r_stud_reg_4_Two_F','r_stud_reg_5_AIAN_M','r_stud_reg_5_AIAN_F','r_stud_reg_5_AAP_M','r_stud_reg_5_AAP_F','r_stud_reg_5_H_M','r_stud_reg_5_H_F','r_stud_reg_5_B_M','r_stud_reg_5_B_F','r_stud_reg_5_W_M','r_stud_reg_5_W_F','r_stud_reg_5_HNPI_M','r_stud_reg_5_HNPI_F','r_stud_reg_5_Two_M','r_stud_reg_5_Two_F','r_stud_reg_6_AIAN_M','r_stud_reg_6_AIAN_F','r_stud_reg_6_AAP_M','r_stud_reg_6_AAP_F','r_stud_reg_6_H_M','r_stud_reg_6_H_F','r_stud_reg_6_B_M','r_stud_reg_6_B_F','r_stud_reg_6_W_M','r_stud_reg_6_W_F','r_stud_reg_6_HNPI_M','r_stud_reg_6_HNPI_F','r_stud_reg_6_Two_M','r_stud_reg_6_Two_F','r_stud_reg_7_AIAN_M','r_stud_reg_7_AIAN_F','r_stud_reg_7_AAP_M','r_stud_reg_7_AAP_F','r_stud_reg_7_H_M','r_stud_reg_7_H_F','r_stud_reg_7_B_M','r_stud_reg_7_B_F','r_stud_reg_7_W_M','r_stud_reg_7_W_F','r_stud_reg_7_HNPI_M','r_stud_reg_7_HNPI_F','r_stud_reg_7_Two_M','r_stud_reg_7_Two_F','r_stud_reg_8_AIAN_M','r_stud_reg_8_AIAN_F','r_stud_reg_8_AAP_M','r_stud_reg_8_AAP_F','r_stud_reg_8_H_M','r_stud_reg_8_H_F','r_stud_reg_8_B_M','r_stud_reg_8_B_F','r_stud_reg_8_W_M','r_stud_reg_8_W_F','r_stud_reg_8_HNPI_M','r_stud_reg_8_HNPI_F','r_stud_reg_8_Two_M','r_stud_reg_8_Two_F','r_stud_reg_9_AIAN_M','r_stud_reg_9_AIAN_F','r_stud_reg_9_AAP_M','r_stud_reg_9_AAP_F','r_stud_reg_9_H_M','r_stud_reg_9_H_F','r_stud_reg_9_B_M','r_stud_reg_9_B_F','r_stud_reg_9_W_M','r_stud_reg_9_W_F','r_stud_reg_9_HNPI_M','r_stud_reg_9_HNPI_F','r_stud_reg_9_Two_M','r_stud_reg_9_Two_F','r_stud_reg_10_AIAN_M','r_stud_reg_10_AIAN_F','r_stud_reg_10_AAP_M','r_stud_reg_10_AAP_F','r_stud_reg_10_H_M','r_stud_reg_10_H_F','r_stud_reg_10_B_M','r_stud_reg_10_B_F','r_stud_reg_10_W_M','r_stud_reg_10_W_F','r_stud_reg_10_HNPI_M','r_stud_reg_10_HNPI_F','r_stud_reg_10_Two_M','r_stud_reg_10_Two_F','r_stud_reg_11_AIAN_M','r_stud_reg_11_AIAN_F','r_stud_reg_11_AAP_M','r_stud_reg_11_AAP_F','r_stud_reg_11_H_M','r_stud_reg_11_H_F','r_stud_reg_11_B_M','r_stud_reg_11_B_F','r_stud_reg_11_W_M','r_stud_reg_11_W_F','r_stud_reg_11_HNPI_M','r_stud_reg_11_HNPI_F','r_stud_reg_11_Two_M','r_stud_reg_11_Two_F','r_stud_reg_U_AIAN_M','r_stud_reg_U_AIAN_F','r_stud_reg_U_AAP_M','r_stud_reg_U_AAP_F','r_stud_reg_U_H_M','r_stud_reg_U_H_F','r_stud_reg_U_B_M','r_stud_reg_U_B_F','r_stud_reg_U_W_M','r_stud_reg_U_W_F','r_stud_reg_U_HNPI_M','r_stud_reg_U_HNPI_F','r_stud_reg_U_Two_M','r_stud_reg_U_Two_F'], axis=1, inplace=True)
#Drop rates of students in PK-12th grade
dftouse.drop(['r_stud_PK', 'r_stud_K', 'r_stud_1', 'r_stud_2', 'r_stud_3', 'r_stud_4', 'r_stud_5', 'r_stud_6', 'r_stud_7', 'r_stud_8', 'r_stud_9', 'r_stud_10', 'r_stud_11', 'r_stud_12', 'r_stud_U'], axis=1, inplace=True)
In [18]:
#This is the fix for replacing NaN with Mean to fix a replace with 0 issue. There is the potential that variability could be reduced, yet we already checked for invalid columns in a prior step due to NaN, so we do not find this to be of great concern.
#This is much better than the previous mistake of replacing with 0.
#https://www.quora.com/Are-we-doing-justice-with-our-data-set-when-we-replace-NaN-values-with-mean-median-or-mode-most-frequent-value
for col in dftouse.columns:
if(dftouse[col].dtype == np.float64):
dftouse[col].fillna(value=np.mean(dftouse[col]), inplace=True)
In [19]:
print dftouse.shape
dftouse.head()
Out[19]:
In [20]:
dftouse.to_csv("data/finaldata/dftouse_withprevyears.csv", index=False)
These are the steps to take the 06-07 info and append the 09-10 grad info.
In [21]:
filtered0607 = pd.read_csv("data/finaldata/filtered_0607.csv")
filtered0607.head()
Out[21]:
In [22]:
filtered = pd.read_csv("data/finaldata/filtered.csv")
print filtered.shape
gradonly = filtered[['agency_id_nces', 'afgr']]
filtered0607 = filtered0607.merge(gradonly, 'left', 'agency_id_nces', suffixes=('', '_DEL'))
filtered0607.head()
Out[22]:
In [23]:
filtered0607 = filtered0607.rename(columns={'afgr_DEL': 'afgr0910'})
filtered0607.to_csv("data/finaldata/dftouse_0607_with0910grad.csv", index=False)
In [25]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Generic classification and optimization functions from cs019 lab7
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# clf - original classifier
# parameters - grid to search over
# X - usually your training X matrix
# y - usually your training y
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
if score_func:
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
else:
gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X, y)
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
best = gs.best_estimator_
return best
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Important parameters
# indf - Input dataframe
# featurenames - vector of names of predictors
# targetname - name of column you want to predict (e.g. 0 or 1, 'M' or 'F',
# 'yes' or 'no')
# target1val - particular value you want to have as a 1 in the target
# mask - boolean vector indicating test set (~mask is training set)
# reuse_split - dictionary that contains traning and testing dataframes
# (we'll use this to test different classifiers on the same
# test-train splits)
# score_func - we've used the accuracy as a way of scoring algorithms but
# this can be more general later on
# n_folds - Number of folds for cross validation ()
# n_jobs - used for parallelization
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5, n_jobs=1):
subdf=indf[featurenames]
X=subdf.values
y=indf[targetname].values
if mask !=None:
print "using mask"
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
if reuse_split !=None:
print "using reuse split"
Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
if parameters:
clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, score_func=score_func)
clf=clf.fit(Xtrain, ytrain)
#training_accuracy = clf.score(Xtrain, ytrain)
#test_accuracy = clf.score(Xtest, ytest)
#print "############# based on standard predict ################"
#print "Accuracy on training data: %0.2f" % (training_accuracy)
#print "Accuracy on test data: %0.2f" % (test_accuracy)
#print confusion_matrix(ytest, clf.predict(Xtest))
#print "########################################################"
return clf, Xtrain, ytrain, Xtest, ytest
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Make ROC Plots
from sklearn.metrics import roc_curve, auc
def make_roc(name, clf, ytest, xtest, ax=None, labe=5, proba=True, skip=0):
initial=False
if not ax:
ax=plt.gca()
initial=True
if proba:
fpr, tpr, thresholds=roc_curve(ytest, clf.predict_proba(xtest)[:,1])
else:
fpr, tpr, thresholds=roc_curve(ytest, clf.predict(xtest))
roc_auc = auc(fpr, tpr)
if skip:
l=fpr.shape[0]
ax.plot(fpr[0:l:skip], tpr[0:l:skip], '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
else:
ax.plot(fpr, tpr, '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
label_kwargs = {}
label_kwargs['bbox'] = dict(
boxstyle='round,pad=0.3', alpha=0.2,
)
if labe!=None:
for k in xrange(0, fpr.shape[0],labe):
#from https://gist.github.com/podshumok/c1d1c9394335d86255b8
threshold = str(np.round(thresholds[k], 2))
ax.annotate(threshold, (fpr[k], tpr[k]), **label_kwargs)
if initial:
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="lower right")
return ax
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
In [26]:
df_yr=pd.read_csv("data/finaldata/dftouse_withprevyears.csv")
print df_yr.shape
df_yr.head()
Out[26]:
In [27]:
# Removing rows containing NaN for the model that will make use of previous years grad rate.
df_yr = df_yr[np.isfinite(df_yr['gradrate0607'])]
df_yr = df_yr[np.isfinite(df_yr['gradrate0708'])]
df_yr = df_yr[np.isfinite(df_yr['gradrate0809'])]
df_yr = df_yr[np.isfinite(df_yr['afgr'])]
df_yr.fillna(value=np.nan, inplace=True)
print df_yr.shape
In [28]:
# Baseline Comparisons
from sklearn.metrics import mean_squared_error
y_0607 = df_yr['gradrate0607'].astype(int)
y_0708 = df_yr['gradrate0708'].astype(int)
y_0809 = df_yr['gradrate0809'].astype(int)
y_0910 = df_yr['afgr'].astype(int)
print "Mean Squared Error : "
rscore = mean_squared_error(y_0910, y_0809)
print "Prediction for 2009-10 using 2009-09 : ", rscore
rscore = mean_squared_error(y_0809, y_0708)
print "Prediction for 2008-09 using 2007-08 : ", rscore
rscore = mean_squared_error(y_0708, y_0607)
print "Prediction for 2007-08 using 2006-07 : ", rscore
colors=sns.color_palette()
p1 = plt.scatter(y_0910, y_0809, alpha=0.2, color=colors[0])
p2 = plt.scatter(y_0809, y_0708, alpha=0.2, color=colors[1])
p3 = plt.scatter(y_0708, y_0607, alpha=0.2, color=colors[2])
plt.xlabel("Grad Rate: Current Year")
plt.ylabel("Grad Rate: Previous Year")
plt.title("Grad Rate: Current Year vs Previous Year")
plt.xlim(0,100)
plt.ylim(0,100)
plt.legend((p1, p2, p3),('2009-10', '2009-08', '2008-07'),
scatterpoints=100,
loc='lower left',
ncol=3,
fontsize=20)
plt.show()
In [205]:
fullprevyears = pd.read_csv("data/finaldata/dftouse_withprevyears.csv")
In [206]:
dftouse = fullprevyears.copy(deep=True)
In [207]:
dftouse.shape
Out[207]:
In [208]:
dftouse.head()
Out[208]:
In [209]:
STANDARDIZABLE = ['num_students', 'num_schools','num_charter_schools','num_pub_schools','tcuresal_percent','pupil_teacher_ratio_dist', 'pupil_teacher_ratio_ps', 'totalrev_pp','tlocrev_pp','tsrev_pp','tfedrev_pp','tcurinst_pp','tcurssv_pp','tcuroth_pp','tcursalary_pp','tcurbenefits_pp','totalexp_pp','tcapout_pp','tnonelse_pp','tcurelsc_pp','instexp_pp','tcurinst_percent','tcuroth_percent','tcurelsc_percent','tcurssvc_percent','tfedrev_percent','tlocrev_percent','tsrev_percent','r_ELL','r_IEP','r_lunch_free','r_lunch_reduced','r_stud_PKK','r_stud_18','r_stud_912','r_stud_re_M','r_stud_re_F','r_stud_re_AIAN','r_stud_re_AAP','r_stud_re_H','r_stud_re_B','r_stud_re_W','r_stud_re_HNPI','r_stud_re_Two','r_stud_re_Total','r_stud_reg_12_AIAN_M','r_stud_reg_12_AIAN_F','r_stud_reg_12_AAP_M','r_stud_reg_12_AAP_F','r_stud_reg_12_H_M','r_stud_reg_12_H_F','r_stud_reg_12_B_M','r_stud_reg_12_B_F','r_stud_reg_12_W_M','r_stud_reg_12_W_F','r_stud_reg_12_HNPI_M','r_stud_reg_12_HNPI_F','r_stud_reg_12_Two_M','r_stud_reg_12_Two_F','r_st_PKT','r_st_KT','r_st_ET','r_st_ST','r_st_UT','r_st_TS','r_st_IA','r_st_IC','r_st_EGC','r_st_SGC','r_st_OGC','r_st_TGC','r_st_LMS','r_st_LMSS','r_st_LEA','r_st_LEASS','r_st_SA','r_st_SASS','r_st_SSSS','r_st_OSSS','r_lrev_pt','r_lrev_gst','r_lrev_put','r_lrev_it','r_lrev_aot','r_lrev_pgc','r_lrev_cc','r_lrev_oss','r_lrev_tui','r_lrev_trans','r_lrev_slr','r_lrev_ts','r_lrev_sar','r_lrev_osalserv','r_lrev_sfns','r_lrev_ie','r_lrev_molr','r_lrev_sp','r_lrev_rr','r_lrev_sale','r_lrev_ff','r_lrev_pc','r_srev_gfa','r_srev_sep','r_srev_trans','r_srev_sip','r_srev_cbsp','r_srev_vep','r_srev_codsp','r_srev_bep','r_srev_gt','r_srev_slp','r_srev_aor','r_srev_splea','r_srev_osp','r_srev_ns','r_frev_title1','r_frev_dis','r_frev_cna','r_frev_ems','r_frev_dfs','r_frev_voc','r_frev_ao','r_frev_ns','r_frev_ia','r_frev_be','r_frev_na','r_frev_aofed']
print STANDARDIZABLE
In [210]:
INDICATORS = []
for v in dftouse.columns:
l=np.unique(dftouse[v])
if len(l) <= 10:
INDICATORS.append(v)
INDICATORS.remove('RESP_High_Graduation')
INDICATORS.remove('RESP_Low_Graduation')
print INDICATORS
In [211]:
#CITATION: From HW3
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(dftouse.shape[0]), train_size=0.7)
In [212]:
#CITATION: From HW3
mask=np.ones(dftouse.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
In [213]:
# make sure we didn't get unlucky in our mask selection
print "% Graduation in Training:", np.mean(dftouse['afgr'][mask])
print "% Graduation in Testing:", np.mean(dftouse['afgr'][~mask])
In [214]:
#CITATION: From HW3
mask.shape, mask.sum()
Out[214]:
In [215]:
#CITATION: From HW3
from sklearn.preprocessing import StandardScaler
for col in STANDARDIZABLE:
#print col
valstrain=dftouse[col].values[mask]
valstest=dftouse[col].values[~mask]
scaler=StandardScaler().fit(valstrain)
outtrain=scaler.transform(valstrain)
outtest=scaler.fit_transform(valstest)
out=np.empty(mask.shape[0])
out[mask]=outtrain
out[~mask]=outtest
dftouse[col]=out
In [216]:
#CITATION: From HW3
lcols=list(dftouse.columns)
lcols.remove('RESP_High_Graduation')
lcols.remove('RESP_Low_Graduation')
lcols.remove('afgr')
lcols.remove('gradrate0607')
lcols.remove('gradrate0708')
lcols.remove('gradrate0809')
###Check for Index Column if exixts in data and remove.
indexcol='Unnamed: 0'
if indexcol in list(dftouse.columns): lcols.remove(indexcol)
print len(lcols)
In [217]:
#CITATION: From HW3
ccols=[]
for c in lcols:
if c not in INDICATORS:
ccols.append(c)
print len(ccols), len(INDICATORS)
ccols
Out[217]:
In [218]:
Xmatrix=dftouse[lcols].values
Yresp=dftouse['afgr'].values
In [219]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
With so many features, we expect the performance of linear regression to be terrible. We include this to justify why we need to use more sophisticated techniques such as lasso and elastic net to select the key features.
In [220]:
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[220]:
In [221]:
print lm.intercept_
Top Factors - Positive
In [222]:
pd.DataFrame(zip(dftouse[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:25]
Out[222]:
Top Factors - Negative
In [223]:
pd.DataFrame(zip(dftouse[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:25]
Out[223]:
In [224]:
from sklearn import cross_validation
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[224]:
In [225]:
from sklearn.metrics import mean_squared_error
ypredict = lm.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Linear Regression : Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: School District Data")
plt.xlim(0,105)
#plt.ylim(0,105)
plt.show()
In [226]:
lasso = linear_model.Lasso()
In [227]:
lasso.fit(Xtrain, ytrain)
Out[227]:
In [228]:
pd.DataFrame(zip(dftouse[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:10]
Out[228]:
In [229]:
pd.DataFrame(zip(dftouse[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:10]
Out[229]:
r2
In [230]:
from sklearn import cross_validation
cross_validation.cross_val_score(lasso, Xtrain, ytrain, scoring='r2')
Out[230]:
In [231]:
ypredict = lasso.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Lasso Regression : Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Lasso Regression: School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
We hit a wall at actual graduation rate of 100%, with some of our predictions varying from 70% to 100%.
In [232]:
elasticnet = linear_model.ElasticNet()
elasticnet.fit(Xtrain, ytrain)
Out[232]:
In [233]:
pd.DataFrame(zip(dftouse[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:20]
Out[233]:
In [234]:
pd.DataFrame(zip(dftouse[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:20]
Out[234]:
In [235]:
cross_validation.cross_val_score(elasticnet, Xtrain, ytrain, scoring='r2')
Out[235]:
In [236]:
ypredict = elasticnet.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Elastic Net : Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Elastic Net Regression: School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
We hit a wall at actual graduation rate of 100% again, with predictions between 70% and 100%. Of our three models of school district data only, elastic net provides the best results.
In [368]:
dfdr=pd.read_csv("data/finaldata/dftouse.csv")
print dfdr.shape
# Create test/train mask
itrain_dr1, itest_dr1 = train_test_split(xrange(dfdr.shape[0]), train_size=0.7)
mask_dr1=np.ones(dfdr.shape[0], dtype='int')
mask_dr1[itrain_dr1]=1
mask_dr1[itest_dr1]=0
mask_dr1 = (mask_dr1==1)
# make sure we didn't get unlucky in our mask selection
print "% Graduation in Training:", np.mean(dfdr['afgr'][mask_dr1])
print "% Graduation in Testing:", np.mean(dfdr['afgr'][~mask_dr1])
# Indicators used : Funding/Expenditure/Location/School Types (no Race)
Xnames2 = [
'pupil_teacher_ratio_dist',
'totalrev_pp',
'tcurinst_pp',
'tcurssv_pp',
'tcursalary_pp',
'tcurbenefits_pp',
'totalexp_pp',
'tcapout_pp',
'tnonelse_pp',
'tcurelsc_pp',
'instexp_pp',
'i_agency_type_local_school_district',
'i_agency_type_local_school_district_sup_union',
'i_agency_type_regional_education_services',
'i_agency_type_charter_school_agency',
'i_fin_sdlc_sec',
'i_fin_sdlc_elem_sec',
'i_fin_sdlc_voc',
'i_ucl_city_large',
'i_ucl_city_mid',
'i_ucl_city_small',
'i_ucl_suburb_large',
'i_ucl_suburb_mid',
'i_ucl_suburb_small',
'i_ucl_town_fringe',
'i_ucl_town_distant',
'i_ucl_town_remote',
'i_ucl_rural_fringe',
'i_ucl_rural_distant',
'i_ucl_rural_remote',
'i_cs_all_charter',
'i_cs_charter_noncharter',
'i_cs_all_noncharter',
'i_ma_ne_nr',
'i_ma_metropolitan',
'i_ma_micropolitan',
'r_ELL',
'r_IEP',
'r_lunch_free',
'r_lunch_reduced'
]
# Target :
# Adding Target column :
dfdr['target'] = dfdr['afgr'].astype(int)
target2 = 'target'
In [370]:
# Decision Trees
from sklearn import tree
clfTree2 = tree.DecisionTreeClassifier()
parameters = {"max_depth": [10, 11, 12, 13, 14, 15], 'min_samples_leaf': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}
clfTree2, Xtrain, ytrain, Xtest, ytest = do_classify(clfTree2, parameters, dfdr,
Xnames2, target2, 1,
mask=mask_dr1, n_jobs = 4, score_func = 'r2')
importance_list = clfTree2.feature_importances_
name_list = dfdr[Xnames2].columns
importance_list, name_list = zip(*sorted(zip(importance_list, name_list))[-15:])
plt.barh(range(len(name_list)),importance_list,align='center')
plt.yticks(range(len(name_list)),name_list)
plt.xlabel('Relative Importance in the Descision Trees 2')
plt.ylabel('Features')
plt.title('Relative importance of Each Feature')
plt.show()
In [371]:
ypredict = clfTree2.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Decision Tree : School District")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [372]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier
clfForest2 = RandomForestClassifier()
parameters = {"n_estimators": range(1, 20)}
clfForest2, Xtrain, ytrain, Xtest, ytest = do_classify(clfForest2, parameters,
dfdr, Xnames2, target2, 1, mask=mask_dr1,
n_jobs = 4, score_func='r2')
importance_list = clfForest2.feature_importances_
name_list = dfdr[Xnames2].columns
importance_list, name_list = zip(*sorted(zip(importance_list, name_list))[-15:])
plt.barh(range(len(name_list)),importance_list,align='center')
plt.yticks(range(len(name_list)),name_list)
plt.xlabel('Relative Importance in the Random Forests')
plt.ylabel('Features')
plt.title('Relative importance of Each Feature')
plt.show()
In [373]:
ypredict = clfForest2.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Random Forest : School District ")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
We now add previous years of graduation data into our models to see if they outperform the model of school district data alone. We expect that adding in previous years of graduation data will outperform the models of school district data alone.
In [238]:
dftouse.fillna(value=0,inplace=True)
As we add in previous years of data, some of those previous years do not have data for some of the school districts. Therefore, the row counts will drop.
In [239]:
dftouse = dftouse[dftouse['gradrate0607']>0]
print 'Has 0607 Grad Data: ' + str(len(dftouse))
dftouse = dftouse[dftouse['gradrate0708']>0]
print 'Has 0708 Grad Data: ' + str(len(dftouse))
dftouse = dftouse[dftouse['gradrate0809']>0]
print 'Has 0809 Grad Data: ' + str(len(dftouse))
In [240]:
dftouse.shape
Out[240]:
In [241]:
gradpred = dftouse.copy(deep=True)
In [242]:
itrain, itest = train_test_split(xrange(dftouse.shape[0]), train_size=0.7)
In [243]:
#CITATION: From HW3
mask=np.ones(dftouse.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
In [244]:
# make sure we didn't get unlucky in our mask selection
print "% Graduation in Training:", np.mean(dftouse['afgr'][mask])
print "% Graduation in Testing:", np.mean(dftouse['afgr'][~mask])
In [245]:
Xmatrix=dftouse[['gradrate0607', 'gradrate0708', 'gradrate0809']].values
Yresp=dftouse['afgr'].values
In [246]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
You have multiple previous years of school district graduation data available to you, and you want to add all of them into the model to predict graduation outcome. This would be like saying, I want to predict school year 2014-2015 graduation rates, and I have 2013-2014, 2012-2013, and 2011-2012 school years of graduation rates available. This is not a realistic scenario for the general public - the latest school year of graduation rate data publically available is 2009-2010. That means there is a 5 year lapse in data availability, which would not make this regression possible to the general public if it we were the year 2009-2010.
In [247]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[247]:
In [248]:
pd.DataFrame(zip(dftouse[['gradrate0607', 'gradrate0708', 'gradrate0809']].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[248]:
In [249]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[249]:
In [250]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[250]:
In [251]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 06-07, 07-08, 08-09 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
This model outperforms all other models created thus far. It hits the same wall at 100% graduation as the other models. As noted earlier, this is not a realistic scenario because the most recent data that would have been available during school year 2009-2010 would have been 2004-2005.
This model assumes you have the two most current previous years of data available to you.
In [252]:
Xmatrix=dftouse[['gradrate0708', 'gradrate0809']].values
Yresp=dftouse['afgr'].values
In [253]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [254]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[254]:
In [255]:
print lm.intercept_
print lm.coef_
In [256]:
pd.DataFrame(zip(dftouse[['gradrate0708', 'gradrate0809']].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[256]:
In [257]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[257]:
In [258]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[258]:
In [259]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 07-08, 08-09 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
This model is less accurate than having 3 years of graduation data available, yet still more accurate than all of our previous models.
This model assumes you have the most recent previous year available.
In [260]:
Xmatrix=dftouse[['gradrate0809']].values
Yresp=dftouse['afgr'].values
In [261]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [262]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[262]:
In [263]:
print lm.intercept_
print lm.coef_
In [264]:
pd.DataFrame(zip(dftouse[['gradrate0809']].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[264]:
In [265]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[265]:
In [266]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[266]:
In [267]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 0809 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
This model performs worse than the two previous. It hits a wall at both actual grad rate and predicted grad rate.
This model assumes two years prior data available only.
In [268]:
Xmatrix=dftouse[['gradrate0708']].values
Yresp=dftouse['afgr'].values
In [269]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [270]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[270]:
In [271]:
print lm.intercept_
print lm.coef_
In [272]:
pd.DataFrame(zip(dftouse[['gradrate0708']].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[272]:
In [273]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[273]:
In [274]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[274]:
In [275]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 0708 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
This model assumes three years prior data available only.
In [276]:
Xmatrix=dftouse[['gradrate0607']].values
Yresp=dftouse['afgr'].values
In [277]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [278]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[278]:
In [279]:
print lm.intercept_
print lm.coef_
In [280]:
pd.DataFrame(zip(dftouse[['gradrate0607']].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[280]:
In [281]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[281]:
In [282]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[282]:
In [283]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 0607 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
If you only have graduation data available for 3 years prior, the performance is the worst. Again, walls at actual and predicted 100% graduation rate. This model overpredicts, telling a school district that it is doing better than it is, which is not a good scenario.
If we married the previous years of graduation data and the school district data, could we create the best model? Is it better than just graduation rate alone?
In [284]:
lcols=list(dftouse.columns)
lcols.remove('RESP_High_Graduation')
lcols.remove('RESP_Low_Graduation')
lcols.remove('afgr')
In [285]:
Xmatrix=dftouse[lcols].values
Yresp=dftouse['afgr'].values
In [286]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [287]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[287]:
In [288]:
pd.DataFrame(zip(dftouse[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:20]
Out[288]:
In [289]:
pd.DataFrame(zip(dftouse[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:20]
Out[289]:
In [290]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[290]:
In [291]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[291]:
In [292]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: School District Data + 0607, 0708, 0809 Grad Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [293]:
lasso = linear_model.Lasso()
In [294]:
lasso.fit(Xtrain, ytrain)
Out[294]:
In [295]:
pd.DataFrame(zip(dftouse[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:10]
Out[295]:
In [296]:
pd.DataFrame(zip(dftouse[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:10]
Out[296]:
In [297]:
cross_validation.cross_val_score(lasso, Xtrain, ytrain, scoring='r2')
Out[297]:
In [298]:
yp = lasso.predict(Xtest)
mean_squared_error(yp, ytest)
Out[298]:
In [299]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Lasso Regression: School District Data + 0607, 0708, 0809 Grad")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [300]:
elasticnet = linear_model.ElasticNet()
elasticnet.fit(Xtrain, ytrain)
Out[300]:
In [301]:
pd.DataFrame(zip(dftouse[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:10]
Out[301]:
In [302]:
pd.DataFrame(zip(dftouse[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:10]
Out[302]:
In [303]:
cross_validation.cross_val_score(elasticnet, Xtrain, ytrain, scoring='r2')
Out[303]:
In [304]:
yp = elasticnet.predict(Xtest)
mean_squared_error(yp, ytest)
Out[304]:
In [305]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Elastic Net Regression: School District Data + 0607, 0708, 0809 Grad")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [306]:
# Create test/train mask
itrain_dr, itest_dr = train_test_split(xrange(df_yr.shape[0]), train_size=0.7)
mask_dr=np.ones(df_yr.shape[0], dtype='int')
mask_dr[itrain_dr]=1
mask_dr[itest_dr]=0
mask_dr = (mask_dr==1)
# Indicators used : Funding/Expenditure/Location/School Types and Race/Sex
Xnames1 = [
'pupil_teacher_ratio_dist',
'totalrev_pp',
'tcurinst_pp',
'tcurssv_pp',
'tcursalary_pp',
'tcurbenefits_pp',
'totalexp_pp',
'tcapout_pp',
'tnonelse_pp',
'tcurelsc_pp',
'instexp_pp',
'i_agency_type_local_school_district',
'i_agency_type_local_school_district_sup_union',
'i_agency_type_regional_education_services',
'i_agency_type_charter_school_agency',
'i_fin_sdlc_sec',
'i_fin_sdlc_elem_sec',
'i_fin_sdlc_voc',
'i_ucl_city_large',
'i_ucl_city_mid',
'i_ucl_city_small',
'i_ucl_suburb_large',
'i_ucl_suburb_mid',
'i_ucl_suburb_small',
'i_ucl_town_fringe',
'i_ucl_town_distant',
'i_ucl_town_remote',
'i_ucl_rural_fringe',
'i_ucl_rural_distant',
'i_ucl_rural_remote',
'i_cs_all_charter',
'i_cs_charter_noncharter',
'i_cs_all_noncharter',
'i_ma_ne_nr',
'i_ma_metropolitan',
'i_ma_micropolitan',
'r_ELL',
'r_IEP',
'r_stud_re_M',
'r_stud_re_F',
'r_stud_re_AIAN',
'r_stud_re_AAP',
'r_stud_re_H',
'r_stud_re_B',
'r_stud_re_W',
'r_stud_re_HNPI',
'r_stud_re_Two',
'gradrate0607',
'gradrate0708',
'gradrate0809',
'r_lunch_free',
'r_lunch_reduced'
]
# Adding Target column :
df_yr['target'] = df_yr['afgr'].astype(int)
target1 = 'target'
In [307]:
# Descision Trees
from sklearn import tree
clfTree1 = tree.DecisionTreeClassifier()
parameters = {"max_depth": [5, 6, 7, 9, 10, 11, 12], 'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]}
clfTree1, Xtrain, ytrain, Xtest, ytest = do_classify(clfTree1, parameters, df_yr,
Xnames1, target1, 1,
mask=mask_dr, n_jobs = 4, score_func = 'r2')
importance_list = clfTree1.feature_importances_
name_list = df_yr[Xnames1].columns
importance_list, name_list = zip(*sorted(zip(importance_list, name_list))[-15:])
plt.barh(range(len(name_list)),importance_list,align='center')
plt.yticks(range(len(name_list)),name_list)
plt.xlabel('Relative Importance in the Decision Trees')
plt.ylabel('Features')
plt.title('Relative importance of Each Feature')
plt.show()
In [308]:
ypredict = clfTree1.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Decision Tree : School District + Previous Years")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [309]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier
clfForest1 = RandomForestClassifier()
parameters = {"n_estimators": range(25, 40)}
clfForest1, Xtrain, ytrain, Xtest, ytest = do_classify(clfForest1, parameters,
df_yr, Xnames1, target1, 1, mask=mask_dr,
n_jobs = 4, score_func='r2')
importance_list = clfForest1.feature_importances_
name_list = df_yr[Xnames1].columns
importance_list, name_list = zip(*sorted(zip(importance_list, name_list))[-15:])
plt.barh(range(len(name_list)),importance_list,align='center')
plt.yticks(range(len(name_list)),name_list)
plt.xlabel('Relative Importance in the Random Forests 1')
plt.ylabel('Features')
plt.title('Relative importance of Each Feature')
plt.show()
In [310]:
ypredict = clfForest1.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Random Forest : School District + Previous Years")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [311]:
lcols=['r_stud_reg_12_W_F', 'r_stud_reg_12_W_M', 'tlocrev_percent', 'r_stud_re_W', 'r_lrev_pt', 'r_lunch_free', 'r_stud_re_B', 'tfedrev_percent', 'r_stud_18', 'tsrev_pp', 'tcuresal_percent', 'r_frev_title1', 'tcurinst_percent', 'r_lrev_molr']
In [312]:
Xmatrix=dftouse[lcols].values
Yresp=dftouse['afgr'].values
In [313]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [314]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[314]:
In [315]:
pd.DataFrame(zip(dftouse[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[315]:
In [316]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[316]:
In [317]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[317]:
In [318]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: Filtered School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
The mean squared error for the 0607 graduation rate only model and the filtered school district model are nearly identical. Therefore, if we only have school district information, its predictive ability is akin to having a graduation rate 3 years back. We could posit that in the scenario of not receiving public graduation data until 5 years later, we could use school district characteristics to predict the graduation rates 3 years in advance until such time that better data is available.
Linear Regression - 0607 Only and Best Model have very similar MSE. The 0607 Only model overpredicts. The Best Model underpredicts. Underprediction is better than overprediction in this scenario - it is disadvantageous if the school district thinks it is performing better than it is.
This brings to mind an interesting scenario. We started this project with 09-10 graduation data because it was the newest data available. Given the similarity in the 0607 Only and Best model, if all we have is school district information, it may be worthwhile checking if we can predict with the same quality as the 0607 Only model. If quality is similar, we might be able to predict 12-13 graduation rates several years before they are released to the public.
We go back to ELSI to download 0607 school district information to test this out.
In [319]:
dftouse0607 = pd.read_csv("data/finaldata/dftouse_0607_with0910grad.csv")
In [320]:
dftouse0607.shape
Out[320]:
In [321]:
STANDARDIZABLE = ['num_students', 'num_schools','num_charter_schools','num_pub_schools','tcuresal_percent','pupil_teacher_ratio_dist', 'pupil_teacher_ratio_ps', 'totalrev_pp','tlocrev_pp','tsrev_pp','tfedrev_pp','tcurinst_pp','tcurssv_pp','tcuroth_pp','tcursalary_pp','tcurbenefits_pp','totalexp_pp','tcapout_pp','tnonelse_pp','tcurelsc_pp','instexp_pp','tcurinst_percent','tcuroth_percent','tcurelsc_percent','tcurssvc_percent','tfedrev_percent','tlocrev_percent','tsrev_percent','r_ELL','r_IEP','r_lunch_free','r_lunch_reduced','r_stud_PKK','r_stud_18','r_stud_912','r_stud_re_M','r_stud_re_F','r_stud_re_AIAN','r_stud_re_AAP','r_stud_re_H','r_stud_re_B','r_stud_re_W','r_stud_re_Total','r_stud_reg_12_AIAN_M','r_stud_reg_12_AIAN_F','r_stud_reg_12_AAP_M','r_stud_reg_12_AAP_F','r_stud_reg_12_H_M','r_stud_reg_12_H_F','r_stud_reg_12_B_M','r_stud_reg_12_B_F','r_stud_reg_12_W_M','r_stud_reg_12_W_F','r_stud_reg_12_HNPI_M','r_stud_reg_12_HNPI_F','r_stud_reg_12_Two_M','r_stud_reg_12_Two_F','r_st_PKT','r_st_KT','r_st_ET','r_st_ST','r_st_UT','r_st_TS','r_st_IA','r_st_IC','r_st_EGC','r_st_SGC','r_st_TGC','r_st_LMS','r_st_LMSS','r_st_LEA','r_st_LEASS','r_st_SA','r_st_SASS','r_st_SSSS','r_st_OSSS','r_lrev_pt','r_lrev_gst','r_lrev_put','r_lrev_it','r_lrev_aot','r_lrev_pgc','r_lrev_cc','r_lrev_oss','r_lrev_tui','r_lrev_trans','r_lrev_slr','r_lrev_ts','r_lrev_sar','r_lrev_osalserv','r_lrev_sfns','r_lrev_ie','r_lrev_molr','r_lrev_sp','r_lrev_rr','r_lrev_sale','r_lrev_ff','r_lrev_pc','r_srev_gfa','r_srev_sep','r_srev_trans','r_srev_sip','r_srev_cbsp','r_srev_vep','r_srev_codsp','r_srev_bep','r_srev_gt','r_srev_slp','r_srev_aor','r_srev_splea','r_srev_osp','r_srev_ns','r_frev_title1','r_frev_dis','r_frev_cna','r_frev_ems','r_frev_dfs','r_frev_voc','r_frev_ao','r_frev_ns','r_frev_ia','r_frev_be','r_frev_na','r_frev_aofed']
print STANDARDIZABLE
In [322]:
INDICATORS = []
for v in dftouse0607.columns:
l=np.unique(dftouse0607[v])
if len(l) <= 10:
INDICATORS.append(v)
INDICATORS.remove('RESP_High_Graduation')
INDICATORS.remove('RESP_Low_Graduation')
INDICATORS.remove('Metro Status Code [District] 2009-10')
INDICATORS.remove('Gender Unknown Students [Public School] 2009-10')
print INDICATORS
In [323]:
itrain, itest = train_test_split(xrange(dftouse0607.shape[0]), train_size=0.7)
In [324]:
#CITATION: From HW3
mask=np.ones(dftouse0607.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
In [325]:
# make sure we didn't get unlucky in our mask selection
print "% Graduation in Training:", np.mean(dftouse0607['afgr'][mask])
print "% Graduation in Testing:", np.mean(dftouse0607['afgr'][~mask])
In [326]:
#CITATION: From HW3
mask
Out[326]:
In [327]:
#CITATION: From HW3
mask.shape, mask.sum()
Out[327]:
In [328]:
dftouse0607.head()
Out[328]:
In [329]:
#CITATION: From HW3
from sklearn.preprocessing import StandardScaler
for col in STANDARDIZABLE:
#print col
valstrain=dftouse0607[col].values[mask]
valstest=dftouse0607[col].values[~mask]
scaler=StandardScaler().fit(valstrain)
outtrain=scaler.transform(valstrain)
outtest=scaler.fit_transform(valstest)
out=np.empty(mask.shape[0])
out[mask]=outtrain
out[~mask]=outtest
dftouse0607[col]=out
In [330]:
dftouse0607.head()
Out[330]:
In [331]:
#CITATION: From HW3
lcols=STANDARDIZABLE+INDICATORS
print len(lcols)
In [332]:
#CITATION: From HW3
ccols=[]
for c in lcols:
if c not in INDICATORS:
ccols.append(c)
print len(ccols), len(INDICATORS)
ccols
Out[332]:
In [333]:
Xmatrix=dftouse0607[lcols].values
Yresp=dftouse0607['afgr'].values
In [334]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
Performance expected to be poor - use as a justification for lasso and elastic net.
In [335]:
import sklearn.linear_model
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[335]:
In [336]:
pd.DataFrame(zip(dftouse0607[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:25]
Out[336]:
In [337]:
pd.DataFrame(zip(dftouse0607[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:25]
Out[337]:
In [338]:
from sklearn import cross_validation
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[338]:
In [339]:
ypredict = lm.predict(Xtest)
rscore = mean_squared_error(ytest, ypredict)
print "Mean Squared Error : ", rscore
plt.scatter(ytest, ypredict, alpha=0.5)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Grad Rate vs Predicted Grad Rate: $Y_i$ vs $\hat{Y}_i$")
plt.xlim(0,105)
#plt.ylim(0,105)
plt.show()
In [340]:
lasso = linear_model.Lasso()
In [341]:
lasso.fit(Xtrain, ytrain)
Out[341]:
In [342]:
pd.DataFrame(zip(dftouse0607[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:10]
Out[342]:
In [343]:
pd.DataFrame(zip(dftouse0607[lcols].columns, lasso.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:10]
Out[343]:
In [344]:
from sklearn import cross_validation
cross_validation.cross_val_score(lasso, Xtrain, ytrain, scoring='r2')
Out[344]:
In [345]:
yp = lasso.predict(Xtest)
mean_squared_error(yp, ytest)
Out[345]:
In [346]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Lasso Regression: 2006-2007 School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [347]:
elasticnet = linear_model.ElasticNet()
elasticnet.fit(Xtrain, ytrain)
Out[347]:
In [348]:
pd.DataFrame(zip(dftouse0607[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=False)[:20]
Out[348]:
In [349]:
pd.DataFrame(zip(dftouse0607[lcols].columns, elasticnet.coef_), columns = ['features', 'estimatedCoefficients']).sort(['estimatedCoefficients'], ascending=True)[:20]
Out[349]:
In [350]:
cross_validation.cross_val_score(elasticnet, Xtrain, ytrain, scoring='r2')
Out[350]:
In [351]:
yp = elasticnet.predict(Xtest)
mean_squared_error(yp, ytest)
Out[351]:
In [352]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Elastic Net Regression: 2006-2007 School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [353]:
lcols=['r_stud_re_W','r_stud_912','r_stud_reg_12_W_M','tlocrev_percent','r_stud_re_Total','r_lunch_free','r_stud_re_B','tfedrev_percent','tfedrev_pp','r_frev_ao','r_stud_re_F','r_stud_reg_12_W_F','r_srev_gt','num_pub_schools']
In [354]:
Xmatrix=dftouse0607[lcols].values
Yresp=dftouse0607['afgr'].values
In [355]:
Xtrain=Xmatrix[mask]
Xtest=Xmatrix[~mask]
ytrain=Yresp[mask]
ytest=Yresp[~mask]
In [356]:
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
Out[356]:
In [357]:
print lm.intercept_
print lm.coef_
In [358]:
pd.DataFrame(zip(dftouse0607[lcols].columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])
Out[358]:
In [359]:
cross_validation.cross_val_score(lm, Xtrain, ytrain, scoring='r2')
Out[359]:
In [360]:
yp = lm.predict(Xtest)
mean_squared_error(yp, ytest)
Out[360]:
In [361]:
plt.scatter(ytest, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: 2006-2007 Filtered School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
In [362]:
ActualX=dftouse[lcols].values
Actualy=dftouse['afgr'].values
In [363]:
yp = lm.predict(ActualX)
mean_squared_error(yp, Actualy)
Out[363]:
In [364]:
plt.scatter(Actualy, yp, alpha=.3)
plt.xlabel("Grad Rate: Actual")
plt.ylabel("Grad Rate: Predicted")
plt.title("Linear Regression: Predicting 2009-2010 Graduation from School District Data")
plt.xlim(0,105)
plt.ylim(0,105)
plt.show()
The mean squared error is comparable to our earlier models. Absent any available graduation rate data for 5 years, one could fill in the missing data using a predictive model until such graduation rate data becomes publically available.