In [2]:
import pandas as pd
import numpy as np
import re
data_files = ["ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"]
data = {}
for f in data_files:
d = pd.read_csv("../data/schools/{0}".format(f))
data[f.replace(".csv", "")] = d
In [4]:
all_survey = pd.read_csv("../data/schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("../data/schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_10",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
In [5]:
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
In [6]:
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
In [7]:
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(np.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
In [8]:
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
In [9]:
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
In [10]:
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
In [11]:
correlations = combined.corr()
correlations = correlations["sat_score"]
correlations = correlations.dropna()
correlations.sort_values(ascending=False, inplace=True)
# Interesting correlations tend to have r value > .25 or < -.25
interesting_correlations = correlations[abs(correlations) > 0.25]
print(interesting_correlations)
In [12]:
# Setup Matplotlib to work in Jupyter notebook
%matplotlib inline
import matplotlib.pyplot as plt
In [13]:
# Make a bar plot of the correlations between survey fields and sat_score
correlations[survey_fields].plot.bar(figsize=(9,7))
Out[13]:
From the survey fields, two stand out due to their significant positive correlations:
Why are some possible reasons that N_s and N_p could matter?
In [14]:
# Make a scatterplot of the saf_s_11 column vs the sat-score in combined
combined.plot.scatter(x='sat_score', y='saf_s_11', figsize=(9,5))
Out[14]:
So a high saf_s_11 student safety and respect score doesn't really have any predictive value regarding SAT score. However, a low saf_s_11 has a very strong correlation with low SAT scores.
In [15]:
# Find the average values for each column for each school_dist in combined
districts = combined.groupby('school_dist').agg(np.mean)
# Reset the index of districts, making school_dist a column again
districts.reset_index(inplace=True)
In [16]:
# Make a map that shows afety scores by district
from mpl_toolkits.basemap import Basemap
plt.figure(figsize=(8,8))
# Setup the Matplotlib Basemap centered on New York City
m = Basemap(projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i')
m.drawmapboundary(fill_color='white')
m.drawcoastlines(color='blue', linewidth=.4)
m.drawrivers(color='blue', linewidth=.4)
# Convert the lat and lon columns of districts to lists
longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
# Plot the locations
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True,
c=districts['saf_s_11'], cmap='summer')
# Add colorbar
# add colorbar.
cbar = m.colorbar(location='bottom',pad="5%")
cbar.set_label('saf_s_11')
So it looks like the safest schools are in Manhattan, while the least safe schools are in Brooklyn.
This jives with crime statistics by borough
In [17]:
# Make a plot of the correlations between racial cols and sat_score
race_cols = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
race_corr = correlations[race_cols]
race_corr.plot(kind='bar')
Out[17]:
A higher percentage of white and asian students correlates positively with SAT scores and a higher percentage of black or hispanic students correlates negatively with SAT scores. I wouldn't say any of this is suprising. My guess would be that there is an underlying economic factor which is the cause - white and asian neighborhoods probably have a higher median household income and more well funded schools than black or hispanic neighborhoods.
In [18]:
# Explore schools with low SAT scores and a high hispanic_per
combined.plot.scatter(x='hispanic_per', y='sat_score')
Out[18]:
The above scatterplot shows that a low hispanic percentage isn't particularly predictive of SAT score. However, a high hispanic percentage is highly predictive of a low SAT score.
In [19]:
# Research any schools with a greater than 95% hispanic_per
high_hispanic = combined[combined['hispanic_per'] > 95]
# Find the names of schools from the data
high_hispanic['SCHOOL NAME']
Out[19]:
The above schools appear to contain a lot of international schools focused on recent immigrants who are learning English as a 2nd language. It makes sense that they would have a harder time on the SAT which is given soley in English.
In [20]:
# Research any schools with less than 10% hispanic_per and greater than
# 1800 average SAT score
high_sat_low_hispanic = combined[(combined['hispanic_per'] < 10) &
(combined['sat_score'] > 1800)]
high_sat_low_hispanic['SCHOOL NAME']
Out[20]:
Most of the schools above appear to be specialized science and technology schools which receive extra funding and require students to do well on a standardized test before being admitted. So it is reasonable that students at these schools would have a high average SAT score.
In [21]:
# Investigate gender differences in SAT scores
gender_cols = ['male_per', 'female_per']
gender_corr = correlations[gender_cols]
gender_corr
Out[21]:
In [22]:
# Make a plot of the gender correlations
gender_corr.plot.bar()
Out[22]:
In the plot above, we can see that a high percentage of females at a school positively correlates with SAT score, whereas a high percentage of males at a school negatively correlates with SAT score. Neither correlation is extremely strong.
More data would be required before I was wiling to say that this is a significant effect.
In [23]:
# Investigate schools with high SAT scores and a high female_per
combined.plot.scatter(x='female_per', y='sat_score')
Out[23]:
The above plot appears to show that either very low or very high percentage of females in a school leads to a low average SAT score. However, a percentage in the range 40 to 80 or so can lead to good scores. There doesn't appear to be a strong overall correlation.
In [24]:
# Research any schools with a greater than 60% female_per, and greater
# than 1700 average SAT score.
high_female_high_sat = combined[(combined['female_per'] > 60) &
(combined['sat_score'] > 1700)]
high_female_high_sat['SCHOOL NAME']
Out[24]:
These schools appears to be very selective liberal arts schools that have high academic standards.
The Advanced Placement (AP) exams are exams that high schoolers take in order to gain college credit. AP exams can be taken in many different subjects, and passing the AP exam means that colleges may grant you credits.
It makes sense that the number of students who took the AP exam in a school and SAT scores would be highly correlated. Let's dig into this relationship more.
Since total_enrollment is highly correlated with sat_score, we don't want to bias our results, so we'll instead look at the percentage of students in each school who took at least one AP exam.
In [25]:
# Compute the percentage of students in each school that took the AP exam
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
In [26]:
# Investigate the relationship between AP scores and SAT scores
combined.plot.scatter(x='ap_per', y='sat_score')
Out[26]:
It looks like there is a relationship between the percentage of students in a school who take the AP exam, and their average SAT scores. It's not an extremely strong correlation, though.
I"m really surprised this relationship isn't stronger. This is rather counter-intuitive.
There is still quite a bit of analysis left to do. Here are some potential next steps
In [ ]: