This notebook performs the following functions: it parses school names, DBN codes, and street addresses from the NYC public high school guide; geocodes each school (lat/lon, county, neighborhood) with the free OpenStreetMap Nominatim API; matches each school to its closest census tracts by centroid distance; computes quintiles for the graduation-outcome statistics; plots histograms of those statistics; and saves the combined dataframe to CSV.
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import vincenty
from bs4 import BeautifulSoup
import os, string, requests, re, pickle
%matplotlib inline
bp_data = '/Users/bryanfry/projects/proj_nyc-schools/data_files' #basepath for input files
In [2]:
# Function to find all occurrences of the string 'Contact Information' in the doc.
# The indices of these lines are returned, and serve as starting points to locate
# addresses.
def locate_CI_lines (content):
    CI_line_indices = []
    for i, line in enumerate (content):
        if 'Contact Information' in line: CI_line_indices.append (i)
    return CI_line_indices
# Function to parse the information for a single public high school in the guide.
# Ingests 'content' (all text in the public school section of the guide) and the
# index of the 'Contact Information' line ('CI_line_index') for a single school.
def parse_school_info (content, CI_line_index):
    # Starting at the 'Contact Information' line, look BACKWARDS to find the first
    # previous line containing 'DBN'. This line holds both the school name and the
    # DBN code (the DBN is a unique ID code for the school).
    DBN_line_index = CI_line_index - 1
    while 'DBN' not in content [DBN_line_index]:
        DBN_line_index -= 1
    line = content [DBN_line_index]
    i = line.find ('DBN')
    name = line [0:i-5]
    DBN = line [i+4:-1]
    # Starting at the 'Contact Information' line, locate the next line containing
    # the string 'Address:'. This is not always a fixed number of lines after the
    # DBN line.
    addy_line_index = CI_line_index + 1
    while 'Address' not in content [addy_line_index]:
        addy_line_index += 1
    # Get street address and zipcode
    street = content [addy_line_index][9:-1]
    zipcode = content [addy_line_index+1][-6:-1]
    return name, DBN, street, zipcode
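# Toy sketch of the scanning logic above (the guide lines here are invented;
# the hard-coded slice offsets assume the real guide's spacing, hence the
# explicit 5-space pad between the school name and 'DBN'):
toy_content = ['SOME SCHOOL HS' + ' '*5 + 'DBN 01M001\n',
               'Contact Information\n',
               'Address: 123 Hypothetical St.\n',
               'New York, NY 10001\n']
ci = locate_CI_lines (toy_content)[0]  # -> 1
parse_school_info (toy_content, ci)
# -> ('SOME SCHOOL HS', '01M001', '123 Hypothetical St.', '10001')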
In [3]:
# Function to get the lat / lon and neighborhood, given street address and zipcode
# This uses the free openstreetmap API for geolocation!
# It typically takes ~ 4 min to work on all the schools.
def get_coords_and_hood (street, zipcode):
    zipcode = str (zipcode).zfill(5)
    try:
        street = street.replace (' ','+')
        s = 'http://nominatim.openstreetmap.org/search?format=xml'
        s = s + '&street=' + street
        s = s + '&postalcode=' + zipcode
        s = s + '&addressdetails=1'
        r = requests.get (s)
        soup = BeautifulSoup (r.content, 'lxml')
        lat = float (soup.place.attrs['lat'])
        lon = float (soup.place.attrs['lon'])
        county = soup.place.county.contents[0]
        hood = soup.place.neighbourhood.contents[0]
        display_name = soup.place.attrs['display_name']
    except:
        lat, lon, county, hood, display_name = None, None, None, None, None
    return lat, lon, county, hood, display_name
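# Example call (hypothetical address; requires network access, and the
# fields returned depend on Nominatim's answer for the query):
# get_coords_and_hood ('123 Hypothetical St', 10001)
# -> (lat, lon, county, hood, display_name), or all Nones on any failure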
# Given a specific input geocode and a dictionary of location strings ('tags'), each
# with lat/lon, this function returns the location in the dictionary closest to the
# input geocode. It uses 'vincenty distance' which accounts for global curvature to
# compute distance. (Probably a simpler and faster 'flat earth' calc. would work just as
# well given that all locations are within only a couple degrees.)
def find_closest_loc (loc_dict, loc_query):
    dist_dict = {}
    # Loop over the dictionary entries and compute the distance to each one,
    # populating the new dictionary dist_dict
    for k, loc in loc_dict.items():
        dist_dict[k] = vincenty (loc, loc_query).meters
    min_loc = min (dist_dict, key=dist_dict.get)
    min_dist = dist_dict [min_loc]
    return min_loc, min_dist
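# A minimal sketch of the simpler 'flat earth' calculation suggested above
# (an equirectangular approximation; the helper name is ours and is not used
# below). Over the couple-degree spans involved here it should rank neighbors
# the same as vincenty while being much faster.
import math
def flat_earth_dist_m (loc_a, loc_b):
    R = 6371000.0  # mean Earth radius in meters
    lat_a, lon_a = math.radians (loc_a[0]), math.radians (loc_a[1])
    lat_b, lon_b = math.radians (loc_b[0]), math.radians (loc_b[1])
    x = (lon_b - lon_a) * math.cos (0.5 * (lat_a + lat_b))
    y = lat_b - lat_a
    return R * math.hypot (x, y)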
# Given dictionary of tags and corresponding lat/lon tuples, find N locations in the dictionary
# closest to the input loc_query
def find_closest_n_loc (loc_dict, loc_query, n):
    dist_dict = {}
    for k, loc in loc_dict.items():
        dist_dict[k] = vincenty (loc, loc_query).meters
    loc_sorted = sorted (dist_dict, key=dist_dict.get)
    dist_sorted = sorted (dist_dict.values())
    return loc_sorted[0:n], dist_sorted[0:n]
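# Toy usage sketch for the two helpers above (the tags and coordinates are
# made up; the meter values are approximate and depend on geopy's vincenty):
toy_locs = {'A': (40.73, -73.99), 'B': (40.80, -73.96), 'C': (40.70, -74.01)}
find_closest_loc (toy_locs, (40.735, -73.99))       # -> ('A', ~550 m)
find_closest_n_loc (toy_locs, (40.735, -73.99), 2)  # -> (['A', 'C'], [~550, ~4200] m)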
In [4]:
# Function to remove % from percentage strings, and return a number
def proc_percent (s):
    try:
        return float (s.replace ('%', ''))
    except:
        return np.nan
# Function to add jitter -- this addresses problems of equal min / max for
# bins in the quintile calculation and in plotting histograms.
def add_jitter (x, amp = 0.00001):
    return x + (np.random.random (size = len (x)) * amp)
#########################################
# Split data into quintiles
# NOTE: qcut will return error if data has non-unique edges (ex. more than 20% of data is 0%)
# If qcut throws an error, we bypass this issue by adding a trivial amount of positive
# noise to each value. Cheesy but works fine.
def quantile (column, n_quantiles=5):
    try:
        q = pd.qcut (column, n_quantiles)
    except: # Error -- add a little noise
        column = add_jitter (column)
        q = pd.qcut (column, n_quantiles)
    return [i+1 for i in q.codes]
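# Quick illustration of the qcut failure mode handled above (made-up data;
# exact behavior varies with pandas version): with 40% of the values tied
# at zero, the quintile bin edges are non-unique, so a direct qcut fails
# while the jittered path succeeds.
tied = [0.0] * 40 + list (np.random.random (60))
# pd.qcut (tied, 5)  # raises ValueError: non-unique bin edges
quantile (tied)      # OK -- add_jitter breaks the ties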
########################################
# This function calculates quintiles for a set of desired columns for each
# school. It also combines the outcome dataframe with the nearest-census-tract
# set for each school by merging the two dataframes on DBN.
def combine_data (df_outcomes, df_tract, percent_col_list):
    df = df_outcomes[df_outcomes.Cohort == '2006'] # Limit to the 2006 cohort
    df = pd.merge (df_tract, df, how = 'inner', on = 'DBN') # Perform join on the DBNs
    for c in percent_col_list:
        df [c] = df [c].apply (proc_percent)
    # On each of the 'interesting' percent_col_list columns, compute the quintiles
    for c in percent_col_list:
        c_Q = 'Q_' + c
        df [c_Q] = quantile (df[c].tolist())
    return df
In [5]:
# This wraps the Matplotlib hist function to do NaN-removal, and add
# plot title and axis labels.
def plot_histogram (x, n_bins, title='', x_label='', y_label='', color = None):
    # First, use pandas or numpy to remove NaNs from the data. The
    # presence of NaN may cause the matplotlib histogram to fail.
    try:
        x = x.dropna() # Will work if x is a pandas DataFrame or Series
    except:
        x = np.array (x)[~np.isnan (x)] # Otherwise remove NaNs with numpy
    plt.figure()
    plt.hist (x, bins = n_bins, color = color)
    plt.title (title)
    plt.xlabel (x_label)
    plt.ylabel (y_label)
### TEST HISTOGRAM ###
x = np.random.normal(size = 1000)
plot_histogram (x, 20, 'Test - Dummy','X_Label','Y_Label', 'Maroon')
In [6]:
fp_hs_guide = os.path.join (bp_data, 'NY_Public_High_school_guide_FROM_PDF.txt')
fp_tract_centroids = os.path.join (bp_data, 'tract_centroids.txt')
#fp_hs_addresses = os.path.join (bp_data, 'HS_Addresses.csv')
with open (fp_hs_guide) as f:
    content = f.readlines()
CI_index_list = locate_CI_lines (content) # Find locations of the text 'Contact Information'
# Build list of physical addresses.
# Each list element is a tuple with name, DBN, street address, and zipcode.
school_loc_list = [parse_school_info(content, i) for i in CI_index_list]
In [7]:
school_geocode_list = [get_coords_and_hood (i[2], i[3]) for i in school_loc_list] # ~ 4 min
In [8]:
df_loc = pd.DataFrame()
df_loc['NAME'] = [i[0] for i in school_loc_list]
df_loc['DBN'] = [i[1] for i in school_loc_list]
df_loc['STREET'] = [i[2] for i in school_loc_list]
df_loc['ZIPCODE'] = [i[3] for i in school_loc_list]
df_loc['LAT'] = [i[0] for i in school_geocode_list]
df_loc['LON'] = [i[1] for i in school_geocode_list]
df_loc['COUNTY'] = [i[2] for i in school_geocode_list]
df_loc['HOOD'] = [i[3] for i in school_geocode_list]
df_loc['DISPLAY_NAME'] = [i[4] for i in school_geocode_list]
df_loc = df_loc.dropna()
Next, schools are matched to nearby census tracts. This is done by loading a file from the American Community Survey that contains the centroid (lat/lon) of each census tract. We then calculate the Vincenty distance from each school to the centroid of each tract, and take the tracts with the shortest distances. This method is not exact (it effectively assumes uniformly shaped tracts), but it is a good approximation and should always identify at least a CLOSE census tract. The code takes ~1 min to run (the Vincenty distance calculation is slow).
In [9]:
# Load the file from ACS with centroids for each tract
fp_tract_centroids = os.path.join (bp_data, 'tract_centroids.txt')
df_tracts = pd.read_csv (fp_tract_centroids)
df_tracts = df_tracts.drop ([df_tracts.columns[0]], axis = 1) # Drop first column, unused
# Build dictionary of GEOIDs (keys) and Lat / Lon tuples (values)
tract_dict = {df_tracts.GEOID[i]: (df_tracts.LAT[i], df_tracts.LON[i])
              for i in range (0, len (df_tracts))}
# Assign the n closest tracts to each school, based on centroid distances
tract_list, dist_list = [], [] # Each element in these lists will be another n-element list
n = 50 # Use the 50 closest tract centroids
for lat, lon, name in zip (df_loc.LAT.tolist(), df_loc.LON.tolist(), df_loc.NAME):
    loc_query = [lat, lon]
    tracts, dists = find_closest_n_loc (tract_dict, loc_query, n)
    tract_list.append (tracts) # Append an n-element list to the list-of-lists
    dist_list.append (dists)   # Append an n-element list to the list-of-lists
# Add the GEOIDs of the n closest tracts to the dataframe
tract_array = np.array (tract_list)
for i in range (n):
    df_loc ['GEOCODE' + str(i).zfill(2)] = tract_array[:,i]
df_loc.head()
Out[9]:
In [10]:
# Load file with school outcomes
df_sch_outcomes = pd.read_csv (os.path.join (bp_data, 'Graduation_Outcomes.csv'))
# List of columns given as percentages.
# We will compute quintiles for each of these.
percent_col_list = ['Total Grads - % of cohort',
                    'Total Regents - % of cohort',
                    'Total Regents - % of grads',
                    'Advanced Regents - % of cohort',
                    'Advanced Regents - % of grads',
                    'Regents w/o Advanced - % of cohort',
                    'Regents w/o Advanced - % of grads',
                    'Local - % of cohort',
                    'Local - % of grads',
                    'Still Enrolled - % of cohort',
                    'Dropped Out - % of cohort']
# Expand the dataframe to include quintiles for the 'interesting' school stats
df = combine_data (df_sch_outcomes[df_sch_outcomes.Demographic == 'Total Cohort'],
                   df_loc, percent_col_list)
# Some schools have no data; dropna() gets rid of them
df = df.dropna()
In [11]:
df.to_csv (os.path.join (bp_data, 'df_A_school_info.csv'))
In [12]:
for c in percent_col_list [0:6]:
    plot_histogram (df[c].tolist(), n_bins = 5, title = c,
                    x_label = 'Percent', y_label = '# Schools', color = 'Maroon')
In [13]:
for c in percent_col_list [6:]:
    plot_histogram (df[c].tolist(), n_bins = 5, title = c,
                    x_label = 'Percent', y_label = '# Schools', color = 'Maroon')
In [14]:
df.head()
Out[14]: