In [17]:
import pandas as pd
import numpy as np
import os
bp_data = '/Users/bryanfry/projects/proj_nyc-schools/data_files'
n_tracts = 10 # Average ACS variable from 20 closest tracts to each school.
In [18]:
# Compute average value for ACS var, given a list of geoid. Ideally perhaps the tracts should
# be weighted by population rather than using a simple mean, but probably results won't be
# much different since the census tracts are intended to have roughly equal populations.
def calc_multitract_var (df_acs, var, geoid_list, mode = 'sum'):
t = 0 # Total value
#print geoid_list.tolist()
for g in geoid_list:
#print g
try:
t = t + float (df_acs[df_acs.GEOID == g][var])
except: pass
if mode == 'avg':
t = t / len (geoid_list)
return t
In [19]:
# Load school data (with 50 closest census tracts), and ACS variables for each tract
df_sch = pd.read_csv (os.path.join (bp_data, 'df_A_school_info.csv'))
df_acs = pd.read_csv (os.path.join (bp_data, 'df_B_acs_geoid.csv'))
# Drop first column of each imported dataframe (these are just redundent indices)
df_sch = df_sch.drop (df_sch.columns[0], axis = 1)
df_acs = df_acs.drop (df_acs.columns[0], axis = 1)
In [20]:
df_acs.head()
Out[20]:
In [21]:
# Define a dictionary with the census variables to be added to the dataframe
dict_var = {}
acs_col_list = df_acs.columns[2:] # These are the census variables of interest
# Loop on the rows of the school file.
for c in acs_col_list:
dict_var [c] = [] # Make an empty list for each column.
# One element will be added to each list in
# the dictionary for each school# For variables which are either FRACTIONS or MEDIAN VALUES, we take the
# MEAN across the tracts. For other values (corresponging to actual number of
# respondants) we take the SUM.
for i in range (0, len (df_sch)):
geoid_list= df_sch.ix [i][9:9+n_tracts]
for i, c in enumerate (acs_col_list):
if i in [9, 10, 11, 18, 19, 20, 21, 22]: mode = 'avg'
else: mode = 'sum'
dict_var[c].append (calc_multitract_var (df_acs, var = c, geoid_list=geoid_list, mode = mode))
df_tract_avg = pd.DataFrame(data = dict_var)
df_tract_avg.head()
Out[21]:
In [22]:
df = pd.concat ([df_sch, df_tract_avg], axis = 1)
In [23]:
df.head()
Out[23]:
In [24]:
df_c = pd.DataFrame() # c -> 'concise'
# Build list of columns to copy
c_list = ['NAME','DBN','STREET','ZIPCODE','LAT','LON','COUNTY','HOOD','DISPLAY_NAME']
c_list = c_list + ['GEOCODE' + str (i).zfill(2) for i in range (0, n_tracts)]
c_list = c_list + ['2+_RACES','ASIAN','BLACK','DIFFERENT_HOUSE','DIFFERENT_HOUSE_ABROAD',\
'DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE','DIFFERENT_HOUSE_SAME_CITY',\
'DIFFERENT_HOUSE_US_DIFFERENT_STATE','FOREIGN_BORN_INCLUDING_NATURALIZED',\
'MEDIAN_AGE','MEDIAN_INCOME','MEDIAN_MONTHLY_HOUSING_COSTS','NATIVE_AMERICAN',\
'NATIVE_CITIZEN','NON_CITIZEN','SAME_HOUSE','TOTAL_POP?','WHITE','FRAC_MINORITY',\
'RENT_INCOME_RATIO','FRAC_MOVED','FRAC_NONCITIZEN','FRAC_FOREIN_BORN']
for c in c_list: df_c[c] = df[c]
# Copy and rename school outcome data
old_c_list = ['Total Cohort','Total Grads - % of cohort',\
'Total Regents - % of cohort','Total Regents - % of grads','Advanced Regents - % of cohort',\
'Advanced Regents - % of grads','Regents w/o Advanced - % of cohort',\
'Regents w/o Advanced - % of grads','Local - % of cohort','Local - % of grads',\
'Dropped Out - % of cohort','Q_Total Grads - % of cohort','Q_Total Regents - % of cohort',\
'Q_Total Regents - % of grads','Q_Advanced Regents - % of cohort',\
'Q_Advanced Regents - % of grads','Q_Regents w/o Advanced - % of cohort','Q_Local - % of cohort',\
'Q_Local - % of grads','Q_Still Enrolled - % of cohort','Q_Dropped Out - % of cohort']
new_c_list = ['TOTAL_COHORT','GRADS_%','REGENTS_%_COHORT','REGENTS_%_GRADS'\
,'ADV_REGENTS_%_COHORT','ADV_REGENTS_%_GRADS','REG_REGENTS_%_COHORT','REG_REGENTS_%_GRADS'\
,'LOCAL_%_COHORT','LOCAL_%_GRADS','DROPPED_OUT_%','Q_GRADS_%',\
'Q_REGENTS_%_COHORT','Q_REGENTS_%_GRADS','Q_ADV_REGENTS_%_COHORT',\
'Q_ADV_REGENTS_%_GRADS','Q_REG_REGENTS_%_COHORT','Q_LOCAL_%_COHORT',\
'Q_LOCAL_%_GRADS','Q_STILL_ENROLLED_%','Q_DROPPED_OUT_%']
for old_c, new_c in zip (old_c_list, new_c_list):
df_c[new_c] = df[old_c]
#There are some empties -- drop rows with NaN
df_c = df_c.dropna()
# Save the 'concise' dataframe
fp_out = os.path.join (bp_data, 'df_C_sch_acs_NTract=' + str (n_tracts).zfill(2) + '.csv')
df_c.to_csv (fp_out)
In [ ]:
In [ ]: