nyc-schools_C

This script averages the ACS variables for the N census tracts closest to each school, and combines these averaged variables with the school outcomes in a single dataframe (saved as a *.csv)


In [17]:
import pandas as pd
import numpy as np
import os

bp_data = '/Users/bryanfry/projects/proj_nyc-schools/data_files'
n_tracts = 10  # Average ACS variable from 20 closest tracts to each school.

Function to compute the average value of an ACS variable across several census tracts


In [18]:
# Compute average value for ACS var, given a list of geoid.  Ideally perhaps the tracts should
# be weighted by population rather than using a simple mean, but probably results won't be
# much different since the census tracts are intended to have roughly equal populations.

def calc_multitract_var (df_acs, var, geoid_list, mode = 'sum'):
    t = 0 # Total value
    #print geoid_list.tolist()
    for g in geoid_list:
        #print g
        try:
            t = t + float (df_acs[df_acs.GEOID == g][var])
        except: pass
    if mode == 'avg':
        t = t / len (geoid_list)
    return t

MAIN


In [19]:
# Load school data (with 50 closest census tracts), and ACS variables for each tract
df_sch = pd.read_csv (os.path.join (bp_data, 'df_A_school_info.csv'))
df_acs = pd.read_csv (os.path.join (bp_data, 'df_B_acs_geoid.csv'))

# Drop first column of each imported dataframe (these are just redundent indices)
df_sch = df_sch.drop (df_sch.columns[0], axis = 1)
df_acs = df_acs.drop (df_acs.columns[0], axis = 1)

In [20]:
df_acs.head()


Out[20]:
GEOID BOR 2+_RACES ASIAN BLACK DIFFERENT_HOUSE DIFFERENT_HOUSE_ABROAD DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE DIFFERENT_HOUSE_SAME_CITY DIFFERENT_HOUSE_US_DIFFERENT_STATE ... NATIVE_CITIZEN NON_CITIZEN SAME_HOUSE TOTAL_POP? WHITE FRAC_MINORITY RENT_INCOME_RATIO FRAC_MOVED FRAC_NONCITIZEN FRAC_FOREIN_BORN
0 36005000100 BNX 71.0 167.0 4851.0 6710.0 0.0 194.0 6238.0 278.0 ... 6980.0 1081.0 1720.0 8430.0 1184.0 0.604152 1.862186 0.795967 0.128233 0.172005
1 36005000200 BNX 183.0 331.0 1423.0 304.0 1.0 55.0 232.0 17.0 ... 3536.0 558.0 4755.0 5095.0 1606.0 0.381551 0.624364 0.059666 0.109519 0.305986
2 36005000400 BNX 50.0 36.0 1996.0 450.0 0.0 32.0 409.0 9.0 ... 4385.0 383.0 5089.0 5572.0 1674.0 0.376525 0.573522 0.080761 0.068737 0.213029
3 36005001600 BNX 38.0 286.0 1806.0 275.0 69.0 8.0 258.0 9.0 ... 3733.0 744.0 4989.0 5412.0 1855.0 0.393570 0.707188 0.050813 0.137472 0.310237
4 36005001900 BNX 47.0 39.0 814.0 539.0 5.0 14.0 456.0 69.0 ... 1801.0 614.0 1999.0 2569.0 828.0 0.350331 0.894045 0.209809 0.239004 0.298949

5 rows × 25 columns

Now loop on the schools, and average ACS variables across census tracts


In [21]:
# Define a dictionary with the census variables to be added to the dataframe
dict_var = {}
acs_col_list = df_acs.columns[2:]  # These are the census variables of interest

# Loop on the rows of the school file.
for c in acs_col_list:
    dict_var [c] = []   # Make an empty list for each column.  
                        # One element will be added to each list in 
                        # the dictionary for each school# For variables which are either FRACTIONS or MEDIAN VALUES, we take the 
# MEAN across the tracts.  For other values (corresponging to actual number of 
# respondants) we take the SUM.

for i in range (0, len (df_sch)):
    geoid_list= df_sch.ix [i][9:9+n_tracts]
    for i, c in enumerate (acs_col_list):
        if i in [9, 10, 11, 18, 19, 20, 21, 22]: mode = 'avg'
        else: mode = 'sum'
        dict_var[c].append (calc_multitract_var (df_acs, var = c, geoid_list=geoid_list, mode = mode))

df_tract_avg = pd.DataFrame(data = dict_var)   
df_tract_avg.head()


Out[21]:
2+_RACES ASIAN BLACK DIFFERENT_HOUSE DIFFERENT_HOUSE_ABROAD DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE DIFFERENT_HOUSE_SAME_CITY DIFFERENT_HOUSE_US_DIFFERENT_STATE FOREIGN_BORN_INCLUDING_NATURALIZED FRAC_FOREIN_BORN ... MEDIAN_AGE MEDIAN_INCOME MEDIAN_MONTHLY_HOUSING_COSTS NATIVE_AMERICAN NATIVE_CITIZEN NON_CITIZEN RENT_INCOME_RATIO SAME_HOUSE TOTAL_POP? WHITE
0 344.0 893.0 12406.0 1252.0 185.0 50.0 1180.0 22.0 6941.0 0.426406 ... 36.44 25765.4 1343.6 100.0 9036.0 2888.0 0.641336 14214.0 16193.0 1452.0
1 218.0 1358.0 3818.0 1853.0 181.0 49.0 1671.0 133.0 7770.0 0.366035 ... 36.36 25270.2 1155.8 55.0 12243.0 4581.0 0.556566 17749.0 20013.0 8233.0
2 464.0 547.0 7315.0 3422.0 174.0 63.0 3233.0 126.0 8331.0 0.307253 ... 29.28 14537.0 898.0 30.0 17741.0 5358.0 0.765690 22121.0 26072.0 3313.0
3 477.0 774.0 4355.0 4425.0 377.0 477.0 3175.0 773.0 7687.0 0.336043 ... 26.14 11822.4 1063.6 110.0 14542.0 5529.0 1.334316 17105.0 22229.0 6731.0
4 191.0 495.0 10548.0 1188.0 182.0 29.0 1022.0 137.0 4981.0 0.342813 ... 34.52 24536.6 1281.8 154.0 10037.0 2031.0 0.624219 13414.0 15018.0 1054.0

5 rows × 23 columns

Concatenate the tract-averaged data with the school outcome data


In [22]:
df = pd.concat ([df_sch, df_tract_avg], axis = 1)

In [23]:
df.head()


Out[23]:
NAME DBN STREET ZIPCODE LAT LON COUNTY HOOD DISPLAY_NAME GEOCODE00 ... MEDIAN_AGE MEDIAN_INCOME MEDIAN_MONTHLY_HOUSING_COSTS NATIVE_AMERICAN NATIVE_CITIZEN NON_CITIZEN RENT_INCOME_RATIO SAME_HOUSE TOTAL_POP? WHITE
0 Academy for Scholarship and Entrepreneurship: ... 11X270 921 East 228th Street 10466 40.888215 -73.852720 Bronx County Wakefield 921, East 228th Street, Wakefield, Bronx, Bron... 36005040400 ... 36.44 25765.4 1343.6 100.0 9036.0 2888.0 0.641336 14214.0 16193.0 1452.0
1 Astor Collegiate Academy 11X299 925 Astor Avenue 10469 40.859900 -73.860322 Bronx County Morris Park Christopher Columbus High School, 925, Astor A... 36005032400 ... 36.36 25270.2 1155.8 55.0 12243.0 4581.0 0.556566 17749.0 20013.0 8233.0
2 Banana Kelly High School 08X530 965 Longwood Avenue 10459 40.817601 -73.897985 Bronx County Melrose 965, Longwood Avenue, Melrose, Bronx, Bronx Co... 36005008500 ... 29.28 14537.0 898.0 30.0 17741.0 5358.0 0.765690 22121.0 26072.0 3313.0
3 Belmont Preparatory High School 10X434 500 East Fordham Road 10458 40.859840 -73.888295 Bronx County Belmont 500, East Fordham Road, Belmont, Bronx, Bronx ... 36005038700 ... 26.14 11822.4 1063.6 110.0 14542.0 5529.0 1.334316 17105.0 22229.0 6731.0
4 Bronx Academy of Health Careers 11X290 800 East Gun Hill Road 10467 40.875549 -73.861423 Bronx County Williams Bridge 800, East Gun Hill Road, Williams Bridge, Bron... 36005037200 ... 34.52 24536.6 1281.8 154.0 10037.0 2031.0 0.624219 13414.0 15018.0 1054.0

5 rows × 115 columns

Finally clean up some of column names, and eliminate some that will not be used


In [24]:
df_c = pd.DataFrame() # c -> 'concise'

# Build list of columns to copy
c_list = ['NAME','DBN','STREET','ZIPCODE','LAT','LON','COUNTY','HOOD','DISPLAY_NAME']
c_list = c_list + ['GEOCODE' + str (i).zfill(2) for i in range (0, n_tracts)]
c_list = c_list + ['2+_RACES','ASIAN','BLACK','DIFFERENT_HOUSE','DIFFERENT_HOUSE_ABROAD',\
          'DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE','DIFFERENT_HOUSE_SAME_CITY',\
          'DIFFERENT_HOUSE_US_DIFFERENT_STATE','FOREIGN_BORN_INCLUDING_NATURALIZED',\
          'MEDIAN_AGE','MEDIAN_INCOME','MEDIAN_MONTHLY_HOUSING_COSTS','NATIVE_AMERICAN',\
          'NATIVE_CITIZEN','NON_CITIZEN','SAME_HOUSE','TOTAL_POP?','WHITE','FRAC_MINORITY',\
          'RENT_INCOME_RATIO','FRAC_MOVED','FRAC_NONCITIZEN','FRAC_FOREIN_BORN']
for c in c_list: df_c[c] = df[c]


# Copy and rename school outcome data
old_c_list = ['Total Cohort','Total Grads - % of cohort',\
              'Total Regents - % of cohort','Total Regents - % of grads','Advanced Regents - % of cohort',\
              'Advanced Regents - % of grads','Regents w/o Advanced - % of cohort',\
              'Regents w/o Advanced - % of grads','Local - % of cohort','Local - % of grads',\
              'Dropped Out - % of cohort','Q_Total Grads - % of cohort','Q_Total Regents - % of cohort',\
              'Q_Total Regents - % of grads','Q_Advanced Regents - % of cohort',\
              'Q_Advanced Regents - % of grads','Q_Regents w/o Advanced - % of cohort','Q_Local - % of cohort',\
              'Q_Local - % of grads','Q_Still Enrolled - % of cohort','Q_Dropped Out - % of cohort']

new_c_list = ['TOTAL_COHORT','GRADS_%','REGENTS_%_COHORT','REGENTS_%_GRADS'\
              ,'ADV_REGENTS_%_COHORT','ADV_REGENTS_%_GRADS','REG_REGENTS_%_COHORT','REG_REGENTS_%_GRADS'\
              ,'LOCAL_%_COHORT','LOCAL_%_GRADS','DROPPED_OUT_%','Q_GRADS_%',\
              'Q_REGENTS_%_COHORT','Q_REGENTS_%_GRADS','Q_ADV_REGENTS_%_COHORT',\
              'Q_ADV_REGENTS_%_GRADS','Q_REG_REGENTS_%_COHORT','Q_LOCAL_%_COHORT',\
              'Q_LOCAL_%_GRADS','Q_STILL_ENROLLED_%','Q_DROPPED_OUT_%']

for old_c, new_c in zip (old_c_list, new_c_list):
    df_c[new_c] = df[old_c]
    

#There are some empties -- drop rows with NaN
df_c = df_c.dropna()

# Save the 'concise' dataframe
fp_out = os.path.join (bp_data, 'df_C_sch_acs_NTract=' + str (n_tracts).zfill(2) + '.csv')
df_c.to_csv (fp_out)

In [ ]:


In [ ]: