nyc-schools_C

This script averages the ACS variables for the N census tracts closest to each school, and combines these averaged variables with the school outcomes in a single dataframe (saved as a *.csv)



In [17]:

    
import pandas as pd
import numpy as np
import os

bp_data = '/Users/bryanfry/projects/proj_nyc-schools/data_files'
n_tracts = 10  # Average ACS variable from 20 closest tracts to each school.

Function to compute the average value of an ACS variable across several census tracts



In [18]:

    
# Compute average value for ACS var, given a list of geoid.  Ideally perhaps the tracts should
# be weighted by population rather than using a simple mean, but probably results won't be
# much different since the census tracts are intended to have roughly equal populations.

def calc_multitract_var (df_acs, var, geoid_list, mode = 'sum'):
    t = 0 # Total value
    #print geoid_list.tolist()
    for g in geoid_list:
        #print g
        try:
            t = t + float (df_acs[df_acs.GEOID == g][var])
        except: pass
    if mode == 'avg':
        t = t / len (geoid_list)
    return t

MAIN



In [19]:

    
# Load school data (with 50 closest census tracts), and ACS variables for each tract
df_sch = pd.read_csv (os.path.join (bp_data, 'df_A_school_info.csv'))
df_acs = pd.read_csv (os.path.join (bp_data, 'df_B_acs_geoid.csv'))

# Drop first column of each imported dataframe (these are just redundent indices)
df_sch = df_sch.drop (df_sch.columns[0], axis = 1)
df_acs = df_acs.drop (df_acs.columns[0], axis = 1)



In [20]:

    
df_acs.head()









    Out[20]:






  
    
      
      GEOID
      BOR
      2+_RACES
      ASIAN
      BLACK
      DIFFERENT_HOUSE
      DIFFERENT_HOUSE_ABROAD
      DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE
      DIFFERENT_HOUSE_SAME_CITY
      DIFFERENT_HOUSE_US_DIFFERENT_STATE
      ...
      NATIVE_CITIZEN
      NON_CITIZEN
      SAME_HOUSE
      TOTAL_POP?
      WHITE
      FRAC_MINORITY
      RENT_INCOME_RATIO
      FRAC_MOVED
      FRAC_NONCITIZEN
      FRAC_FOREIN_BORN
    
  
  
    
      0
      36005000100
      BNX
      71.0
      167.0
      4851.0
      6710.0
      0.0
      194.0
      6238.0
      278.0
      ...
      6980.0
      1081.0
      1720.0
      8430.0
      1184.0
      0.604152
      1.862186
      0.795967
      0.128233
      0.172005
    
    
      1
      36005000200
      BNX
      183.0
      331.0
      1423.0
      304.0
      1.0
      55.0
      232.0
      17.0
      ...
      3536.0
      558.0
      4755.0
      5095.0
      1606.0
      0.381551
      0.624364
      0.059666
      0.109519
      0.305986
    
    
      2
      36005000400
      BNX
      50.0
      36.0
      1996.0
      450.0
      0.0
      32.0
      409.0
      9.0
      ...
      4385.0
      383.0
      5089.0
      5572.0
      1674.0
      0.376525
      0.573522
      0.080761
      0.068737
      0.213029
    
    
      3
      36005001600
      BNX
      38.0
      286.0
      1806.0
      275.0
      69.0
      8.0
      258.0
      9.0
      ...
      3733.0
      744.0
      4989.0
      5412.0
      1855.0
      0.393570
      0.707188
      0.050813
      0.137472
      0.310237
    
    
      4
      36005001900
      BNX
      47.0
      39.0
      814.0
      539.0
      5.0
      14.0
      456.0
      69.0
      ...
      1801.0
      614.0
      1999.0
      2569.0
      828.0
      0.350331
      0.894045
      0.209809
      0.239004
      0.298949
    
  

5 rows × 25 columns

Now loop on the schools, and average ACS variables across census tracts



In [21]:

    
# Define a dictionary with the census variables to be added to the dataframe
dict_var = {}
acs_col_list = df_acs.columns[2:]  # These are the census variables of interest

# Loop on the rows of the school file.
for c in acs_col_list:
    dict_var [c] = []   # Make an empty list for each column.  
                        # One element will be added to each list in 
                        # the dictionary for each school# For variables which are either FRACTIONS or MEDIAN VALUES, we take the 
# MEAN across the tracts.  For other values (corresponging to actual number of 
# respondants) we take the SUM.

for i in range (0, len (df_sch)):
    geoid_list= df_sch.ix [i][9:9+n_tracts]
    for i, c in enumerate (acs_col_list):
        if i in [9, 10, 11, 18, 19, 20, 21, 22]: mode = 'avg'
        else: mode = 'sum'
        dict_var[c].append (calc_multitract_var (df_acs, var = c, geoid_list=geoid_list, mode = mode))

df_tract_avg = pd.DataFrame(data = dict_var)   
df_tract_avg.head()









    Out[21]:






  
    
      
      2+_RACES
      ASIAN
      BLACK
      DIFFERENT_HOUSE
      DIFFERENT_HOUSE_ABROAD
      DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE
      DIFFERENT_HOUSE_SAME_CITY
      DIFFERENT_HOUSE_US_DIFFERENT_STATE
      FOREIGN_BORN_INCLUDING_NATURALIZED
      FRAC_FOREIN_BORN
      ...
      MEDIAN_AGE
      MEDIAN_INCOME
      MEDIAN_MONTHLY_HOUSING_COSTS
      NATIVE_AMERICAN
      NATIVE_CITIZEN
      NON_CITIZEN
      RENT_INCOME_RATIO
      SAME_HOUSE
      TOTAL_POP?
      WHITE
    
  
  
    
      0
      344.0
      893.0
      12406.0
      1252.0
      185.0
      50.0
      1180.0
      22.0
      6941.0
      0.426406
      ...
      36.44
      25765.4
      1343.6
      100.0
      9036.0
      2888.0
      0.641336
      14214.0
      16193.0
      1452.0
    
    
      1
      218.0
      1358.0
      3818.0
      1853.0
      181.0
      49.0
      1671.0
      133.0
      7770.0
      0.366035
      ...
      36.36
      25270.2
      1155.8
      55.0
      12243.0
      4581.0
      0.556566
      17749.0
      20013.0
      8233.0
    
    
      2
      464.0
      547.0
      7315.0
      3422.0
      174.0
      63.0
      3233.0
      126.0
      8331.0
      0.307253
      ...
      29.28
      14537.0
      898.0
      30.0
      17741.0
      5358.0
      0.765690
      22121.0
      26072.0
      3313.0
    
    
      3
      477.0
      774.0
      4355.0
      4425.0
      377.0
      477.0
      3175.0
      773.0
      7687.0
      0.336043
      ...
      26.14
      11822.4
      1063.6
      110.0
      14542.0
      5529.0
      1.334316
      17105.0
      22229.0
      6731.0
    
    
      4
      191.0
      495.0
      10548.0
      1188.0
      182.0
      29.0
      1022.0
      137.0
      4981.0
      0.342813
      ...
      34.52
      24536.6
      1281.8
      154.0
      10037.0
      2031.0
      0.624219
      13414.0
      15018.0
      1054.0
    
  

5 rows × 23 columns

Concatenate the tract-averaged data with the school outcome data



In [22]:

    
df = pd.concat ([df_sch, df_tract_avg], axis = 1)



In [23]:

    
df.head()









    Out[23]:






  
    
      
      NAME
      DBN
      STREET
      ZIPCODE
      LAT
      LON
      COUNTY
      HOOD
      DISPLAY_NAME
      GEOCODE00
      ...
      MEDIAN_AGE
      MEDIAN_INCOME
      MEDIAN_MONTHLY_HOUSING_COSTS
      NATIVE_AMERICAN
      NATIVE_CITIZEN
      NON_CITIZEN
      RENT_INCOME_RATIO
      SAME_HOUSE
      TOTAL_POP?
      WHITE
    
  
  
    
      0
      Academy for Scholarship and Entrepreneurship: ...
      11X270
      921 East 228th Street
      10466
      40.888215
      -73.852720
      Bronx County
      Wakefield
      921, East 228th Street, Wakefield, Bronx, Bron...
      36005040400
      ...
      36.44
      25765.4
      1343.6
      100.0
      9036.0
      2888.0
      0.641336
      14214.0
      16193.0
      1452.0
    
    
      1
      Astor Collegiate Academy
      11X299
      925 Astor Avenue
      10469
      40.859900
      -73.860322
      Bronx County
      Morris Park
      Christopher Columbus High School, 925, Astor A...
      36005032400
      ...
      36.36
      25270.2
      1155.8
      55.0
      12243.0
      4581.0
      0.556566
      17749.0
      20013.0
      8233.0
    
    
      2
      Banana Kelly High School
      08X530
      965 Longwood Avenue
      10459
      40.817601
      -73.897985
      Bronx County
      Melrose
      965, Longwood Avenue, Melrose, Bronx, Bronx Co...
      36005008500
      ...
      29.28
      14537.0
      898.0
      30.0
      17741.0
      5358.0
      0.765690
      22121.0
      26072.0
      3313.0
    
    
      3
      Belmont Preparatory High School
      10X434
      500 East Fordham Road
      10458
      40.859840
      -73.888295
      Bronx County
      Belmont
      500, East Fordham Road, Belmont, Bronx, Bronx ...
      36005038700
      ...
      26.14
      11822.4
      1063.6
      110.0
      14542.0
      5529.0
      1.334316
      17105.0
      22229.0
      6731.0
    
    
      4
      Bronx Academy of Health Careers
      11X290
      800 East Gun Hill Road
      10467
      40.875549
      -73.861423
      Bronx County
      Williams Bridge
      800, East Gun Hill Road, Williams Bridge, Bron...
      36005037200
      ...
      34.52
      24536.6
      1281.8
      154.0
      10037.0
      2031.0
      0.624219
      13414.0
      15018.0
      1054.0
    
  

5 rows × 115 columns

Finally clean up some of column names, and eliminate some that will not be used



In [24]:

    
df_c = pd.DataFrame() # c -> 'concise'

# Build list of columns to copy
c_list = ['NAME','DBN','STREET','ZIPCODE','LAT','LON','COUNTY','HOOD','DISPLAY_NAME']
c_list = c_list + ['GEOCODE' + str (i).zfill(2) for i in range (0, n_tracts)]
c_list = c_list + ['2+_RACES','ASIAN','BLACK','DIFFERENT_HOUSE','DIFFERENT_HOUSE_ABROAD',\
          'DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE','DIFFERENT_HOUSE_SAME_CITY',\
          'DIFFERENT_HOUSE_US_DIFFERENT_STATE','FOREIGN_BORN_INCLUDING_NATURALIZED',\
          'MEDIAN_AGE','MEDIAN_INCOME','MEDIAN_MONTHLY_HOUSING_COSTS','NATIVE_AMERICAN',\
          'NATIVE_CITIZEN','NON_CITIZEN','SAME_HOUSE','TOTAL_POP?','WHITE','FRAC_MINORITY',\
          'RENT_INCOME_RATIO','FRAC_MOVED','FRAC_NONCITIZEN','FRAC_FOREIN_BORN']
for c in c_list: df_c[c] = df[c]


# Copy and rename school outcome data
old_c_list = ['Total Cohort','Total Grads - % of cohort',\
              'Total Regents - % of cohort','Total Regents - % of grads','Advanced Regents - % of cohort',\
              'Advanced Regents - % of grads','Regents w/o Advanced - % of cohort',\
              'Regents w/o Advanced - % of grads','Local - % of cohort','Local - % of grads',\
              'Dropped Out - % of cohort','Q_Total Grads - % of cohort','Q_Total Regents - % of cohort',\
              'Q_Total Regents - % of grads','Q_Advanced Regents - % of cohort',\
              'Q_Advanced Regents - % of grads','Q_Regents w/o Advanced - % of cohort','Q_Local - % of cohort',\
              'Q_Local - % of grads','Q_Still Enrolled - % of cohort','Q_Dropped Out - % of cohort']

new_c_list = ['TOTAL_COHORT','GRADS_%','REGENTS_%_COHORT','REGENTS_%_GRADS'\
              ,'ADV_REGENTS_%_COHORT','ADV_REGENTS_%_GRADS','REG_REGENTS_%_COHORT','REG_REGENTS_%_GRADS'\
              ,'LOCAL_%_COHORT','LOCAL_%_GRADS','DROPPED_OUT_%','Q_GRADS_%',\
              'Q_REGENTS_%_COHORT','Q_REGENTS_%_GRADS','Q_ADV_REGENTS_%_COHORT',\
              'Q_ADV_REGENTS_%_GRADS','Q_REG_REGENTS_%_COHORT','Q_LOCAL_%_COHORT',\
              'Q_LOCAL_%_GRADS','Q_STILL_ENROLLED_%','Q_DROPPED_OUT_%']

for old_c, new_c in zip (old_c_list, new_c_list):
    df_c[new_c] = df[old_c]
    

#There are some empties -- drop rows with NaN
df_c = df_c.dropna()

# Save the 'concise' dataframe
fp_out = os.path.join (bp_data, 'df_C_sch_acs_NTract=' + str (n_tracts).zfill(2) + '.csv')
df_c.to_csv (fp_out)



In [ ]:



In [ ]:

	GEOID	BOR	2+_RACES	ASIAN	BLACK	DIFFERENT_HOUSE	DIFFERENT_HOUSE_ABROAD	DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE	DIFFERENT_HOUSE_SAME_CITY	DIFFERENT_HOUSE_US_DIFFERENT_STATE	...	NATIVE_CITIZEN	NON_CITIZEN	SAME_HOUSE	TOTAL_POP?	WHITE	FRAC_MINORITY	RENT_INCOME_RATIO	FRAC_MOVED	FRAC_NONCITIZEN	FRAC_FOREIN_BORN
0	36005000100	BNX	71.0	167.0	4851.0	6710.0	0.0	194.0	6238.0	278.0	...	6980.0	1081.0	1720.0	8430.0	1184.0	0.604152	1.862186	0.795967	0.128233	0.172005
1	36005000200	BNX	183.0	331.0	1423.0	304.0	1.0	55.0	232.0	17.0	...	3536.0	558.0	4755.0	5095.0	1606.0	0.381551	0.624364	0.059666	0.109519	0.305986
2	36005000400	BNX	50.0	36.0	1996.0	450.0	0.0	32.0	409.0	9.0	...	4385.0	383.0	5089.0	5572.0	1674.0	0.376525	0.573522	0.080761	0.068737	0.213029
3	36005001600	BNX	38.0	286.0	1806.0	275.0	69.0	8.0	258.0	9.0	...	3733.0	744.0	4989.0	5412.0	1855.0	0.393570	0.707188	0.050813	0.137472	0.310237
4	36005001900	BNX	47.0	39.0	814.0	539.0	5.0	14.0	456.0	69.0	...	1801.0	614.0	1999.0	2569.0	828.0	0.350331	0.894045	0.209809	0.239004	0.298949

	2+_RACES	ASIAN	BLACK	DIFFERENT_HOUSE	DIFFERENT_HOUSE_ABROAD	DIFFERENT_HOUSE_DIFFERENT_CITY_SAME_STATE	DIFFERENT_HOUSE_SAME_CITY	DIFFERENT_HOUSE_US_DIFFERENT_STATE	FOREIGN_BORN_INCLUDING_NATURALIZED	FRAC_FOREIN_BORN	...	MEDIAN_AGE	MEDIAN_INCOME	MEDIAN_MONTHLY_HOUSING_COSTS	NATIVE_AMERICAN	NATIVE_CITIZEN	NON_CITIZEN	RENT_INCOME_RATIO	SAME_HOUSE	TOTAL_POP?	WHITE
0	344.0	893.0	12406.0	1252.0	185.0	50.0	1180.0	22.0	6941.0	0.426406	...	36.44	25765.4	1343.6	100.0	9036.0	2888.0	0.641336	14214.0	16193.0	1452.0
1	218.0	1358.0	3818.0	1853.0	181.0	49.0	1671.0	133.0	7770.0	0.366035	...	36.36	25270.2	1155.8	55.0	12243.0	4581.0	0.556566	17749.0	20013.0	8233.0
2	464.0	547.0	7315.0	3422.0	174.0	63.0	3233.0	126.0	8331.0	0.307253	...	29.28	14537.0	898.0	30.0	17741.0	5358.0	0.765690	22121.0	26072.0	3313.0
3	477.0	774.0	4355.0	4425.0	377.0	477.0	3175.0	773.0	7687.0	0.336043	...	26.14	11822.4	1063.6	110.0	14542.0	5529.0	1.334316	17105.0	22229.0	6731.0
4	191.0	495.0	10548.0	1188.0	182.0	29.0	1022.0	137.0	4981.0	0.342813	...	34.52	24536.6	1281.8	154.0	10037.0	2031.0	0.624219	13414.0	15018.0	1054.0

	NAME	DBN	STREET	ZIPCODE	LAT	LON	COUNTY	HOOD	DISPLAY_NAME	GEOCODE00	...	MEDIAN_AGE	MEDIAN_INCOME	MEDIAN_MONTHLY_HOUSING_COSTS	NATIVE_AMERICAN	NATIVE_CITIZEN	NON_CITIZEN	RENT_INCOME_RATIO	SAME_HOUSE	TOTAL_POP?	WHITE
0	Academy for Scholarship and Entrepreneurship: ...	11X270	921 East 228th Street	10466	40.888215	-73.852720	Bronx County	Wakefield	921, East 228th Street, Wakefield, Bronx, Bron...	36005040400	...	36.44	25765.4	1343.6	100.0	9036.0	2888.0	0.641336	14214.0	16193.0	1452.0
1	Astor Collegiate Academy	11X299	925 Astor Avenue	10469	40.859900	-73.860322	Bronx County	Morris Park	Christopher Columbus High School, 925, Astor A...	36005032400	...	36.36	25270.2	1155.8	55.0	12243.0	4581.0	0.556566	17749.0	20013.0	8233.0
2	Banana Kelly High School	08X530	965 Longwood Avenue	10459	40.817601	-73.897985	Bronx County	Melrose	965, Longwood Avenue, Melrose, Bronx, Bronx Co...	36005008500	...	29.28	14537.0	898.0	30.0	17741.0	5358.0	0.765690	22121.0	26072.0	3313.0
3	Belmont Preparatory High School	10X434	500 East Fordham Road	10458	40.859840	-73.888295	Bronx County	Belmont	500, East Fordham Road, Belmont, Bronx, Bronx ...	36005038700	...	26.14	11822.4	1063.6	110.0	14542.0	5529.0	1.334316	17105.0	22229.0	6731.0
4	Bronx Academy of Health Careers	11X290	800 East Gun Hill Road	10467	40.875549	-73.861423	Bronx County	Williams Bridge	800, East Gun Hill Road, Williams Bridge, Bron...	36005037200	...	34.52	24536.6	1281.8	154.0	10037.0	2031.0	0.624219	13414.0	15018.0	1054.0