Note: I did not make changes to the Pull data file


In [1]:
import geopandas as gp
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point

Borough Code:

1 - Manhattan

2 - Bronx

3 - Brooklyn

4 - Queens

5 - Staten Island


In [3]:
# Load the NYC census-tract polygons from the local GeoJSON copy.
# The URL below is presumably where that file was exported from (verify):
# https://data.cityofnewyork.us/api/geospatial/fxpq-c8ku?method=export&format=GeoJSON
census = gp.read_file('../data/external/census-tracts.geojson')
census.head()


Out[3]:
boro_code boro_ct_2010 boro_name cdeligibil ct_2010 ctlabel geometry ntacode ntaname puma shape_area shape_leng
0 5 5000900 Staten Island I 000900 9 (POLYGON ((-74.07920577013245 40.6434307837456... SI22 West New Brighton-New Brighton-St. George 3903 2497009.69813 7729.01679376
1 5 5007400 Staten Island I 007400 74 (POLYGON ((-74.05974734759452 40.5938486115672... SI14 Grasmere-Arrochar-Ft. Wadsworth 3902 5788237.79601 9902.94847281
2 1 1003200 Manhattan I 003200 32 (POLYGON ((-73.97990650235904 40.7268657730023... MN22 East Village 3809 2334190.23228 6358.38668446
3 1 1009800 Manhattan I 009800 98 (POLYGON ((-73.96432543478758 40.7563815309909... MN19 Turtle Bay-East Midtown 3808 1906016.35002 5534.19981063
4 1 1010000 Manhattan I 010000 100 (POLYGON ((-73.96802436915851 40.7595781400528... MN19 Turtle Bay-East Midtown 3808 1860938.37721 5692.16873705

In [4]:
# Parse the tract and borough identifiers into numeric values so they can be
# used as join keys against the population table. Note that the zero-padded
# tract string (e.g. '000900') becomes a plain number (900).
census['census_tract'] = pd.to_numeric(census['ct_2010'])
census['boro_code'] = pd.to_numeric(census['boro_code'])

In [5]:
# Inspect the column names now that 'census_tract' has been added.
census.columns


Out[5]:
Index(['boro_code', 'boro_ct_2010', 'boro_name', 'cdeligibil', 'ct_2010',
       'ctlabel', 'geometry', 'ntacode', 'ntaname', 'puma', 'shape_area',
       'shape_leng', 'census_tract'],
      dtype='object')

In [6]:
# Remove (in place) the descriptive columns that are not needed for the join;
# what remains is boro_code, census_tract, geometry and the shape measurements.
census.drop(
    [
        'boro_ct_2010', 'boro_name', 'cdeligibil', 'ct_2010',
        'ctlabel', 'ntacode', 'ntaname', 'puma',
    ],
    inplace=True,
    axis=1,
)
census.head()


Out[6]:
boro_code geometry shape_area shape_leng census_tract
0 5 (POLYGON ((-74.07920577013245 40.6434307837456... 2497009.69813 7729.01679376 900
1 5 (POLYGON ((-74.05974734759452 40.5938486115672... 5788237.79601 9902.94847281 7400
2 1 (POLYGON ((-73.97990650235904 40.7268657730023... 2334190.23228 6358.38668446 3200
3 1 (POLYGON ((-73.96432543478758 40.7563815309909... 1906016.35002 5534.19981063 9800
4 1 (POLYGON ((-73.96802436915851 40.7595781400528... 1860938.37721 5692.16873705 10000

In [7]:
# Sanity check: (number of tract rows, number of remaining columns).
census.shape


Out[7]:
(2166, 5)

In [9]:
# Load the per-tract population counts from the local CSV copy.
pop_by_census = pd.read_csv('../data/external/nyc-population-census.csv')
pop_by_census.head()


Out[9]:
Borough Year FIPS County Code DCP Borough Code Census Tract Population
0 Bronx 2000 5 2 100 12780
1 Bronx 2000 5 2 200 3545
2 Bronx 2000 5 2 400 3314
3 Bronx 2000 5 2 1600 5237
4 Bronx 2000 5 2 1900 1584

In [10]:
# Keep only the 2010 census rows. The explicit .copy() makes the result an
# independent frame, so adding a column below does not write into a slice of
# the original table (which triggers pandas' SettingWithCopyWarning).
pop_by_census = pop_by_census[pop_by_census.Year == 2010].copy()
total_pop = pop_by_census['Population'].sum()
# NOTE: 'density' is each tract's share of the total 2010 population
# (tract population / total population), not people per unit area.
pop_by_census['density'] = pop_by_census['Population'] / total_pop
pop_by_census.head()


Out[10]:
Borough Year FIPS County Code DCP Borough Code Census Tract Population density
2168 Bronx 2010 5 2 100 11091 0.001357
2169 Bronx 2010 5 2 200 4334 0.000530
2170 Bronx 2010 5 2 400 5503 0.000673
2171 Bronx 2010 5 2 1600 5643 0.000690
2172 Bronx 2010 5 2 1900 1917 0.000234

In [11]:
# Drop (in place) the columns that are not used from here on; the borough is
# still identified by 'DCP Borough Code'.
pop_by_census.drop(
    ['Borough', 'Year', 'FIPS County Code'],
    inplace=True,
    axis=1,
)
pop_by_census.head()


Out[11]:
DCP Borough Code Census Tract Population density
2168 2 100 11091 0.001357
2169 2 200 4334 0.000530
2170 2 400 5503 0.000673
2171 2 1600 5643 0.000690
2172 2 1900 1917 0.000234

In [12]:
# Give the key columns the same names as in the census GeoDataFrame so the two
# tables can be joined on ('census_tract', 'boro_code').
pop_by_census.rename(
    columns={
        'DCP Borough Code': 'boro_code',
        'Census Tract': 'census_tract',
    },
    inplace=True,
)
pop_by_census.shape


Out[12]:
(2168, 4)

In [13]:
# Attach each tract's population figures to its polygon (inner join: only
# tracts present in both tables are kept).
# pd.merge() with a plain DataFrame on the left returns a plain DataFrame, so
# the result is wrapped in a GeoDataFrame explicitly. That way the CRS is real
# spatial metadata rather than an ad-hoc attribute set on a pandas object, and
# the spatial join further down receives a proper GeoDataFrame.
# The polygons come from `census`, so its CRS is reused.
census_pop = gp.GeoDataFrame(
    pd.merge(pop_by_census, census, on=['census_tract', 'boro_code'], how='inner'),
    geometry='geometry',
    crs=census.crs,
)
census_pop.head()


Out[13]:
boro_code census_tract Population density geometry shape_area shape_leng
0 2 100 11091 0.001357 (POLYGON ((-73.87287195903875 40.7859750278047... 18154596.0081 18903.3467294
1 2 200 4334 0.000530 (POLYGON ((-73.85651604030653 40.8052412204751... 5004821.2311 15591.2827425
2 2 400 5503 0.000673 (POLYGON ((-73.84610660457847 40.8130999892054... 8562150.11049 24707.0790039
3 2 1600 5643 0.000690 (POLYGON ((-73.85513639815333 40.8224361893100... 5221330.06703 9671.30620489
4 2 1900 1917 0.000234 (POLYGON ((-73.89680883223774 40.7958084451597... 17964481.0319 29989.8448165

In [16]:
# Load the Citi Bike station list (id, name, location, coordinates, zip code).
stations = pd.read_csv('../data/processed/stations.csv')
stations.head()


Out[16]:
Station_id Station_Name Location Latitude Longitude Zip
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 10019
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 10013
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 10038
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 11217
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 10011

In [ ]:
# ### Point sjoin
# geometry = gp.GeoSeries([Point(xy) for xy in zip(stations.Longitude, stations.Latitude)])
# point_stations = gp.GeoDataFrame(stations, geometry=geometry)
# point_stations.crs = {'init' :'epsg:4326'}
# point_stations.to_file('geo_stations')
# point_citibike_popdensity = gp.sjoin(point_stations, census_pop, how = 'inner', op = 'intersects')
# point_citibike_popdensity.head()

In [17]:
# Represent every station as a small disc around its (longitude, latitude)
# point so it can be matched against the census tracts it touches.
# NOTE(review): the radius is expressed in degrees because the coordinates are
# lon/lat (EPSG:4326); 0.0005 degrees is not a fixed ground distance - confirm
# this is acceptable.
geometry = gp.GeoSeries(
    [Point(lon, lat) for lon, lat in zip(stations.Longitude, stations.Latitude)]
).buffer(.0005)
geo_stations = gp.GeoDataFrame(stations, geometry=geometry)
geo_stations.crs = {'init' :'epsg:4326'}
geo_stations.head()


Out[17]:
Station_id Station_Name Location Latitude Longitude Zip geometry
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 10019 POLYGON ((-73.99342888 40.76727216, -73.993431...
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 10013 POLYGON ((-74.00616660999999 40.71911552, -74....
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 10038 POLYGON ((-73.99966544999999 40.71117416, -73....
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 11217 POLYGON ((-73.97582328 40.68382604, -73.975825...
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 10011 POLYGON ((-74.00099745999999 40.74177603, -74....

In [18]:
# Spatial join: pair every station buffer with each census tract it
# intersects. A buffer that overlaps several tracts yields several rows.
citibike_popdensity = gp.sjoin(
    left_df=geo_stations,
    right_df=census_pop,
    op='intersects',
    how='inner',
)
citibike_popdensity.head()


Out[18]:
Station_id Station_Name Location Latitude Longitude Zip geometry index_right boro_code census_tract Population density shape_area shape_leng
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 10019 POLYGON ((-73.99342888 40.76727216, -73.993431... 1234 1 13500 6596 0.000807 4513475.18656 15450.2036171
237 480 W 53 St & 10 Ave W 53 St & 10 Ave 40.766697 -73.990617 10019 POLYGON ((-73.99011727999999 40.76669671, -73.... 1234 1 13500 6596 0.000807 4513475.18656 15450.2036171
267 513 W 56 St & 10 Ave W 56 St & 10 Ave 40.768254 -73.988639 10019 POLYGON ((-73.988139 40.768254, -73.9881414076... 1234 1 13500 6596 0.000807 4513475.18656 15450.2036171
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 10013 POLYGON ((-74.00616660999999 40.71911552, -74.... 1132 1 3300 5156 0.000631 3677727.6405 8170.75614772
12 146 Hudson St & Reade St Hudson St & Reade St 40.716250 -74.009106 10013 POLYGON ((-74.00860589999999 40.71625008, -74.... 1132 1 3300 5156 0.000631 3677727.6405 8170.75614772

In [19]:
# Which borough codes appear among the tracts matched to a station buffer?
citibike_popdensity['boro_code'].unique()


Out[19]:
array([1, 3, 4])

In [20]:
# Inspect the columns produced by the spatial join (station + tract fields).
citibike_popdensity.columns


Out[20]:
Index(['Station_id', 'Station_Name', 'Location', 'Latitude', 'Longitude',
       'Zip', 'geometry', 'index_right', 'boro_code', 'census_tract',
       'Population', 'density', 'shape_area', 'shape_leng'],
      dtype='object')

In [21]:
# Remove (in place) the raw coordinate/location columns and the join's
# 'index_right' bookkeeping column.
citibike_popdensity.drop(
    ['Location', 'Latitude', 'Longitude', 'index_right'],
    inplace=True,
    axis=1,
)
citibike_popdensity.head()


Out[21]:
Station_id Station_Name Zip geometry boro_code census_tract Population density shape_area shape_leng
0 72 W 52 St & 11 Ave 10019 POLYGON ((-73.99342888 40.76727216, -73.993431... 1 13500 6596 0.000807 4513475.18656 15450.2036171
237 480 W 53 St & 10 Ave 10019 POLYGON ((-73.99011727999999 40.76669671, -73.... 1 13500 6596 0.000807 4513475.18656 15450.2036171
267 513 W 56 St & 10 Ave 10019 POLYGON ((-73.988139 40.768254, -73.9881414076... 1 13500 6596 0.000807 4513475.18656 15450.2036171
1 79 Franklin St & W Broadway 10013 POLYGON ((-74.00616660999999 40.71911552, -74.... 1 3300 5156 0.000631 3677727.6405 8170.75614772
12 146 Hudson St & Reade St 10013 POLYGON ((-74.00860589999999 40.71625008, -74.... 1 3300 5156 0.000631 3677727.6405 8170.75614772

In [22]:
# Sanity check: (number of station/tract pairs, number of columns).
citibike_popdensity.shape


Out[22]:
(1099, 10)

In [23]:
# Number of census-tract matches per station (count of non-null 'density'
# values), i.e. how many tracts each station buffer intersects.
citibike_popdensity.groupby('Station_id')[['density']].count().head(10)


Out[23]:
density
Station_id
72 1
79 1
82 3
83 3
116 2
119 2
120 3
127 3
128 2
137 2

In [24]:
# Population density around each Citi Bike station: collapse the join to one
# value per (station, borough) by taking the plain, unweighted mean of the
# 'density' of every tract the station's buffer intersects.
# NOTE(review): a buffer spanning two boroughs would yield two rows for the
# same Station_id - confirm whether that can occur.
grouped_data = (
    citibike_popdensity
    .groupby(['Station_id', 'boro_code'])[['density']]
    .mean()
    .reset_index()
)
grouped_data.head()


Out[24]:
Station_id boro_code density
0 72 1 0.000807
1 79 1 0.000631
2 82 1 0.000511
3 83 3 0.000231
4 116 1 0.000742

In [25]:
# Write the per-station averages to disk.
# NOTE(review): with the default arguments the row index is written as an
# extra unnamed first column; confirm whether consumers of the file expect it.
grouped_data.to_csv('../data/processed/pop-density.csv')

In [ ]:
# #Merging with census tract data (0.0005 Buffer)
# merged_citibike_avgdensity = pd.merge(grouped_data, point_citibike_popdensity, on = 'Station_id', how = 'inner' )
# merged_citibike_avgdensity.head()

In [ ]:
# #Merging with census tract data (Point)
# merged_citibike_avgdensity = pd.merge(grouped_data, citibike_popdensity, on = 'Station_id', how = 'inner' )
# merged_citibike_avgdensity.head()

In [ ]:
# merged_citibike_avgdensity.columns

In [ ]:
# # For buffer = 0.0005
# merged_citibike_avgdensity.drop([ u'Station_Name', u'geometry',  u'boro_code_y',    u'Population', \
#             u'density',   u'shape_area',   u'shape_leng', 'census_tract'], axis = 1, inplace  = True)

In [ ]:
#Dropping unnecessary data
# # For point buffer
# merged_citibike_avgdensity.drop([ u'Station_Name', u'geometry',  u'boro_code_y', u'Population', \
#             u'density',   u'shape_area',   u'shape_leng', u'Latitude',    u'Longitude', \
#                             u'index_right',  u'Location', 'census_tract'], axis = 1, inplace = True)

In [ ]:
# merged_citibike_avgdensity.head()

In [ ]:
# #Average population density per citibike stand 
# merged_citibike_avgdensity.rename(columns={'boro_code_x' : 'Borough_code'}, inplace = True)
# merged_citibike_avgdensity.head()

In [ ]:
# merged_citibike_avgdensity = merged_citibike_avgdensity.groupby(['Station_id']).mean()
# merged_citibike_avgdensity.reset_index(inplace=True)

In [ ]:
# merged_citibike_avgdensity.head()

Questions:

  1. Is census tract data a good reference dataset to calculate population density around a citibike station? Do we need a larger or smaller dataset?
  2. Should we also record which census tract(s) each station's population density was calculated from?
  3. Is the value calculated by the formula density = pop_tract/total_population (each tract's share of the total population, which does not take the tract's area into account) a good indicator of population density?