Load all datasets into dataframes


In [391]:
import pandas as pd
import numpy as np

Stations


In [392]:
stations = pd.read_csv("../data/processed/stations.csv")

In [393]:
stations.head()


Out[393]:
Station_id Station_Name Location Latitude Longitude
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497

In [394]:
len(stations)


Out[394]:
664

Ridership


In [395]:
ridership = pd.read_csv("../data/processed/ridership.csv", index_col=0)

In [396]:
ridership.head()


Out[396]:
Station_id count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015
0 72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 2323.0 1648.0 2131.615385
1 79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 2074.0 1579.0 1760.538462
2 82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 819.0 713.0 766.538462
3 83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 886.0 717.0 863.307692
4 116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 4455.0 3686.0 3576.692308

In [397]:
len(ridership)


Out[397]:
488

Bike Lanes


In [398]:
bike_lanes = pd.read_csv("../data/processed/bike-lane-quality.csv", index_col=0)

In [399]:
bike_lanes.head()


Out[399]:
Station_id metric
0 79 3.0
1 116 4.0
2 119 4.0
3 120 4.0
4 127 4.0

In [400]:
len(bike_lanes)


Out[400]:
352

Parks


In [401]:
parks = pd.read_csv("../data/processed/parks.csv", 
                    usecols=['Station_id', 'signname', 'park'])

In [402]:
parks.head()


Out[402]:
Station_id signname park
0 72 De Witt Clinton Park 1
1 82 James Madison Plaza 1
2 116 Dr. Gertrude B. Kelly Playground 1
3 119 Commodore Barry Park 1
4 128 Father Fagan Park 1

In [403]:
len(parks)


Out[403]:
231

In [404]:
len(parks.signname.unique())


Out[404]:
153

In [405]:
parks = parks.groupby(['Station_id']).sum()
parks.reset_index(inplace=True)

In [406]:
parks.head()


Out[406]:
Station_id park
0 72 1
1 82 1
2 116 1
3 119 1
4 128 1

In [407]:
parks.describe()


Out[407]:
Station_id park
count 194.000000 194.000000
mean 1896.402062 1.190722
std 1435.396904 0.477164
min 72.000000 1.000000
25% 359.500000 1.000000
50% 3061.000000 1.000000
75% 3238.500000 1.000000
max 3437.000000 4.000000

Street Quality


In [408]:
street_qual = pd.read_csv("../data/processed/street-assessment.csv", index_col=0)

In [409]:
street_qual.head()


Out[409]:
Station_id Rating_B
0 72 8.000000
1 79 8.571429
2 82 7.333333
3 83 7.500000
4 116 8.500000

In [410]:
len(street_qual)


Out[410]:
604

Subway Entrance


In [411]:
subway_ent = pd.read_csv("../data/processed/subway-entrances.csv", usecols=['Station_id', 'line', 'subway_entrance'])

In [412]:
subway_ent.head()


Out[412]:
Station_id line subway_entrance
0 79 1 1
1 151 4-6-6 Express 1
2 223 1-2-3-L-F-M 1
3 257 4-6-6 Express-J-Z-N-Q-R 1
4 267 B-D-F-M-N-Q-R 1

In [413]:
len(subway_ent)


Out[413]:
73

Trees


In [414]:
trees = pd.read_csv("../data/processed/tree-canopy.csv")

In [415]:
trees.head()


Out[415]:
Station_id score score_mean tree_count station_id
0 72 17.364799 2.480686 7 72
1 79 9.573955 3.191318 3 79
2 82 35.070325 3.188211 11 82
3 83 0.000000 0.000000 0 83
4 116 47.824344 3.188290 15 116

In [416]:
len(trees)


Out[416]:
664

Traffic Volume


In [417]:
traf_vol = pd.read_csv("../data/processed/traffic-volume.csv", index_col=0)

In [418]:
traf_vol.head()


Out[418]:
Station_id AADT
0 72 14870.500000
1 79 9484.666667
2 82 16812.500000
3 83 41976.000000
4 116 15948.000000

In [419]:
len(traf_vol)


Out[419]:
535

Income


In [420]:
income = pd.read_csv('../data/processed/income.csv', usecols=['Station_id', 'Median_Househould_Income'])

In [421]:
income.head()


Out[421]:
Station_id Median_Househould_Income
0 72 90,174
1 79 NaN
2 82 73,988
3 83 85,199
4 116 104,974

In [422]:
income['Median_Househould_Income'] = income['Median_Househould_Income'].str.replace('[-,+]', '')
income = income.replace('',np.nan)

In [423]:
income['Median_Househould_Income'].unique()


Out[423]:
array(['90174', nan, '73988', '85199', '104974', '47050', '115604',
       '98775', '245556', '61154', '95577', '250000', '103960', '109557',
       '102165', '97524', '97228', '123409', '32966', '100298', '69491',
       '121840', '45566', '143636', '81445', '100763', '70625', '50943',
       '111265', '121316', '34804', '34432', '39785', '30686', '66610',
       '48509', '128693', '110163', '109398', '115485', '104583', '109784',
       '172926', '100649', '71865', '31446', '99040', '83383', '47318',
       '60135'], dtype=object)

In [424]:
income.Median_Househould_Income = income['Median_Househould_Income'].astype('float')

In [425]:
len(income)


Out[425]:
664

Population Density


In [426]:
pop_density = pd.read_csv('../data/processed/pop_density.csv', usecols=['Station_id', 'density'])

In [427]:
pop_density.head()


Out[427]:
Station_id density
0 72 0.000807
1 79 0.000631
2 82 0.000511
3 83 0.000231
4 116 0.000742

In [428]:
len(pop_density)


Out[428]:
589

Merge datasets

Stations + Ridership


In [429]:
sta_ri = pd.merge(stations, ridership, on='Station_id', how='inner')
sta_ri.head()


Out[429]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 2323.0 1648.0 2131.615385
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 2074.0 1579.0 1760.538462
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 819.0 713.0 766.538462
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 886.0 717.0 863.307692
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 4455.0 3686.0 3576.692308

In [430]:
len(sta_ri)


Out[430]:
452

In [431]:
sta_ri.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[431]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 390.000000 426.000000 438.000000 445.000000 449.000000 452.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 2799.902564 2809.215962 2616.004566 2103.451685 1698.944321 1957.844394
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 2427.176146 2256.800195 2064.072542 1676.621238 1366.409411 1288.669426
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 15.000000 8.000000 10.000000 20.000000 13.000000 152.076923
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 874.333333
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1694.600000
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2727.407692
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 12611.000000 14063.000000 11340.000000 11267.000000 8843.000000 8102.461538

Stations + Ridership + Bike Lanes


In [432]:
sta_ri_bi = pd.merge(sta_ri, bike_lanes, on='Station_id', how='left')
sta_ri_bi.head()


Out[432]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 2323.0 1648.0 2131.615385 NaN
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 2074.0 1579.0 1760.538462 3.0
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 819.0 713.0 766.538462 NaN
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 886.0 717.0 863.307692 NaN
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 4455.0 3686.0 3576.692308 4.0

In [433]:
len(sta_ri_bi)


Out[433]:
452

In [434]:
sta_ri_bi.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[434]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 390.000000 426.000000 438.000000 445.000000 449.000000 452.000000 254.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 2799.902564 2809.215962 2616.004566 2103.451685 1698.944321 1957.844394 3.148046
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 2427.176146 2256.800195 2064.072542 1676.621238 1366.409411 1288.669426 0.906666
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 15.000000 8.000000 10.000000 20.000000 13.000000 152.076923 1.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 874.333333 NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1694.600000 NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2727.407692 NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 12611.000000 14063.000000 11340.000000 11267.000000 8843.000000 8102.461538 5.000000

In [435]:
sta_ri_bi.metric.isnull().sum()


Out[435]:
198

Stations + Ridership + Bike Lanes + Parks


In [436]:
sta_ri_bi_pa = pd.merge(sta_ri_bi, parks, on='Station_id', how='left')
sta_ri_bi_pa.head()


Out[436]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 2323.0 1648.0 2131.615385 NaN 1.0
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 2074.0 1579.0 1760.538462 3.0 NaN
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 819.0 713.0 766.538462 NaN 1.0
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 886.0 717.0 863.307692 NaN NaN
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 4455.0 3686.0 3576.692308 4.0 1.0

In [437]:
len(sta_ri_bi_pa)


Out[437]:
452

In [438]:
sta_ri_bi_pa.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[438]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 390.000000 426.000000 438.000000 445.000000 449.000000 452.000000 254.000000 146.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 2799.902564 2809.215962 2616.004566 2103.451685 1698.944321 1957.844394 3.148046 1.219178
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 2427.176146 2256.800195 2064.072542 1676.621238 1366.409411 1288.669426 0.906666 0.518528
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 15.000000 8.000000 10.000000 20.000000 13.000000 152.076923 1.000000 1.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 874.333333 NaN NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1694.600000 NaN NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2727.407692 NaN NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 12611.000000 14063.000000 11340.000000 11267.000000 8843.000000 8102.461538 5.000000 4.000000

In [439]:
sta_ri_bi_pa.park.isnull().sum()


Out[439]:
306

Stations + Ridership + Bike Lanes + Parks + Street Quality


In [440]:
sta_ri_bi_pa_str = pd.merge(sta_ri_bi_pa, street_qual, on='Station_id', how='left')
sta_ri_bi_pa_str.head()


Out[440]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park Rating_B
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... 3149.0 3504.0 3667.0 3546.0 2323.0 1648.0 2131.615385 NaN 1.0 8.000000
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... 2626.0 2726.0 3011.0 2646.0 2074.0 1579.0 1760.538462 3.0 NaN 8.571429
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... 1193.0 1145.0 1166.0 1053.0 819.0 713.0 766.538462 NaN 1.0 7.333333
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... 1659.0 1724.0 1505.0 1104.0 886.0 717.0 863.307692 NaN NaN 7.500000
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 5674.0 6175.0 6558.0 5825.0 4455.0 3686.0 3576.692308 4.0 1.0 8.500000

5 rows × 21 columns


In [441]:
len(sta_ri_bi_pa_str)


Out[441]:
452

In [442]:
sta_ri_bi_pa_str.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[442]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park Rating_B
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 390.000000 426.000000 438.000000 445.000000 449.000000 452.000000 254.000000 146.000000 444.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 2799.902564 2809.215962 2616.004566 2103.451685 1698.944321 1957.844394 3.148046 1.219178 7.430723
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 2427.176146 2256.800195 2064.072542 1676.621238 1366.409411 1288.669426 0.906666 0.518528 0.726463
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 15.000000 8.000000 10.000000 20.000000 13.000000 152.076923 1.000000 1.000000 4.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 874.333333 NaN NaN NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1694.600000 NaN NaN NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2727.407692 NaN NaN NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 12611.000000 14063.000000 11340.000000 11267.000000 8843.000000 8102.461538 5.000000 4.000000 9.111111

In [443]:
sta_ri_bi_pa_str.Rating_B.isnull().sum()


Out[443]:
8

Stations + Ridership + Bike Lanes + Parks + Street Quality + Subway Entrances


In [444]:
sta_ri_bi_pa_str_sub = pd.merge(sta_ri_bi_pa_str, subway_ent, on='Station_id', how='left')
sta_ri_bi_pa_str_sub.head()


Out[444]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park Rating_B line subway_entrance
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... 3667.0 3546.0 2323.0 1648.0 2131.615385 NaN 1.0 8.000000 NaN NaN
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... 3011.0 2646.0 2074.0 1579.0 1760.538462 3.0 NaN 8.571429 1 1.0
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... 1166.0 1053.0 819.0 713.0 766.538462 NaN 1.0 7.333333 NaN NaN
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... 1505.0 1104.0 886.0 717.0 863.307692 NaN NaN 7.500000 NaN NaN
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 6558.0 5825.0 4455.0 3686.0 3576.692308 4.0 1.0 8.500000 NaN NaN

5 rows × 23 columns


In [445]:
len(sta_ri_bi_pa_str_sub)


Out[445]:
452

In [446]:
sta_ri_bi_pa_str_sub.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[446]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 count of rides 201510 count of rides 201511 count of rides 201512 average station ridership 2015 metric park Rating_B subway_entrance
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 390.000000 426.000000 438.000000 445.000000 449.000000 452.000000 254.000000 146.000000 444.000000 58.0
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 2799.902564 2809.215962 2616.004566 2103.451685 1698.944321 1957.844394 3.148046 1.219178 7.430723 1.0
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 2427.176146 2256.800195 2064.072542 1676.621238 1366.409411 1288.669426 0.906666 0.518528 0.726463 0.0
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 15.000000 8.000000 10.000000 20.000000 13.000000 152.076923 1.000000 1.000000 4.000000 1.0
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 874.333333 NaN NaN NaN NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1694.600000 NaN NaN NaN NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2727.407692 NaN NaN NaN NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 12611.000000 14063.000000 11340.000000 11267.000000 8843.000000 8102.461538 5.000000 4.000000 9.111111 1.0

In [447]:
sta_ri_bi_pa_str_sub.subway_entrance.isnull().sum()


Out[447]:
394

Stations + Ridership + Bike Lanes + Parks + Street Quality + Subway Entrances + Trees


In [448]:
sta_ri_bi_pa_str_sub_tr = pd.merge(sta_ri_bi_pa_str_sub, trees, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr.head()


Out[448]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... average station ridership 2015 metric park Rating_B line subway_entrance score score_mean tree_count station_id
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... 2131.615385 NaN 1.0 8.000000 NaN NaN 17.364799 2.480686 7 72
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... 1760.538462 3.0 NaN 8.571429 1 1.0 9.573955 3.191318 3 79
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... 766.538462 NaN 1.0 7.333333 NaN NaN 35.070325 3.188211 11 82
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... 863.307692 NaN NaN 7.500000 NaN NaN 0.000000 0.000000 0 83
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 3576.692308 4.0 1.0 8.500000 NaN NaN 47.824344 3.188290 15 116

5 rows × 27 columns


In [449]:
len(sta_ri_bi_pa_str_sub_tr)


Out[449]:
452

In [450]:
sta_ri_bi_pa_str_sub_tr.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[450]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 ... count of rides 201512 average station ridership 2015 metric park Rating_B subway_entrance score score_mean tree_count station_id
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 ... 449.000000 452.000000 254.000000 146.000000 444.000000 58.0 452.000000 452.000000 452.000000 452.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 ... 1698.944321 1957.844394 3.148046 1.219178 7.430723 1.0 29.709765 2.531139 10.232301 1292.373894
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 ... 1366.409411 1288.669426 0.906666 0.518528 0.726463 0.0 22.256510 1.062897 7.284009 1282.317184
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 ... 13.000000 152.076923 1.000000 1.000000 4.000000 1.0 0.000000 0.000000 0.000000 72.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN ... NaN 874.333333 NaN NaN NaN NaN 12.295779 2.293447 5.000000 338.500000
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN ... NaN 1694.600000 NaN NaN NaN NaN 27.596242 2.746607 10.000000 468.500000
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN ... NaN 2727.407692 NaN NaN NaN NaN 43.585371 3.181937 15.000000 3070.250000
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 ... 8843.000000 8102.461538 5.000000 4.000000 9.111111 1.0 130.979643 4.889364 40.000000 3242.000000

8 rows × 24 columns


In [451]:
sta_ri_bi_pa_str_sub_tr.score_mean.isnull().sum()


Out[451]:
0

Stations + Ridership + Bike Lanes + Parks + Street Quality + Subway Entrances + Trees + Traffic Volume


In [452]:
sta_ri_bi_pa_str_sub_tr_tv = pd.merge(sta_ri_bi_pa_str_sub_tr, traf_vol, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv.head()


Out[452]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... metric park Rating_B line subway_entrance score score_mean tree_count station_id AADT
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... NaN 1.0 8.000000 NaN NaN 17.364799 2.480686 7 72 14870.500000
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... 3.0 NaN 8.571429 1 1.0 9.573955 3.191318 3 79 9484.666667
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... NaN 1.0 7.333333 NaN NaN 35.070325 3.188211 11 82 16812.500000
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... NaN NaN 7.500000 NaN NaN 0.000000 0.000000 0 83 41976.000000
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 4.0 1.0 8.500000 NaN NaN 47.824344 3.188290 15 116 15948.000000

5 rows × 28 columns


In [453]:
len(sta_ri_bi_pa_str_sub_tr_tv)


Out[453]:
452

In [454]:
sta_ri_bi_pa_str_sub_tr_tv.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[454]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 ... average station ridership 2015 metric park Rating_B subway_entrance score score_mean tree_count station_id AADT
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 ... 452.000000 254.000000 146.000000 444.000000 58.0 452.000000 452.000000 452.000000 452.000000 397.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 ... 1957.844394 3.148046 1.219178 7.430723 1.0 29.709765 2.531139 10.232301 1292.373894 14851.612343
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 ... 1288.669426 0.906666 0.518528 0.726463 0.0 22.256510 1.062897 7.284009 1282.317184 13496.930454
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 ... 152.076923 1.000000 1.000000 4.000000 1.0 0.000000 0.000000 0.000000 72.000000 815.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN ... 874.333333 NaN NaN NaN NaN 12.295779 2.293447 5.000000 338.500000 NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN ... 1694.600000 NaN NaN NaN NaN 27.596242 2.746607 10.000000 468.500000 NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN ... 2727.407692 NaN NaN NaN NaN 43.585371 3.181937 15.000000 3070.250000 NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 ... 8102.461538 5.000000 4.000000 9.111111 1.0 130.979643 4.889364 40.000000 3242.000000 136343.000000

8 rows × 25 columns


In [455]:
sta_ri_bi_pa_str_sub_tr_tv.AADT.isnull().sum()


Out[455]:
55

Stations + Ridership + Bike Lanes + Parks + Street Quality + Subway Entrances + Trees + Traffic Volume + Income


In [456]:
sta_ri_bi_pa_str_sub_tr_tv_inc = pd.merge(sta_ri_bi_pa_str_sub_tr_tv, income, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv_inc.head()


Out[456]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... park Rating_B line subway_entrance score score_mean tree_count station_id AADT Median_Househould_Income
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... 1.0 8.000000 NaN NaN 17.364799 2.480686 7 72 14870.500000 90174.0
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... NaN 8.571429 1 1.0 9.573955 3.191318 3 79 9484.666667 NaN
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... 1.0 7.333333 NaN NaN 35.070325 3.188211 11 82 16812.500000 73988.0
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... NaN 7.500000 NaN NaN 0.000000 0.000000 0 83 41976.000000 85199.0
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 1.0 8.500000 NaN NaN 47.824344 3.188290 15 116 15948.000000 104974.0

5 rows × 29 columns


In [457]:
len(sta_ri_bi_pa_str_sub_tr_tv_inc)


Out[457]:
452

In [458]:
sta_ri_bi_pa_str_sub_tr_tv_inc.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[458]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 ... metric park Rating_B subway_entrance score score_mean tree_count station_id AADT Median_Househould_Income
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 ... 254.000000 146.000000 444.000000 58.0 452.000000 452.000000 452.000000 452.000000 397.000000 344.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 ... 3.148046 1.219178 7.430723 1.0 29.709765 2.531139 10.232301 1292.373894 14851.612343 86523.139535
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 ... 0.906666 0.518528 0.726463 0.0 22.256510 1.062897 7.284009 1282.317184 13496.930454 39155.801884
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 ... 1.000000 1.000000 4.000000 1.0 0.000000 0.000000 0.000000 72.000000 815.000000 30686.000000
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 12.295779 2.293447 5.000000 338.500000 NaN NaN
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 27.596242 2.746607 10.000000 468.500000 NaN NaN
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 43.585371 3.181937 15.000000 3070.250000 NaN NaN
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 ... 5.000000 4.000000 9.111111 1.0 130.979643 4.889364 40.000000 3242.000000 136343.000000 250000.000000

8 rows × 26 columns


In [459]:
sta_ri_bi_pa_str_sub_tr_tv_inc.Median_Househould_Income.isnull().sum()


Out[459]:
108

Stations + Ridership + Bike Lanes + Parks + Street Quality + Subway Entrances + Trees + Traffic Volume + Income + Population Density


In [460]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop = pd.merge(sta_ri_bi_pa_str_sub_tr_tv_inc, pop_density, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.head()


Out[460]:
Station_id Station_Name Location Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 ... Rating_B line subway_entrance score score_mean tree_count station_id AADT Median_Househould_Income density
0 72 W 52 St & 11 Ave W 52 St & 11 Ave 40.767272 -73.993929 638.0 480.0 1026.0 1948.0 2943.0 ... 8.000000 NaN NaN 17.364799 2.480686 7 72 14870.500000 90174.0 0.000807
1 79 Franklin St & W Broadway Franklin St & W Broadway 40.719116 -74.006667 566.0 335.0 725.0 1728.0 2368.0 ... 8.571429 1 1.0 9.573955 3.191318 3 79 9484.666667 NaN 0.000631
2 82 St James Pl & Pearl St St James Pl & Pearl St 40.711174 -74.000165 310.0 276.0 406.0 788.0 1068.0 ... 7.333333 NaN NaN 35.070325 3.188211 11 82 16812.500000 73988.0 0.000511
3 83 Atlantic Ave & Fort Greene Pl Atlantic Ave & Fort Greene Pl 40.683826 -73.976323 258.0 162.0 281.0 749.0 1101.0 ... 7.500000 NaN NaN 0.000000 0.000000 0 83 41976.000000 85199.0 0.000231
4 116 W 17 St & 8 Ave W 17 St & 8 Ave 40.741776 -74.001497 1696.0 1193.0 2105.0 1933.0 2281.0 ... 8.500000 NaN NaN 47.824344 3.188290 15 116 15948.000000 104974.0 0.000742

5 rows × 30 columns


In [461]:
len(sta_ri_bi_pa_str_sub_tr_tv_inc_pop)


Out[461]:
452

In [462]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[462]:
Station_id Latitude Longitude count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 ... park Rating_B subway_entrance score score_mean tree_count station_id AADT Median_Househould_Income density
count 452.000000 452.000000 452.000000 308.000000 306.000000 305.000000 304.000000 304.000000 304.000000 308.000000 ... 146.000000 444.000000 58.0 452.000000 452.000000 452.000000 452.000000 397.000000 344.000000 452.000000
mean 1292.373894 40.728453 -73.978022 848.538961 590.352941 1040.367213 1970.082237 2915.723684 2853.174342 3264.438312 ... 1.219178 7.430723 1.0 29.709765 2.531139 10.232301 1292.373894 14851.612343 86523.139535 0.000563
std 1282.317184 0.027500 0.020358 582.539528 425.804519 744.222048 1269.761682 1969.089494 1923.395346 2156.257807 ... 0.518528 0.726463 0.0 22.256510 1.062897 7.284009 1282.317184 13496.930454 39155.801884 0.000308
min 72.000000 40.678907 -74.017134 42.000000 21.000000 39.000000 59.000000 42.000000 2.000000 51.000000 ... 1.000000 4.000000 1.0 0.000000 0.000000 0.000000 72.000000 815.000000 30686.000000 0.000010
25% 338.500000 40.707529 -73.993396 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 12.295779 2.293447 5.000000 338.500000 NaN NaN 0.000344
50% 468.500000 40.725827 -73.980931 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 27.596242 2.746607 10.000000 468.500000 NaN NaN 0.000528
75% 3070.250000 40.750263 -73.960390 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 43.585371 3.181937 15.000000 3070.250000 NaN NaN 0.000739
max 3242.000000 40.787209 -73.929891 3186.000000 2143.000000 5916.000000 6945.000000 10388.000000 11052.000000 12359.000000 ... 4.000000 9.111111 1.0 130.979643 4.889364 40.000000 3242.000000 136343.000000 250000.000000 0.001877

8 rows × 27 columns


In [463]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.density.isnull().sum()


Out[463]:
0

Clean Merged Master Dataset


In [464]:
master = sta_ri_bi_pa_str_sub_tr_tv_inc_pop.drop(['Station_Name', 'Location', 'Latitude', 
             'Longitude', 'line', 'score_mean', 'tree_count', 'station_id'],
            axis=1)
master.head()


Out[464]:
Station_id count of rides 201501 count of rides 201502 count of rides 201503 count of rides 201504 count of rides 201505 count of rides 201506 count of rides 201507 count of rides 201508 count of rides 201509 ... count of rides 201512 average station ridership 2015 metric park Rating_B subway_entrance score AADT Median_Househould_Income density
0 72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 ... 1648.0 2131.615385 NaN 1.0 8.000000 NaN 17.364799 14870.500000 90174.0 0.000807
1 79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 ... 1579.0 1760.538462 3.0 NaN 8.571429 1.0 9.573955 9484.666667 NaN 0.000631
2 82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 ... 713.0 766.538462 NaN 1.0 7.333333 NaN 35.070325 16812.500000 73988.0 0.000511
3 83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 ... 717.0 863.307692 NaN NaN 7.500000 NaN 0.000000 41976.000000 85199.0 0.000231
4 116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 NaN 47.824344 15948.000000 104974.0 0.000742

5 rows × 22 columns


In [465]:
master.columns = ['station_id', 'ridership_0115', 'ridership_0215', 'ridership_0315', 'ridership_0415',
                  'ridership_0515', 'ridership_0615', 'ridership_0715', 'ridership_0815', 'ridership_0915',
                  'ridership_1015', 'ridership_1115', 'ridership_1215', 'avg_ridership_2015', 'bike_lane_score',
                  'park', 'street_quality_score', 'subway_entrance', 'tree_score', 'traffic_volume', 
                  'median_hh_income', 'pop_density']
master.set_index('station_id', inplace=True)
master.head()


Out[465]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 NaN 1.0 8.000000 NaN 17.364799 14870.500000 90174.0 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 NaN 8.571429 1.0 9.573955 9484.666667 NaN 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 NaN 1.0 7.333333 NaN 35.070325 16812.500000 73988.0 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 NaN NaN 7.500000 NaN 0.000000 41976.000000 85199.0 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 NaN 47.824344 15948.000000 104974.0 0.000742

5 rows × 21 columns


In [466]:
# Fill park, subway_entrance, and bike_lane_score NaN values with 0
master.park.fillna(0, inplace=True)
master.subway_entrance.fillna(0, inplace=True)
master.bike_lane_score.fillna(0, inplace=True)
master.head()


Out[466]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.0 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 NaN 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.0 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.0 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.0 0.000742

5 rows × 21 columns


In [467]:
# Fill street_quality_score NaN values with mean of column (there are only 5 missing)
master.street_quality_score.fillna(master.street_quality_score.mean(), inplace=True)
master.head()


Out[467]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.0 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 NaN 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.0 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.0 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.0 0.000742

5 rows × 21 columns


In [468]:
# Fill traffic_volume NaN values with mean of column (there are 55 missing)
master.traffic_volume.fillna(master.traffic_volume.mean(), inplace=True)
master.head()


Out[468]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.0 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 NaN 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.0 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.0 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.0 0.000742

5 rows × 21 columns


In [471]:
# Fill median_hh_income NaN values with mean of column (there are 108 missing)
master.median_hh_income.fillna(master.median_hh_income.mean(), inplace=True)
master.head()


Out[471]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.000000 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 86523.139535 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.000000 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.000000 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.000000 0.000742

5 rows × 21 columns


In [472]:
master.isnull().sum()


Out[472]:
ridership_0115          144
ridership_0215          146
ridership_0315          147
ridership_0415          148
ridership_0515          148
ridership_0615          148
ridership_0715          144
ridership_0815           62
ridership_0915           26
ridership_1015           14
ridership_1115            7
ridership_1215            3
avg_ridership_2015        0
bike_lane_score           0
park                      0
street_quality_score      0
subway_entrance           0
tree_score                0
traffic_volume            0
median_hh_income          0
pop_density               0
dtype: int64

In [473]:
master.to_csv('../data/processed/master.csv')

In [ ]:


In [ ]: