In [391]:
import pandas as pd
import numpy as np
In [392]:
# Station metadata (one row per station, keyed by Station_id — see merges below).
stations = pd.read_csv("../data/processed/stations.csv")
In [393]:
# Quick look at the columns and a few rows.
stations.head()
Out[393]:
In [394]:
# Row count — compared against merge results later to spot join blowups.
len(stations)
Out[394]:
In [395]:
# Ridership per station; first CSV column is the frame index.
# Presumably one column per 2015 month — see the rename in the master table below.
ridership = pd.read_csv("../data/processed/ridership.csv", index_col=0)
In [396]:
ridership.head()
Out[396]:
In [397]:
len(ridership)
Out[397]:
In [398]:
# Bike-lane quality per station (the 'metric' column, per the null check below).
bike_lanes = pd.read_csv("../data/processed/bike-lane-quality.csv", index_col=0)
In [399]:
bike_lanes.head()
Out[399]:
In [400]:
len(bike_lanes)
Out[400]:
In [401]:
# Parks near stations: one row per (station, park) pair — multiple rows per
# station are aggregated below. Only the needed columns are loaded.
parks = pd.read_csv("../data/processed/parks.csv",
usecols=['Station_id', 'signname', 'park'])
In [402]:
parks.head()
Out[402]:
In [403]:
# Total station/park pairs.
len(parks)
Out[403]:
In [404]:
# Number of distinct parks by name.
len(parks.signname.unique())
Out[404]:
In [405]:
# Collapse to one row per station; summing 'park' (a per-row numeric
# indicator, judging by the fillna(0) applied to it later) yields the number
# of parks near each station. Select the column explicitly: a bare .sum()
# also tries to aggregate the string 'signname' column — older pandas
# silently dropped it as a nuisance column, but pandas >= 2.0 concatenates
# the strings or raises instead.
parks = parks.groupby('Station_id', as_index=False)['park'].sum()
In [406]:
# Verify the aggregation: one row per Station_id with a park total.
parks.head()
Out[406]:
In [407]:
# Distribution of park counts per station.
parks.describe()
Out[407]:
In [408]:
# Street-surface assessment per station (the 'Rating_B' column, per the
# null check after the merge below).
street_qual = pd.read_csv("../data/processed/street-assessment.csv", index_col=0)
In [409]:
street_qual.head()
Out[409]:
In [410]:
len(street_qual)
Out[410]:
In [411]:
# Subway entrances near each station; only the join key and the two
# columns used downstream are loaded.
subway_ent = pd.read_csv("../data/processed/subway-entrances.csv", usecols=['Station_id', 'line', 'subway_entrance'])
In [412]:
subway_ent.head()
Out[412]:
In [413]:
len(subway_ent)
Out[413]:
In [414]:
# Tree-canopy data per station (provides 'score_mean' and 'tree_count',
# per the drop list further down).
trees = pd.read_csv("../data/processed/tree-canopy.csv")
In [415]:
trees.head()
Out[415]:
In [416]:
len(trees)
Out[416]:
In [417]:
# Traffic volume per station (the 'AADT' column — annual average daily
# traffic, presumably; confirm against the source dataset).
traf_vol = pd.read_csv("../data/processed/traffic-volume.csv", index_col=0)
In [418]:
traf_vol.head()
Out[418]:
In [419]:
len(traf_vol)
Out[419]:
In [420]:
# Median household income per station. Note: 'Househould' is a typo baked
# into the source CSV header, so it must be kept for the usecols match.
income = pd.read_csv('../data/processed/income.csv', usecols=['Station_id', 'Median_Househould_Income'])
In [421]:
income.head()
Out[421]:
In [422]:
# Strip '-', ',' and '+' from the income strings (e.g. "65,000+" -> "65000").
# regex=True is required: Series.str.replace stopped defaulting to regex
# mode in pandas 2.0, so without it the literal substring "[-,+]" would be
# searched for and the cleanup would silently do nothing.
income['Median_Househould_Income'] = income['Median_Househould_Income'].str.replace('[-,+]', '', regex=True)
# Values that consisted only of punctuation are now empty strings; mark
# them as missing so the float cast below can handle them.
income = income.replace('', np.nan)
In [423]:
# Sanity check: remaining values should be digit-only strings or NaN.
income['Median_Househould_Income'].unique()
Out[423]:
In [424]:
income.Median_Househould_Income = income['Median_Househould_Income'].astype('float')
In [425]:
len(income)
Out[425]:
In [426]:
# Population density around each station.
pop_density = pd.read_csv('../data/processed/pop_density.csv', usecols=['Station_id', 'density'])
In [427]:
pop_density.head()
Out[427]:
In [428]:
len(pop_density)
Out[428]:
In [429]:
sta_ri = pd.merge(stations, ridership, on='Station_id', how='inner')
sta_ri.head()
Out[429]:
In [430]:
len(sta_ri)
Out[430]:
In [431]:
sta_ri.describe()
Out[431]:
In [432]:
sta_ri_bi = pd.merge(sta_ri, bike_lanes, on='Station_id', how='left')
sta_ri_bi.head()
Out[432]:
In [433]:
len(sta_ri_bi)
Out[433]:
In [434]:
sta_ri_bi.describe()
Out[434]:
In [435]:
sta_ri_bi.metric.isnull().sum()
Out[435]:
In [436]:
sta_ri_bi_pa = pd.merge(sta_ri_bi, parks, on='Station_id', how='left')
sta_ri_bi_pa.head()
Out[436]:
In [437]:
len(sta_ri_bi_pa)
Out[437]:
In [438]:
sta_ri_bi_pa.describe()
Out[438]:
In [439]:
sta_ri_bi_pa.park.isnull().sum()
Out[439]:
In [440]:
sta_ri_bi_pa_str = pd.merge(sta_ri_bi_pa, street_qual, on='Station_id', how='left')
sta_ri_bi_pa_str.head()
Out[440]:
In [441]:
len(sta_ri_bi_pa_str)
Out[441]:
In [442]:
sta_ri_bi_pa_str.describe()
Out[442]:
In [443]:
sta_ri_bi_pa_str.Rating_B.isnull().sum()
Out[443]:
In [444]:
sta_ri_bi_pa_str_sub = pd.merge(sta_ri_bi_pa_str, subway_ent, on='Station_id', how='left')
sta_ri_bi_pa_str_sub.head()
Out[444]:
In [445]:
len(sta_ri_bi_pa_str_sub)
Out[445]:
In [446]:
sta_ri_bi_pa_str_sub.describe()
Out[446]:
In [447]:
sta_ri_bi_pa_str_sub.subway_entrance.isnull().sum()
Out[447]:
In [448]:
sta_ri_bi_pa_str_sub_tr = pd.merge(sta_ri_bi_pa_str_sub, trees, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr.head()
Out[448]:
In [449]:
len(sta_ri_bi_pa_str_sub_tr)
Out[449]:
In [450]:
sta_ri_bi_pa_str_sub_tr.describe()
Out[450]:
In [451]:
sta_ri_bi_pa_str_sub_tr.score_mean.isnull().sum()
Out[451]:
In [452]:
sta_ri_bi_pa_str_sub_tr_tv = pd.merge(sta_ri_bi_pa_str_sub_tr, traf_vol, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv.head()
Out[452]:
In [453]:
len(sta_ri_bi_pa_str_sub_tr_tv)
Out[453]:
In [454]:
sta_ri_bi_pa_str_sub_tr_tv.describe()
Out[454]:
In [455]:
sta_ri_bi_pa_str_sub_tr_tv.AADT.isnull().sum()
Out[455]:
In [456]:
sta_ri_bi_pa_str_sub_tr_tv_inc = pd.merge(sta_ri_bi_pa_str_sub_tr_tv, income, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv_inc.head()
Out[456]:
In [457]:
len(sta_ri_bi_pa_str_sub_tr_tv_inc)
Out[457]:
In [458]:
sta_ri_bi_pa_str_sub_tr_tv_inc.describe()
Out[458]:
In [459]:
sta_ri_bi_pa_str_sub_tr_tv_inc.Median_Househould_Income.isnull().sum()
Out[459]:
In [460]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop = pd.merge(sta_ri_bi_pa_str_sub_tr_tv_inc, pop_density, on='Station_id', how='left')
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.head()
Out[460]:
In [461]:
len(sta_ri_bi_pa_str_sub_tr_tv_inc_pop)
Out[461]:
In [462]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.describe()
Out[462]:
In [463]:
sta_ri_bi_pa_str_sub_tr_tv_inc_pop.density.isnull().sum()
Out[463]:
In [464]:
# Keep only the feature columns: drop identifiers, geography, and the
# extra columns not used downstream.
master = sta_ri_bi_pa_str_sub_tr_tv_inc_pop.drop(columns=[
    'Station_Name', 'Location', 'Latitude', 'Longitude',
    'line', 'score_mean', 'tree_count', 'station_id',
])
master.head()
Out[464]:
In [465]:
# Assign tidy snake_case names. NOTE: this rename is positional — the list
# must match the column order produced by the merges/drop above exactly.
master.columns = (
    ['station_id']
    + ['ridership_{:02d}15'.format(m) for m in range(1, 13)]
    + ['avg_ridership_2015', 'bike_lane_score', 'park',
       'street_quality_score', 'subway_entrance', 'tree_score',
       'traffic_volume', 'median_hh_income', 'pop_density']
)
master = master.set_index('station_id')
master.head()
Out[465]:
In [466]:
# Fill park, subway_entrance, and bike_lane_score NaN values with 0
# A station absent from the park / subway-entrance / bike-lane tables has
# none of those features, so NaN here genuinely means 0.
# Assign back rather than calling fillna(inplace=True) on the attribute
# accessor: that chained in-place call is SettingWithCopy-fragile and is a
# silent no-op under pandas copy-on-write (the 3.0 default).
master['park'] = master['park'].fillna(0)
master['subway_entrance'] = master['subway_entrance'].fillna(0)
master['bike_lane_score'] = master['bike_lane_score'].fillna(0)
master.head()
Out[466]:
In [467]:
# Fill street_quality_score NaN values with mean of column (there are only 5 missing)
# Fill street_quality_score NaN values with the column mean (only 5 missing).
# Assign back instead of Series.fillna(inplace=True) on the attribute
# accessor, which is unreliable under pandas copy-on-write.
master['street_quality_score'] = master['street_quality_score'].fillna(
    master['street_quality_score'].mean()
)
master.head()
Out[467]:
In [468]:
# Fill traffic_volume NaN values with mean of column (there are 55 missing)
# Fill traffic_volume NaN values with the column mean (55 missing).
# Assign back instead of Series.fillna(inplace=True) on the attribute
# accessor, which is unreliable under pandas copy-on-write.
master['traffic_volume'] = master['traffic_volume'].fillna(
    master['traffic_volume'].mean()
)
master.head()
Out[468]:
In [471]:
# Fill median_hh_income NaN values with mean of column (there are 108 missing)
# Fill median_hh_income NaN values with the column mean (108 missing).
# Assign back instead of Series.fillna(inplace=True) on the attribute
# accessor, which is unreliable under pandas copy-on-write.
master['median_hh_income'] = master['median_hh_income'].fillna(
    master['median_hh_income'].mean()
)
master.head()
Out[471]:
In [472]:
# Final check: after the imputation above, no column should have missing values.
master.isna().sum()
Out[472]:
In [473]:
# Persist the assembled feature table for downstream modeling notebooks.
master.to_csv('../data/processed/master.csv')
In [ ]:
In [ ]: