In [11]:
# Import necessary packages
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
In [12]:
# Read the processed master table and key its rows by station_id
master = pd.read_csv('../data/processed/master.csv').set_index('station_id')
master.head()
Out[12]:
ridership_0115
ridership_0215
ridership_0315
ridership_0415
ridership_0515
ridership_0615
ridership_0715
ridership_0815
ridership_0915
ridership_1015
...
ridership_1215
avg_ridership_2015
bike_lane_score
park
street_quality_score
subway_entrance
tree_score
traffic_volume
median_hh_income
pop_density
station_id
72
638.0
480.0
1026.0
1948.0
2943.0
2767.0
3149.0
3504.0
3667.0
3546.0
...
1648.0
2131.615385
0.0
1.0
8.000000
0.0
17.364799
14870.500000
90174.000000
0.000807
79
566.0
335.0
725.0
1728.0
2368.0
2424.0
2626.0
2726.0
3011.0
2646.0
...
1579.0
1760.538462
3.0
0.0
8.571429
1.0
9.573955
9484.666667
86523.139535
0.000631
82
310.0
276.0
406.0
788.0
1068.0
946.0
1193.0
1145.0
1166.0
1053.0
...
713.0
766.538462
0.0
1.0
7.333333
0.0
35.070325
16812.500000
73988.000000
0.000511
83
258.0
162.0
281.0
749.0
1101.0
994.0
1659.0
1724.0
1505.0
1104.0
...
717.0
863.307692
0.0
0.0
7.500000
0.0
0.000000
41976.000000
85199.000000
0.000231
116
1696.0
1193.0
2105.0
1933.0
2281.0
4800.0
5674.0
6175.0
6558.0
5825.0
...
3686.0
3576.692308
4.0
1.0
8.500000
0.0
47.824344
15948.000000
104974.000000
0.000742
5 rows × 21 columns
In [13]:
# Z-score (subtract the column mean, divide by the column std) every column
# except the ones listed here, which are carried over unscaled.
unscaled_cols = ['park', 'subway_entrance']
master_temp = master.drop(unscaled_cols, axis='columns')
master_norm = master_temp.sub(master_temp.mean()).div(master_temp.std())
# Re-attach the unscaled columns at the end of the frame: park first, then subway_entrance
for col in unscaled_cols:
    master_norm[col] = master[col]
master_norm.head()
Out[13]:
ridership_0115
ridership_0215
ridership_0315
ridership_0415
ridership_0515
ridership_0615
ridership_0715
ridership_0815
ridership_0915
ridership_1015
...
ridership_1215
avg_ridership_2015
bike_lane_score
street_quality_score
tree_score
traffic_volume
median_hh_income
pop_density
park
subway_entrance
station_id
72
-0.361416
-0.259163
-0.019305
-0.017391
0.013852
-0.044803
-0.053536
0.290089
0.380089
0.450563
...
-0.037283
0.134845
-1.037727
0.790673
-0.554668
0.001493
1.069154e-01
0.791144
1.0
0.0
79
-0.485013
-0.599695
-0.423754
-0.190652
-0.278161
-0.223134
-0.296086
-0.030448
0.089412
0.014532
...
-0.087781
-0.153108
0.722092
1.584334
-0.904716
-0.424358
2.130764e-15
0.218505
0.0
1.0
82
-0.924468
-0.738256
-0.852390
-0.930948
-0.938365
-0.991566
-0.960664
-0.681822
-0.728118
-0.757243
...
-0.721558
-0.924446
-1.037727
-0.135265
0.240854
0.155045
-3.670914e-01
-0.170810
1.0
0.0
83
-1.013732
-1.005985
-1.020350
-0.961663
-0.921605
-0.966611
-0.744548
-0.443273
-0.577905
-0.732535
...
-0.718631
-0.849354
-1.037727
0.096220
-1.334880
2.144695
-3.877741e-02
-1.079345
0.0
0.0
116
1.454770
1.415314
1.430531
-0.029204
-0.322344
1.012182
1.117474
1.390545
1.661106
1.554691
...
1.454217
1.256217
1.308698
1.485126
0.813900
0.086690
5.403333e-01
0.581773
1.0
0.0
5 rows × 21 columns
In [14]:
# Summary statistics for every column of the normalized frame; as a sanity check,
# columns that were z-scored are expected to show mean ~0 and std 1.
master_norm.describe()
/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
Out[14]:
ridership_0115
ridership_0215
ridership_0315
ridership_0415
ridership_0515
ridership_0615
ridership_0715
ridership_0815
ridership_0915
ridership_1015
...
ridership_1215
avg_ridership_2015
bike_lane_score
street_quality_score
tree_score
traffic_volume
median_hh_income
pop_density
park
subway_entrance
count
3.080000e+02
3.060000e+02
3.050000e+02
3.040000e+02
3.040000e+02
3.040000e+02
3.080000e+02
3.900000e+02
4.260000e+02
4.380000e+02
...
4.490000e+02
4.520000e+02
4.520000e+02
4.520000e+02
4.520000e+02
4.520000e+02
4.520000e+02
4.520000e+02
452.000000
452.000000
mean
5.154607e-17
-3.800518e-17
7.516756e-17
5.843279e-18
-4.784185e-17
-3.652049e-18
1.658125e-17
-1.594166e-17
1.402113e-16
-4.258390e-17
...
2.522110e-17
4.077368e-16
-2.603620e-17
-4.537361e-15
8.363516e-17
1.411727e-15
2.072457e-15
-4.370275e-16
0.393805
0.128319
std
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
...
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
1.000000e+00
0.642029
0.334815
min
-1.384522e+00
-1.337123e+00
-1.345522e+00
-1.505072e+00
-1.459418e+00
-1.482365e+00
-1.490285e+00
-1.147384e+00
-1.241233e+00
-1.262555e+00
...
-1.233850e+00
-1.401265e+00
-1.037727e+00
-4.764952e+00
-1.334880e+00
-1.109859e+00
-1.635190e+00
-1.798457e+00
0.000000
0.000000
25%
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
-8.407983e-01
-1.037727e+00
-5.982333e-01
-7.824221e-01
-5.788327e-01
-5.198786e-01
-7.150162e-01
0.000000
0.000000
50%
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
-2.042761e-01
1.354856e-01
9.621976e-02
-9.496200e-02
-1.409492e-01
2.130764e-15
-1.148388e-01
0.000000
0.000000
75%
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
5.971767e-01
7.220917e-01
7.906729e-01
6.234404e-01
2.399850e-01
4.580717e-01
5.694449e-01
1.000000
0.000000
max
4.012536e+00
3.646385e+00
6.551315e+00
3.917993e+00
3.794788e+00
4.262684e+00
4.217752e+00
4.042186e+00
4.986611e+00
4.226593e+00
...
5.228342e+00
4.768187e+00
1.895304e+00
2.333902e+00
4.550124e+00
9.606190e+00
4.787418e+00
4.269926e+00
4.000000
1.000000
8 rows × 21 columns
In [7]:
# Persist the normalized table as CSV in the same directory as the processed master file
# NOTE(review): in the saved notebook this cell's execution count (7) is lower than
# the cells above it (11-14), so the file on disk may predate the current master_norm --
# re-run the notebook top to bottom before relying on it.
master_norm.to_csv(path_or_buf='../data/processed/master_norm.csv')
In [ ]:
Content source: pichot/citibike-publicspace
Similar notebooks: