Normalize Data

  • Normalize each column using the standard score (z-score), except `park` and `subway_entrance`, which are kept on their original scale

In [11]:
# Import necessary packages
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [12]:
# Read the processed master table and key it by station_id
master = pd.read_csv('../data/processed/master.csv').set_index('station_id')
master.head()


Out[12]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score park street_quality_score subway_entrance tree_score traffic_volume median_hh_income pop_density
station_id
72 638.0 480.0 1026.0 1948.0 2943.0 2767.0 3149.0 3504.0 3667.0 3546.0 ... 1648.0 2131.615385 0.0 1.0 8.000000 0.0 17.364799 14870.500000 90174.000000 0.000807
79 566.0 335.0 725.0 1728.0 2368.0 2424.0 2626.0 2726.0 3011.0 2646.0 ... 1579.0 1760.538462 3.0 0.0 8.571429 1.0 9.573955 9484.666667 86523.139535 0.000631
82 310.0 276.0 406.0 788.0 1068.0 946.0 1193.0 1145.0 1166.0 1053.0 ... 713.0 766.538462 0.0 1.0 7.333333 0.0 35.070325 16812.500000 73988.000000 0.000511
83 258.0 162.0 281.0 749.0 1101.0 994.0 1659.0 1724.0 1505.0 1104.0 ... 717.0 863.307692 0.0 0.0 7.500000 0.0 0.000000 41976.000000 85199.000000 0.000231
116 1696.0 1193.0 2105.0 1933.0 2281.0 4800.0 5674.0 6175.0 6558.0 5825.0 ... 3686.0 3576.692308 4.0 1.0 8.500000 0.0 47.824344 15948.000000 104974.000000 0.000742

5 rows × 21 columns


In [13]:
# Standardize every column except the two that stay on their original scale
unscaled_cols = ['park', 'subway_entrance']
master_temp = master.drop(unscaled_cols, axis=1)

# Standard score per column: subtract the column mean, then divide by the
# column standard deviation (pandas' default sample std, ddof=1)
col_means = master_temp.mean()
col_stds = master_temp.std()
master_norm = master_temp.sub(col_means, axis='columns').div(col_stds, axis='columns')

# Re-attach the unscaled columns at the end of the frame, in list order
for col in unscaled_cols:
    master_norm[col] = master[col]
master_norm.head()


Out[13]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score street_quality_score tree_score traffic_volume median_hh_income pop_density park subway_entrance
station_id
72 -0.361416 -0.259163 -0.019305 -0.017391 0.013852 -0.044803 -0.053536 0.290089 0.380089 0.450563 ... -0.037283 0.134845 -1.037727 0.790673 -0.554668 0.001493 1.069154e-01 0.791144 1.0 0.0
79 -0.485013 -0.599695 -0.423754 -0.190652 -0.278161 -0.223134 -0.296086 -0.030448 0.089412 0.014532 ... -0.087781 -0.153108 0.722092 1.584334 -0.904716 -0.424358 2.130764e-15 0.218505 0.0 1.0
82 -0.924468 -0.738256 -0.852390 -0.930948 -0.938365 -0.991566 -0.960664 -0.681822 -0.728118 -0.757243 ... -0.721558 -0.924446 -1.037727 -0.135265 0.240854 0.155045 -3.670914e-01 -0.170810 1.0 0.0
83 -1.013732 -1.005985 -1.020350 -0.961663 -0.921605 -0.966611 -0.744548 -0.443273 -0.577905 -0.732535 ... -0.718631 -0.849354 -1.037727 0.096220 -1.334880 2.144695 -3.877741e-02 -1.079345 0.0 0.0
116 1.454770 1.415314 1.430531 -0.029204 -0.322344 1.012182 1.117474 1.390545 1.661106 1.554691 ... 1.454217 1.256217 1.308698 1.485126 0.813900 0.086690 5.403333e-01 0.581773 1.0 0.0

5 rows × 21 columns


In [14]:
# Sanity check: the standardized columns should report mean ~0 and std 1,
# while park and subway_entrance keep their original summary statistics.
# NOTE(review): in the recorded output the quartile rows are NaN for the
# columns whose count is below the row total, i.e. columns with missing values.
master_norm.describe()


/Users/Danny1/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[14]:
ridership_0115 ridership_0215 ridership_0315 ridership_0415 ridership_0515 ridership_0615 ridership_0715 ridership_0815 ridership_0915 ridership_1015 ... ridership_1215 avg_ridership_2015 bike_lane_score street_quality_score tree_score traffic_volume median_hh_income pop_density park subway_entrance
count 3.080000e+02 3.060000e+02 3.050000e+02 3.040000e+02 3.040000e+02 3.040000e+02 3.080000e+02 3.900000e+02 4.260000e+02 4.380000e+02 ... 4.490000e+02 4.520000e+02 4.520000e+02 4.520000e+02 4.520000e+02 4.520000e+02 4.520000e+02 4.520000e+02 452.000000 452.000000
mean 5.154607e-17 -3.800518e-17 7.516756e-17 5.843279e-18 -4.784185e-17 -3.652049e-18 1.658125e-17 -1.594166e-17 1.402113e-16 -4.258390e-17 ... 2.522110e-17 4.077368e-16 -2.603620e-17 -4.537361e-15 8.363516e-17 1.411727e-15 2.072457e-15 -4.370275e-16 0.393805 0.128319
std 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 ... 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 0.642029 0.334815
min -1.384522e+00 -1.337123e+00 -1.345522e+00 -1.505072e+00 -1.459418e+00 -1.482365e+00 -1.490285e+00 -1.147384e+00 -1.241233e+00 -1.262555e+00 ... -1.233850e+00 -1.401265e+00 -1.037727e+00 -4.764952e+00 -1.334880e+00 -1.109859e+00 -1.635190e+00 -1.798457e+00 0.000000 0.000000
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN -8.407983e-01 -1.037727e+00 -5.982333e-01 -7.824221e-01 -5.788327e-01 -5.198786e-01 -7.150162e-01 0.000000 0.000000
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN -2.042761e-01 1.354856e-01 9.621976e-02 -9.496200e-02 -1.409492e-01 2.130764e-15 -1.148388e-01 0.000000 0.000000
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 5.971767e-01 7.220917e-01 7.906729e-01 6.234404e-01 2.399850e-01 4.580717e-01 5.694449e-01 1.000000 0.000000
max 4.012536e+00 3.646385e+00 6.551315e+00 3.917993e+00 3.794788e+00 4.262684e+00 4.217752e+00 4.042186e+00 4.986611e+00 4.226593e+00 ... 5.228342e+00 4.768187e+00 1.895304e+00 2.333902e+00 4.550124e+00 9.606190e+00 4.787418e+00 4.269926e+00 4.000000 1.000000

8 rows × 21 columns


In [7]:
# Write the normalized dataframe to CSV; the station_id index is written
# as the first column (pandas' to_csv default index=True)
master_norm.to_csv('../data/processed/master_norm.csv')

In [ ]: