In [1]:
# import the necessary modules
import numpy as np
import pandas as pd

In [3]:
# Read the single CSV file nyc_subway_weather.csv into the DataFrame
# `subway_df`. The path is relative, so the file is looked up in the
# current working directory.
subway_df = pd.read_csv("nyc_subway_weather.csv")

In [8]:
# Explore the data a bit

In [9]:
# Preview the first rows of the dataset. The call form print(...) is used
# so the cell runs on Python 3 as well as Python 2 (same text output).
print(subway_df.head())


   UNIT     DATEn     TIMEn  ENTRIESn   EXITSn  ENTRIESn_hourly  \
0  R003  05-01-11  00:00:00   4388333  2911002              0.0   
1  R003  05-01-11  04:00:00   4388333  2911002              0.0   
2  R003  05-01-11  12:00:00   4388333  2911002              0.0   
3  R003  05-01-11  16:00:00   4388333  2911002              0.0   
4  R003  05-01-11  20:00:00   4388333  2911002              0.0   

   EXITSn_hourly             datetime  hour  day_week     ...       pressurei  \
0            0.0  2011-05-01 00:00:00     0         6     ...           30.22   
1            0.0  2011-05-01 04:00:00     4         6     ...           30.25   
2            0.0  2011-05-01 12:00:00    12         6     ...           30.28   
3            0.0  2011-05-01 16:00:00    16         6     ...           30.26   
4            0.0  2011-05-01 20:00:00    20         6     ...           30.28   

  rain  tempi  wspdi meanprecipi  meanpressurei  meantempi  meanwspdi  \
0    0   55.9    3.5         0.0         30.258      55.98       7.86   
1    0   52.0    3.5         0.0         30.258      55.98       7.86   
2    0   62.1    6.9         0.0         30.258      55.98       7.86   
3    0   57.9   15.0         0.0         30.258      55.98       7.86   
4    0   52.0   10.4         0.0         30.258      55.98       7.86   

   weather_lat  weather_lon  
0    40.700348   -73.887177  
1    40.700348   -73.887177  
2    40.700348   -73.887177  
3    40.700348   -73.887177  
4    40.700348   -73.887177  

[5 rows x 27 columns]

In [10]:
# Summary statistics for the numeric columns. The call form print(...) is
# used so the cell runs on Python 3 as well as Python 2 (same text output).
print(subway_df.describe())


           ENTRIESn        EXITSn  ENTRIESn_hourly  EXITSn_hourly  \
count  4.264900e+04  4.264900e+04     42649.000000   42649.000000   
mean   2.812486e+07  1.986993e+07      1886.589955    1361.487866   
std    3.043607e+07  2.028986e+07      2952.385585    2183.845409   
min    0.000000e+00  0.000000e+00         0.000000       0.000000   
25%    1.039762e+07  7.613712e+06       274.000000     237.000000   
50%    1.818389e+07  1.331609e+07       905.000000     664.000000   
75%    3.263049e+07  2.393771e+07      2255.000000    1537.000000   
max    2.357746e+08  1.493782e+08     32814.000000   34828.000000   

               hour      day_week       weekday      latitude     longitude  \
count  42649.000000  42649.000000  42649.000000  42649.000000  42649.000000   
mean      10.046754      2.905719      0.714436     40.724647    -73.940364   
std        6.938928      2.079231      0.451688      0.071650      0.059713   
min        0.000000      0.000000      0.000000     40.576152    -74.073622   
25%        4.000000      1.000000      0.000000     40.677107    -73.987342   
50%       12.000000      3.000000      1.000000     40.717241    -73.953459   
75%       16.000000      5.000000      1.000000     40.759123    -73.907733   
max       20.000000      6.000000      1.000000     40.889185    -73.755383   

                fog      ...          pressurei          rain         tempi  \
count  42649.000000      ...       42649.000000  42649.000000  42649.000000   
mean       0.009824      ...          29.971096      0.224741     63.103780   
std        0.098631      ...           0.137942      0.417417      8.455597   
min        0.000000      ...          29.550000      0.000000     46.900000   
25%        0.000000      ...          29.890000      0.000000     57.000000   
50%        0.000000      ...          29.960000      0.000000     61.000000   
75%        0.000000      ...          30.060000      0.000000     69.100000   
max        1.000000      ...          30.320000      1.000000     86.000000   

              wspdi   meanprecipi  meanpressurei     meantempi     meanwspdi  \
count  42649.000000  42649.000000   42649.000000  42649.000000  42649.000000   
mean       6.927872      0.004618      29.971096     63.103780      6.927872   
std        4.510178      0.016344       0.131158      6.939011      3.179832   
min        0.000000      0.000000      29.590000     49.400000      0.000000   
25%        4.600000      0.000000      29.913333     58.283333      4.816667   
50%        6.900000      0.000000      29.958000     60.950000      6.166667   
75%        9.200000      0.000000      30.060000     67.466667      8.850000   
max       23.000000      0.157500      30.293333     79.800000     17.083333   

        weather_lat   weather_lon  
count  42649.000000  42649.000000  
mean      40.728555    -73.938693  
std        0.065420      0.059582  
min       40.600204    -74.014870  
25%       40.688591    -73.985130  
50%       40.720570    -73.949150  
75%       40.755226    -73.912033  
max       40.862064    -73.694176  

[8 rows x 21 columns]

In [11]:
def correlation(x, y):
    """Return Pearson's r between ``x`` and ``y``.

    Each input is converted to standard units (subtract its mean, then
    divide by its population standard deviation, ``ddof=0``). The result
    is the mean of the element-wise product of the two standardized inputs.

    Both arguments must provide ``.mean()`` and ``.std(ddof=...)`` and
    support element-wise arithmetic, e.g. NumPy arrays or pandas Series.
    """
    def to_standard_units(values):
        # ddof=0 selects the population (not sample) standard deviation.
        centered = values - values.mean()
        return centered / values.std(ddof=0)

    z_products = to_standard_units(x) * to_standard_units(y)
    return z_products.mean()

In [ ]: