In [1]:
# import the necessary modules
import numpy as np
import pandas as pd
In [3]:
# Load the NYC subway/weather CSV (expected in the working directory) into a DataFrame.
subway_df = pd.read_csv(filepath_or_buffer="nyc_subway_weather.csv")
In [8]:
# Explore the data a bit
In [9]:
# Preview the first five rows. The parenthesised print works on Python 2 and 3.
print(subway_df.head())
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly \
0 R003 05-01-11 00:00:00 4388333 2911002 0.0
1 R003 05-01-11 04:00:00 4388333 2911002 0.0
2 R003 05-01-11 12:00:00 4388333 2911002 0.0
3 R003 05-01-11 16:00:00 4388333 2911002 0.0
4 R003 05-01-11 20:00:00 4388333 2911002 0.0
EXITSn_hourly datetime hour day_week ... pressurei \
0 0.0 2011-05-01 00:00:00 0 6 ... 30.22
1 0.0 2011-05-01 04:00:00 4 6 ... 30.25
2 0.0 2011-05-01 12:00:00 12 6 ... 30.28
3 0.0 2011-05-01 16:00:00 16 6 ... 30.26
4 0.0 2011-05-01 20:00:00 20 6 ... 30.28
rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi \
0 0 55.9 3.5 0.0 30.258 55.98 7.86
1 0 52.0 3.5 0.0 30.258 55.98 7.86
2 0 62.1 6.9 0.0 30.258 55.98 7.86
3 0 57.9 15.0 0.0 30.258 55.98 7.86
4 0 52.0 10.4 0.0 30.258 55.98 7.86
weather_lat weather_lon
0 40.700348 -73.887177
1 40.700348 -73.887177
2 40.700348 -73.887177
3 40.700348 -73.887177
4 40.700348 -73.887177
[5 rows x 27 columns]
In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# The parenthesised print works on Python 2 and 3.
print(subway_df.describe())
ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly \
count 4.264900e+04 4.264900e+04 42649.000000 42649.000000
mean 2.812486e+07 1.986993e+07 1886.589955 1361.487866
std 3.043607e+07 2.028986e+07 2952.385585 2183.845409
min 0.000000e+00 0.000000e+00 0.000000 0.000000
25% 1.039762e+07 7.613712e+06 274.000000 237.000000
50% 1.818389e+07 1.331609e+07 905.000000 664.000000
75% 3.263049e+07 2.393771e+07 2255.000000 1537.000000
max 2.357746e+08 1.493782e+08 32814.000000 34828.000000
hour day_week weekday latitude longitude \
count 42649.000000 42649.000000 42649.000000 42649.000000 42649.000000
mean 10.046754 2.905719 0.714436 40.724647 -73.940364
std 6.938928 2.079231 0.451688 0.071650 0.059713
min 0.000000 0.000000 0.000000 40.576152 -74.073622
25% 4.000000 1.000000 0.000000 40.677107 -73.987342
50% 12.000000 3.000000 1.000000 40.717241 -73.953459
75% 16.000000 5.000000 1.000000 40.759123 -73.907733
max 20.000000 6.000000 1.000000 40.889185 -73.755383
fog ... pressurei rain tempi \
count 42649.000000 ... 42649.000000 42649.000000 42649.000000
mean 0.009824 ... 29.971096 0.224741 63.103780
std 0.098631 ... 0.137942 0.417417 8.455597
min 0.000000 ... 29.550000 0.000000 46.900000
25% 0.000000 ... 29.890000 0.000000 57.000000
50% 0.000000 ... 29.960000 0.000000 61.000000
75% 0.000000 ... 30.060000 0.000000 69.100000
max 1.000000 ... 30.320000 1.000000 86.000000
wspdi meanprecipi meanpressurei meantempi meanwspdi \
count 42649.000000 42649.000000 42649.000000 42649.000000 42649.000000
mean 6.927872 0.004618 29.971096 63.103780 6.927872
std 4.510178 0.016344 0.131158 6.939011 3.179832
min 0.000000 0.000000 29.590000 49.400000 0.000000
25% 4.600000 0.000000 29.913333 58.283333 4.816667
50% 6.900000 0.000000 29.958000 60.950000 6.166667
75% 9.200000 0.000000 30.060000 67.466667 8.850000
max 23.000000 0.157500 30.293333 79.800000 17.083333
weather_lat weather_lon
count 42649.000000 42649.000000
mean 40.728555 -73.938693
std 0.065420 0.059582
min 40.600204 -74.014870
25% 40.688591 -73.985130
50% 40.720570 -73.949150
75% 40.755226 -73.912033
max 40.862064 -73.694176
[8 rows x 21 columns]
In [11]:
def correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Each input is standardized (its mean is subtracted and the result is
    divided by its population standard deviation, ``ddof=0``); the value
    returned is the mean of the element-wise product of the two
    standardized inputs.

    Parameters
    ----------
    x, y : pandas.Series, numpy.ndarray, or plain sequence of numbers
        Paired observations. Objects that already provide ``.std`` (pandas
        and NumPy containers) are used unchanged; anything else, such as a
        list or tuple, is converted with ``np.asarray(..., dtype=float)``.

    Returns
    -------
    float
        The correlation, nominally in ``[-1, 1]``, for 1-D inputs. The
        result is NaN when either input has zero variance, because
        standardizing divides by a standard deviation of 0.

    Notes
    -----
    pandas objects are multiplied with pandas' usual index alignment, so
    two Series are expected to share the same index.
    """
    # Plain Python sequences have no .mean()/.std(); promote them to float
    # arrays so the vectorized arithmetic below works for them as well.
    if not hasattr(x, "std"):
        x = np.asarray(x, dtype=float)
    if not hasattr(y, "std"):
        y = np.asarray(y, dtype=float)

    # ddof=0 is passed explicitly: pandas defaults to ddof=1 and NumPy to
    # ddof=0, and the population form is what makes the mean of the
    # product below equal Pearson's r.
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)

    return (std_x * std_y).mean()
In [ ]:
Content source: harish-garg/Data-Analysis
Similar notebooks: