In [2]:
from collections import defaultdict
from dateutil.parser import parse
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
In [3]:
# various options in pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)
In [4]:
def read_mta(file_nums):
path ='http://web.mta.info/developers/data/nyct/turnstile/turnstile_'
allFiles = []
for i in file_nums:
allFiles.append(path + i + ".txt")
df_list = [pd.read_csv(file) for file in allFiles]
df = pd.concat(df_list)
df.columns = df.columns.str.strip()
return df
In [5]:
# March data
df = read_mta(['170325', '170318', '170311', '170304'])
In [6]:
df.columns
Out[6]:
In [7]:
# remove duplicates
df = df[df.DESC != 'RECOVR AUD']
df = df[df.TIME != '04:01:13']
# Sanity check to verify that "C/A", "UNIT", "SCP", "STATION", "DATE_TIME" is unique
(df
.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE', 'TIME'])
.ENTRIES.count()
.reset_index()
.sort_values("ENTRIES", ascending=False)).head()
Out[7]:
In [8]:
# data is at a turnstile level
df.head()
Out[8]:
In [9]:
df['DATETIME'] = pd.to_datetime(df.DATE + ' ' + df.TIME)
In [10]:
df = df.set_index(['DATETIME'])
#df.head()
In [11]:
# filter dataset to 4am to noon
morning = df.between_time('04:00:00', '12:00:00')
morning.head()
Out[11]:
In [12]:
# subtract max from min for each turnstile to get exits
morning_turn = (morning.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE'])['EXITS'].max()\
-morning.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE'])['EXITS'].min()).reset_index()
In [13]:
#explore outliers
morning_turn['EXITS'].describe()
Out[13]:
In [14]:
# oddly high value for 47-50
morning_turn.loc[morning_turn['STATION'] == '47-50 STS ROCK']['EXITS'].sort_values(ascending = True).tail(10)
Out[14]:
In [15]:
morning_turn['EXITS'].sort_values(ascending = True).tail(30)
Out[15]:
In [16]:
# removing values that appear to be counter resets
morning_turn = morning_turn[morning_turn.EXITS <= 11000]
In [17]:
morning_turn.head()
#morning_turn.shape
Out[17]:
In [18]:
# add up total morning exits per station
morning_station = morning_turn.groupby(['STATION', 'DATE']).sum().reset_index()
morning_station['DATE'] = pd.to_datetime(morning_station['DATE'])
morning_station.head()
Out[18]:
In [19]:
# summing for whole month
morning_month = morning_station.groupby(['STATION'])['EXITS'].sum().reset_index()
morning_month = morning_month.set_index('STATION')
In [20]:
# determine busiest stations
topsts = morning_month.sort_values('EXITS', ascending=False).head(15)
topsts.head()
topsts.plot(kind='barh', title='Morning Exits by Station')
Out[20]:
In [ ]:
In [21]:
# df.head()
In [22]:
# limit to Grand Central at turnstile level
grdcentral = df[df['STATION'] == 'GRD CNTRL-42 ST']
In [23]:
grdcentral.shape
Out[23]:
In [24]:
grdcentral.head()
Out[24]:
In [25]:
turnstiles = grdcentral['SCP'].unique()
In [26]:
masked_dfs = []
for i in turnstiles:
mask = grdcentral[((grdcentral["C/A"] == "R236") &
(grdcentral["UNIT"] == "R045") &
(grdcentral["SCP"] == i) &
(grdcentral["STATION"] == "GRD CNTRL-42 ST"))]
mask['DIFFS'] = mask['EXITS'].diff()
masked_dfs.append(mask)
In [27]:
exits_turn = pd.concat(masked_dfs)
In [28]:
exits_turn.head()
Out[28]:
In [29]:
#remove NaN values and negative values (i.e. counter resets)
exits_turn.dropna(inplace=True)
exits_turn = exits_turn[exits_turn['DIFFS'] >= 0]
In [30]:
# now have turnstile exits per four period (diffs column)
exits_turn.tail()
Out[30]:
In [31]:
# add exits across turnstiles by time
exits_turn = exits_turn.reset_index()
ct = exits_turn.groupby(['DATETIME'])['DIFFS'].sum().reset_index()
In [32]:
ct.set_index(['DATETIME'], inplace=True)
ct.head()
Out[32]:
In [33]:
we1 = ct[(ct.index > '2017-02-25 00:00:00') & (ct.index < '2017-02-27 05:00:00')]
wd1 = ct[(ct.index > '2017-02-27 00:00:00') & (ct.index < '2017-03-04 05:00:00')]
we2 = ct[(ct.index > '2017-03-04 00:00:00') & (ct.index < '2017-03-06 05:00:00')]
wd2 = ct[(ct.index > '2017-03-06 00:00:00') & (ct.index < '2017-03-11 05:00:00')]
we3 = ct[(ct.index > '2017-03-11 00:00:00') & (ct.index < '2017-03-13 05:00:00')]
wd3 = ct[(ct.index > '2017-03-13 00:00:00') & (ct.index < '2017-03-18 05:00:00')]
we4 = ct[(ct.index > '2017-03-18 00:00:00') & (ct.index < '2017-03-20 05:00:00')]
wd4 = ct[(ct.index > '2017-03-20 00:00:00') & (ct.index < '2017-03-25 05:00:00')]
In [34]:
plt.figure(figsize=(12,8))
plt.xticks(rotation=70)
plt.plot(we1, color = 'navy')
plt.plot(wd1, color = '#008080')
plt.plot(we2, color = 'navy')
plt.plot(wd2, color = '#008080')
plt.plot(we3, color = 'navy')
plt.plot(wd3, color = '#008080')
plt.plot(we4, color = 'navy')
plt.plot(wd4, color = '#008080')
plt.title('Number of People Exiting Grand Central for the Month of March')
plt.ylabel('People Exiting')
Out[34]:
In [35]:
plt.figure(figsize=(12,8))
plt.xticks(rotation=70)
plt.plot(we2, color = 'navy')
plt.plot(wd2, color = '#008080')
plt.title('Number of People Exiting Grand Central for One Week')
plt.ylabel('People Exiting')
Out[35]:
In [ ]:
In [39]:
plt.figure(figsize=(12,8))
plt.xticks(rotation=200)
weekday1 = ct[(ct.index > '2017-03-05 21:00:00') & (ct.index < '2017-03-07 03:00:00')]
weekday1['DIFFS'].plot(title = 'Number of People Exiting Grand Central for One Day', color = '#008080' )
Out[39]:
In [ ]:
In [ ]:
# weekly plot without weekends separated
# wk1 = ct[(ct.index > '2017-02-25 00:00:00') & (ct.index < '2017-03-03 23:59:59')]
# wk2 = ct[(ct.index > '2017-03-04 00:00:00') & (ct.index < '2017-03-10 23:59:59')]
# wk3 = ct[(ct.index > '2017-03-11 00:00:00') & (ct.index < '2017-03-17 23:59:59')]
# wk4 = ct[(ct.index > '2017-03-18 00:00:00') & (ct.index < '2017-03-24 23:59:59')]
# plt.plot(wk1)
# plt.plot(wk2)
# plt.plot(wk3)
# plt.plot(wk4)
In [ ]:
# daily plots (too few points to be that useful)
# day1 = ct[(ct.index > '2017-02-25 00:00:00') & (ct.index < '2017-02-25 23:59:59')]
# day2 = ct[(ct.index > '2017-02-26 00:00:00') & (ct.index < '2017-02-26 23:59:59')]
# day3 = ct[(ct.index > '2017-02-27 00:00:00') & (ct.index < '2017-02-27 23:59:59')]
# day4 = ct[(ct.index > '2017-02-28 00:00:00') & (ct.index < '2017-02-28 23:59:59')]
# day5 = ct[(ct.index > '2017-03-01 00:00:00') & (ct.index < '2017-03-01 23:59:59')]
# day6 = ct[(ct.index > '2017-03-02 00:00:00') & (ct.index < '2017-03-02 23:59:59')]
# day7 = ct[(ct.index > '2017-03-03 00:00:00') & (ct.index < '2017-03-03 23:59:59')]
# plt.close()
# day1.plot()
# day2.plot()
# day3.plot()
# day4.plot()
# day5.plot()
# day6.plot()
# day7.plot()
In [ ]: