Topic: Challenge Set 1
Subject: Explore MTA turnstile data
Date: 01/23/2017
Name: Prashant Tatineni
Challenge 1
In [1]:
import pandas as pd
In [2]:
# The four weekly MTA turnstile extracts, listed in the order they are bound to wk1..wk4.
_weekly_urls = (
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_161224.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_161231.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_170107.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_170114.txt',
)
# One DataFrame per weekly file (each read is a network download).
wk1, wk2, wk3, wk4 = [pd.read_csv(url) for url in _weekly_urls]
In [3]:
# Give every weekly frame the same eleven column labels, assigned by position.
_TURNSTILE_COLUMNS = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
                      'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
for weekly_frame in (wk1, wk2, wk3, wk4):
    # A fresh list per frame, mirroring the separate literals used before.
    weekly_frame.columns = list(_TURNSTILE_COLUMNS)
In [9]:
# Map each turnstile, keyed by (C/A, UNIT, SCP, STATION), to the list of its
# remaining fields, one inner list per row of the week-1 frame, in row order.
MTA_dict = {}
for _, record in wk1.iterrows():
    turnstile = (record['C/A'], record['UNIT'], record['SCP'], record['STATION'])
    details = [
        record[column]
        for column in ('LINENAME', 'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS')
    ]
    if turnstile not in MTA_dict:
        MTA_dict[turnstile] = []
    MTA_dict[turnstile].append(details)
In [14]:
# Show the first three rows stored for one sample turnstile.
MTA_dict[('N134', 'R385', '00-00-00', 'ROCKAWAY BLVD')][:3]
Out[14]:
Challenge 2
In [15]:
import dateutil
In [16]:
# Importing the bare `dateutil` package does not guarantee that its `parser`
# submodule is loaded; import the submodule explicitly so this cell does not
# depend on another library having imported it first.
import dateutil.parser

# Map each turnstile, keyed by (C/A, UNIT, SCP, STATION), to its time series:
# a list of [timestamp, ENTRIES] pairs in the row order of the week-1 frame.
MTA_time = {}
for _, row in wk1.iterrows():
    turnstile = (row['C/A'], row['UNIT'], row['SCP'], row['STATION'])
    # DATE and TIME are separate text columns; join them and parse once.
    observed_at = dateutil.parser.parse(row['DATE'] + ' ' + row['TIME'])
    MTA_time.setdefault(turnstile, []).append([observed_at, row['ENTRIES']])
In [29]:
# Show the first three [timestamp, ENTRIES] readings for one sample turnstile.
MTA_time[('N134', 'R385', '00-00-00', 'ROCKAWAY BLVD')][:3]
Out[29]:
Challenge 3
In [26]:
import datetime
def _daily_entries(readings):
    """Collapse one turnstile's counter readings into per-day entry counts.

    Parameters
    ----------
    readings : list of [datetime, count]
        Timestamped counter readings for a single turnstile. The count is
        treated as a running (cumulative) total, which is how the previous
        implementation also used it when it subtracted readings.

    Returns
    -------
    list of [datetime.date, count]
        One pair per calendar day that has at least one reading, in
        chronological order.

    Notes
    -----
    The previous version cut the series into fixed chunks of six readings and
    used ``reading[i + 5] - reading[i]``. That assumes exactly six readings
    per day (one missing or extra reading shifts every later "day") and it
    spans only five of the six intervals in a chunk. Here readings are
    grouped by their actual calendar date instead:

    * a day followed by the next calendar day is measured from its first
      reading to the next day's first reading (a full day);
    * the last day, or a day followed by a gap, is measured from its first to
      its last reading (a partial day).

    Counter resets are not corrected and still show up as negative or very
    large values.
    """
    # Sort defensively; the grouping below relies on chronological order.
    ordered = sorted(readings, key=lambda reading: reading[0])

    # One [date, first_count, last_count] record per calendar day.
    days = []
    for stamp, count in ordered:
        day = stamp.date()
        if days and days[-1][0] == day:
            days[-1][2] = count
        else:
            days.append([day, count, count])

    totals = []
    for position, (day, first_count, last_count) in enumerate(days):
        closing = last_count
        if position + 1 < len(days):
            next_day, next_first_count, _ = days[position + 1]
            # Only borrow the next day's opening reading when it really is
            # the following calendar day; otherwise the span would silently
            # cover several days.
            if (next_day - day).days == 1:
                closing = next_first_count
        totals.append([day, closing - first_count])
    return totals


# Same turnstile keys as MTA_time; each value is a list of [date, entries] pairs.
MTA_date = {}
for key in MTA_time:
    MTA_date[key] = _daily_entries(MTA_time[key])
In [27]:
# Show the first three [date, entries] pairs for one sample turnstile.
MTA_date[('N134', 'R385', '00-00-00', 'ROCKAWAY BLVD')][:3]
Out[27]:
Challenge 4
In [30]:
%matplotlib inline
import matplotlib.pyplot as plt
In [31]:
# Split one turnstile's [date, entries] pairs into parallel x / y lists for plotting.
_sample_series = MTA_date[('N134', 'R385', '00-00-00', 'ROCKAWAY BLVD')]
dates = [day for day, _ in _sample_series]
counts = [entries for _, entries in _sample_series]
In [32]:
# Daily entries for the sample turnstile. The title and axis labels let the
# figure stand on its own; the trailing ';' hides the return-value repr.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(dates, counts)
ax.set(
    title='Daily entries: turnstile N134 / R385 / 00-00-00, ROCKAWAY BLVD',
    xlabel='Date',
    ylabel='Entries',
);
Out[32]:
Challenge 5
In [162]:
# Preview a few turnstiles rather than echoing the whole dictionary, which
# floods the notebook output with every turnstile's daily series.
list(MTA_date.items())[:3]
Out[162]:
In [33]:
# Regroup the per-turnstile daily series under (C/A, UNIT, STATION); each value
# is a list of [SCP, daily_series] pairs, one per turnstile sharing that key.
MTA_date_station = {}
for (control_area, unit, scp, station), daily_series in MTA_date.items():
    station_key = (control_area, unit, station)
    if station_key not in MTA_date_station:
        MTA_date_station[station_key] = []
    MTA_date_station[station_key].append([scp, daily_series])
In [35]:
# Show the first [SCP, daily_series] entry stored for one sample station key.
MTA_date_station[('N134', 'R385', 'ROCKAWAY BLVD')][0]
Out[35]:
Challenge 6
In [36]:
# For every (C/A, UNIT, STATION) key, add up the daily counts of all its
# turnstiles, giving one {date: total} dictionary per key.
MTA_date_station_ts = {}
for station_key, turnstiles in MTA_date_station.items():
    totals_by_day = {}
    for _scp, daily_series in turnstiles:
        for day, entries in daily_series:
            totals_by_day[day] = totals_by_day.get(day, 0) + entries
    MTA_date_station_ts[station_key] = totals_by_day
In [40]:
# Show the {date: total} dictionary for one sample station key.
MTA_date_station_ts[('N134', 'R385', 'ROCKAWAY BLVD')]
Out[40]:
Challenge 7
In [49]:
# Dates for the sample station key in ascending order (iterating a dict yields its keys).
dates = sorted(MTA_date_station_ts[('N134', 'R385', 'ROCKAWAY BLVD')])
In [51]:
# Totals for the sample station key, lined up with the sorted `dates` list.
_station_totals = MTA_date_station_ts[('N134', 'R385', 'ROCKAWAY BLVD')]
counts = [_station_totals[day] for day in dates]
In [52]:
# Daily entries summed over every turnstile under the sample station key. The
# title and axis labels let the figure stand on its own; the trailing ';'
# hides the return-value repr.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(dates, counts)
ax.set(
    title='Daily entries: all turnstiles at N134 / R385, ROCKAWAY BLVD',
    xlabel='Date',
    ylabel='Entries',
);
Out[52]:
Challenge 8
In [58]:
# Challenge 8: per-station daily totals for each of the four weekly files.
#
# The load -> daily counts -> station totals pipeline used to be pasted once
# per week. It now lives in one helper that is applied to every weekly frame,
# week 1 included, so all four series are produced by exactly the same logic.
# The helper keeps its intermediates local, so MTA_time, MTA_date and
# MTA_date_station are no longer overwritten with later weeks' data here.

# Importing the bare `dateutil` package does not guarantee that its `parser`
# submodule is loaded, so import the submodule explicitly.
import dateutil.parser


def _entries_by_day(readings):
    """Return chronological [date, entries] pairs for one turnstile.

    `readings` is a list of (timestamp, count) pairs where count is treated as
    a running total. Readings are grouped by calendar date rather than by
    fixed chunks of six, because a chunk only lines up with a day when a
    turnstile reports exactly six readings every day, and the first-to-sixth
    difference spans only five of the six intervals. A day followed by the
    next calendar day is measured up to that day's first reading; the last
    day, or a day followed by a gap, is measured up to its own last reading.
    Counter resets are not corrected.
    """
    ordered = sorted(readings, key=lambda reading: reading[0])

    # One [date, first_count, last_count] record per calendar day.
    days = []
    for stamp, count in ordered:
        day = stamp.date()
        if days and days[-1][0] == day:
            days[-1][2] = count
        else:
            days.append([day, count, count])

    totals = []
    for position, (day, first_count, last_count) in enumerate(days):
        closing = last_count
        if position + 1 < len(days):
            next_day, next_first_count, _ = days[position + 1]
            if (next_day - day).days == 1:
                closing = next_first_count
        totals.append([day, closing - first_count])
    return totals


def _weekly_station_totals(week_frame):
    """Return {(C/A, UNIT, STATION): {date: entries}} for one weekly frame.

    `week_frame` must carry the columns C/A, UNIT, SCP, STATION, DATE, TIME
    and ENTRIES. Each turnstile (C/A, UNIT, SCP, STATION) is reduced to daily
    counts, and the counts of all turnstiles sharing (C/A, UNIT, STATION) are
    added together per date.
    """
    # Step 1: timestamped ENTRIES readings per turnstile.
    readings_by_turnstile = {}
    for _, row in week_frame.iterrows():
        turnstile = (row['C/A'], row['UNIT'], row['SCP'], row['STATION'])
        stamp = dateutil.parser.parse(row['DATE'] + ' ' + row['TIME'])
        readings_by_turnstile.setdefault(turnstile, []).append((stamp, row['ENTRIES']))

    # Step 2: daily counts per turnstile, summed into the station-level key.
    station_totals = {}
    for (control_area, unit, _scp, station), readings in readings_by_turnstile.items():
        per_day = station_totals.setdefault((control_area, unit, station), {})
        for day, entries in _entries_by_day(readings):
            per_day[day] = per_day.get(day, 0) + entries
    return station_totals


def _chronological_counts(station_totals, station_key):
    """Return one station's daily totals as a list ordered by date.

    Sorting the dates keeps the plotted points in calendar order instead of
    relying on dictionary iteration order.
    """
    per_day = station_totals[station_key]
    return [per_day[day] for day in sorted(per_day)]


_SAMPLE_STATION = ('N134', 'R385', 'ROCKAWAY BLVD')

# Week 1
MTA_wk1 = _weekly_station_totals(wk1)
week1 = _chronological_counts(MTA_wk1, _SAMPLE_STATION)
# Week 2
MTA_wk2 = _weekly_station_totals(wk2)
week2 = _chronological_counts(MTA_wk2, _SAMPLE_STATION)
# Week 3
MTA_wk3 = _weekly_station_totals(wk3)
week3 = _chronological_counts(MTA_wk3, _SAMPLE_STATION)
# Week 4
MTA_wk4 = _weekly_station_totals(wk4)
week4 = _chronological_counts(MTA_wk4, _SAMPLE_STATION)
In [60]:
# Overlay the four weekly series for the sample station key. Without labels
# and a legend the four lines cannot be told apart; the trailing ';' hides
# the return-value repr.
fig, ax = plt.subplots()
for label, series in (('Week 1', week1), ('Week 2', week2),
                      ('Week 3', week3), ('Week 4', week4)):
    ax.plot(series, label=label)
ax.set(
    title='Daily entries by week: N134 / R385, ROCKAWAY BLVD',
    xlabel='Position of the day within the weekly series',
    ylabel='Entries',
)
ax.legend();
Out[60]:
Challenge 9
In [64]:
# Four-week total per station key, for every key present in week 1.
# A key that is missing from a later week used to raise KeyError; it now
# contributes nothing for that week, so its total covers only the weeks in
# which it appears.
total_ridership = {}
for station in MTA_wk1:
    total_ridership[station] = sum(
        sum(week.get(station, {}).values())
        for week in (MTA_wk1, MTA_wk2, MTA_wk3, MTA_wk4)
    )
Challenge 10
In [81]:
# Station keys of the four-week totals. Not referenced again in the visible cells.
stations = total_ridership.keys()
In [82]:
# Four-week totals, one per station key. NOTE(review): this rebinds `counts`,
# which earlier cells use for a plotted list of daily values.
counts = total_ridership.values()
In [118]:
# Remove errors: negative and large positive values.
# The trimming is positional: the totals are sorted ascending, then a fixed
# number of values is dropped from each end.
# NOTE(review): these two counts look hand-picked for one particular run;
# re-check them whenever the input files or the counting logic change.
_N_LOWEST_DROPPED = 19
_N_HIGHEST_DROPPED = 9
_ascending_totals = sorted(total_ridership.values())
total_ridership_counts = _ascending_totals[_N_LOWEST_DROPPED:-_N_HIGHEST_DROPPED]
In [119]:
# Distribution of the trimmed four-week totals. The title and axis labels let
# the figure stand on its own; the trailing ';' hides the return-value repr.
fig, ax = plt.subplots()
ax.hist(total_ridership_counts)
ax.set(
    title='Four-week total entries per station key (C/A, UNIT, STATION)',
    xlabel='Total entries',
    ylabel='Number of station keys',
);
Out[119]: