In [79]:
from __future__ import division
import csv
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np 
import time

In [80]:
# Read the turnstile CSV ('rU' = universal-newline mode) and strip stray
# whitespace from the start/end of every field.
with open('turnstile_160319_mod.csv','rU') as f:
    rows = []
    for record in csv.reader(f):
        rows.append([field.strip() for field in record])

In [81]:
# Remove the header row and verify the column layout is what the rest of
# the notebook assumes.
expected_header = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
                   'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
                   'EXITS']
header = rows.pop(0)
assert header == expected_header

In [82]:
# Group readings by turnstile.
# Key: (C/A, UNIT, SCP, STATION); value: list of
# (LINENAME, DIVISION, DATE, TIME, DESC, ENTRIES, EXITS) tuples.
raw_readings = {}
for row in rows:
    turnstile_key = tuple(row[:4])
    reading = tuple(row[4:])
    raw_readings.setdefault(turnstile_key, []).append(reading)

In [83]:
#raw_readings.keys()    #inspect the turnstile keys if desired; the raw_readings dictionary answers challenge 1

In [84]:
# For each turnstile, parse the readings into a list of
# (datetime, cumulative entry count) tuples.
datetime_cumulative = {}
for turnstile, readings in raw_readings.items():
    datetime_cumulative[turnstile] = [
        (datetime.strptime(date + time,'%m/%d/%y%H:%M:%S'), int(in_cumulative))
        for _, _, date, time, _, in_cumulative, _ in readings
    ]

In [85]:
# For each turnstile, difference consecutive readings to get, per interval:
# [interval start time, change in cumulative entries, elapsed time].
datetime_count_times = {}
for turnstile, readings in datetime_cumulative.items():
    deltas = []
    for earlier, later in zip(readings, readings[1:]):
        deltas.append([earlier[0],
                       later[1] - earlier[1],
                       later[0] - earlier[0]])
    datetime_count_times[turnstile] = deltas

In [86]:
# Keep only plausible interval counts (0 to 5000 entries — discards counter
# resets / negative deltas) and drop the elapsed-time field.
datetime_counts = {}
for turnstile, interval_data in datetime_count_times.items():
    datetime_counts[turnstile] = [
        (time, count)
        for time, count, _ in interval_data
        if 0 <= count <= 5000
    ]

In [87]:
# Convert dictionary to a list of (turnstile, readings) pairs.
# NOTE: .iteritems() is Python-2-only and gone in Python 3;
# list(d.items()) produces the identical list under Python 2 and
# keeps this cell forward-compatible.
the_list = list(datetime_counts.items())

In [88]:
# Flatten the per-turnstile readings into parallel lists that will become
# the DataFrame columns.
times_list = []
stations_list = []
traffic_list = []
for turnstile_key, readings in the_list:
    station_name = turnstile_key[3]    # STATION is the 4th component of the key
    for reading_time, traffic in readings:
        times_list.append(reading_time)
        traffic_list.append(traffic)
        stations_list.append(station_name)

# Day of the week for each reading (Monday=0 ... Sunday=6).
day_num_list = [reading_time.weekday() for reading_time in times_list]

# Spot-check that the parallel lists line up.
# Parenthesized single-argument print works identically under Python 2
# (the original `print x` statements are Python-2-only syntax).
for i in range(5):
    print(stations_list[i])
    print(times_list[i])
    print(day_num_list[i])
    print(traffic_list[i])


168 ST
2016-03-12 03:00:00
5
133
168 ST
2016-03-12 07:00:00
5
467
168 ST
2016-03-12 11:00:00
5
517
168 ST
2016-03-12 15:00:00
5
477
168 ST
2016-03-12 19:00:00
5
335

In [89]:
# Assemble the flattened lists into a DataFrame, one row per reading.
# (No backslash continuations needed: lines inside braces continue implicitly.)
column_data = {
    'Station': stations_list,
    'Datetime': times_list,
    'Day Number': day_num_list,
    'Traffic': traffic_list,
}
df = pd.DataFrame(column_data)
df.head()


Out[89]:
Datetime Day Number Station Traffic
0 2016-03-12 03:00:00 5 168 ST 133
1 2016-03-12 07:00:00 5 168 ST 467
2 2016-03-12 11:00:00 5 168 ST 517
3 2016-03-12 15:00:00 5 168 ST 477
4 2016-03-12 19:00:00 5 168 ST 335

In [90]:
# Top ten single readings (busiest station/time combinations).
# df.sort(columns=...) is deprecated — it raised a FutureWarning when this
# cell ran; sort_values is the supported equivalent and matches the
# groupby cells below.
df_top_ten = df.sort_values(by=['Traffic'], ascending=False)[:10]
df_top_ten


/Users/peter/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app
Out[90]:
Datetime Day Number Station Traffic
97825 2016-03-16 12:00:00 2 34 ST-HERALD SQ 4983
56122 2016-03-17 08:00:00 3 PATH WTC 2 4743
56127 2016-03-18 05:00:00 4 PATH WTC 2 4470
56427 2016-03-14 16:00:00 0 42 ST-PORT AUTH 4073
140411 2016-03-15 16:00:00 1 86 ST 3999
169856 2016-03-17 09:00:00 3 GRD CNTRL-42 ST 3981
82071 2016-03-18 17:00:00 4 W 4 ST-WASH SQ 3966
134871 2016-03-15 16:00:00 1 86 ST 3961
165088 2016-03-14 16:00:00 0 42 ST-PORT AUTH 3956
56107 2016-03-14 09:00:00 0 PATH WTC 2 3918

In [91]:
# Total traffic per station over the whole week; show the ten busiest.
df_station_traffic = df.groupby(['Station'])['Traffic'].sum()
busiest_stations = df_station_traffic.sort_values(ascending=False)
busiest_stations[:10]


Out[91]:
Station
34 ST-PENN STA     1915644
GRD CNTRL-42 ST    1734933
34 ST-HERALD SQ    1375632
23 ST              1287165
14 ST-UNION SQ     1244204
86 ST              1226992
TIMES SQ-42 ST     1151716
42 ST-PORT AUTH     995479
FULTON ST           891354
59 ST               882741
Name: Traffic, dtype: int64

In [92]:
# Total traffic per reading timestamp; show the ten busiest times of the week.
df_time_traffic = df.groupby(['Datetime'])['Traffic'].sum()
busiest_times = df_time_traffic.sort_values(ascending=False)
busiest_times[:10]


Out[92]:
Datetime
2016-03-15 16:00:00    1918310
2016-03-16 16:00:00    1917760
2016-03-17 16:00:00    1893481
2016-03-18 16:00:00    1871236
2016-03-14 16:00:00    1793062
2016-03-16 08:00:00    1653267
2016-03-15 08:00:00    1607212
2016-03-17 08:00:00    1578437
2016-03-18 08:00:00    1514451
2016-03-14 08:00:00    1469310
Name: Traffic, dtype: int64

In [93]:
# Total traffic per weekday (Monday=0 ... Sunday=6), busiest first.
df_day_of_wk_traffic = df.groupby(['Day Number'])['Traffic'].sum()
df_day_of_wk_traffic.sort_values(ascending=False)


Out[93]:
Day Number
3    10831769
2    10725979
1    10557245
0     9988195
4     9591498
5     6271657
6     4549419
Name: Traffic, dtype: int64

In [94]:
df.to_csv("output.csv")

In [ ]: