In [79]:
from __future__ import division
import csv
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np 
import time

In [80]:
# Read the turnstile CSV ('rU' = universal-newline mode) and strip stray
# whitespace from the start/end of every field.
with open('turnstile_160319_mod.csv','rU') as f:
    rows = []
    for record in csv.reader(f):
        rows.append([field.strip() for field in record])

In [81]:
# Remove the header row and verify the column layout is what the rest of
# the notebook assumes.
expected_header = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
                   'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
                   'EXITS']
header = rows.pop(0)
assert header == expected_header

In [82]:
# Group readings by turnstile.
# Key: (C/A, UNIT, SCP, STATION); value: list of
# (LINENAME, DIVISION, DATE, TIME, DESC, ENTRIES, EXITS) tuples.
raw_readings = {}
for row in rows:
    turnstile_key = tuple(row[:4])
    reading = tuple(row[4:])
    raw_readings.setdefault(turnstile_key, []).append(reading)

In [83]:
#raw_readings.keys()    #inspect the turnstile keys if desired; the raw_readings dictionary answers challenge 1

In [84]:
# For each turnstile, parse the readings into a list of
# (datetime, cumulative entry count) tuples.
datetime_cumulative = {}
for turnstile, readings in raw_readings.items():
    datetime_cumulative[turnstile] = [
        (datetime.strptime(date + time,'%m/%d/%y%H:%M:%S'), int(in_cumulative))
        for _, _, date, time, _, in_cumulative, _ in readings
    ]

In [85]:
# For each turnstile, difference consecutive readings to get, per interval:
# [interval start time, change in cumulative entries, elapsed time].
datetime_count_times = {}
for turnstile, readings in datetime_cumulative.items():
    deltas = []
    for earlier, later in zip(readings, readings[1:]):
        deltas.append([earlier[0],
                       later[1] - earlier[1],
                       later[0] - earlier[0]])
    datetime_count_times[turnstile] = deltas

In [86]:
# Keep only plausible interval counts (0 to 5000 entries — discards counter
# resets / negative deltas) and drop the elapsed-time field.
datetime_counts = {}
for turnstile, interval_data in datetime_count_times.items():
    datetime_counts[turnstile] = [
        (time, count)
        for time, count, _ in interval_data
        if 0 <= count <= 5000
    ]

In [87]:
# Convert dictionary to a list of (turnstile, readings) pairs.
# NOTE: .iteritems() is Python-2-only and gone in Python 3;
# list(d.items()) produces the identical list under Python 2 and
# keeps this cell forward-compatible.
the_list = list(datetime_counts.items())

In [88]:
# Flatten the per-turnstile readings into parallel lists that will become
# the DataFrame columns.
times_list = []
stations_list = []
traffic_list = []
for turnstile_key, readings in the_list:
    station_name = turnstile_key[3]    # STATION is the 4th component of the key
    for reading_time, traffic in readings:
        times_list.append(reading_time)
        traffic_list.append(traffic)
        stations_list.append(station_name)

# Day of the week for each reading (Monday=0 ... Sunday=6).
day_num_list = [reading_time.weekday() for reading_time in times_list]

# Spot-check that the parallel lists line up.
# Parenthesized single-argument print works identically under Python 2
# (the original `print x` statements are Python-2-only syntax).
for i in range(5):
    print(stations_list[i])
    print(times_list[i])
    print(day_num_list[i])
    print(traffic_list[i])


168 ST
2016-03-12 03:00:00
5
133
168 ST
2016-03-12 07:00:00
5
467
168 ST
2016-03-12 11:00:00
5
517
168 ST
2016-03-12 15:00:00
5
477
168 ST
2016-03-12 19:00:00
5
335

In [89]:
# Assemble the flattened lists into a DataFrame, one row per reading.
# (No backslash continuations needed: lines inside braces continue implicitly.)
column_data = {
    'Station': stations_list,
    'Datetime': times_list,
    'Day Number': day_num_list,
    'Traffic': traffic_list,
}
df = pd.DataFrame(column_data)
df.head()


Out[89]:
Datetime Day Number Station Traffic
0 2016-03-12 03:00:00 5 168 ST 133
1 2016-03-12 07:00:00 5 168 ST 467
2 2016-03-12 11:00:00 5 168 ST 517
3 2016-03-12 15:00:00 5 168 ST 477
4 2016-03-12 19:00:00 5 168 ST 335

In [90]:
# Top ten single readings (busiest station/time combinations).
# df.sort(columns=...) is deprecated — it raised a FutureWarning when this
# cell ran; sort_values is the supported equivalent and matches the
# groupby cells below.
df_top_ten = df.sort_values(by=['Traffic'], ascending=False)[:10]
df_top_ten


/Users/peter/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app
Out[90]:
Datetime Day Number Station Traffic
97825 2016-03-16 12:00:00 2 34 ST-HERALD SQ 4983
56122 2016-03-17 08:00:00 3 PATH WTC 2 4743
56127 2016-03-18 05:00:00 4 PATH WTC 2 4470
56427 2016-03-14 16:00:00 0 42 ST-PORT AUTH 4073
140411 2016-03-15 16:00:00 1 86 ST 3999
169856 2016-03-17 09:00:00 3 GRD CNTRL-42 ST 3981
82071 2016-03-18 17:00:00 4 W 4 ST-WASH SQ 3966
134871 2016-03-15 16:00:00 1 86 ST 3961
165088 2016-03-14 16:00:00 0 42 ST-PORT AUTH 3956
56107 2016-03-14 09:00:00 0 PATH WTC 2 3918

In [91]:
# Total traffic per station over the whole week; show the ten busiest.
df_station_traffic = df.groupby(['Station'])['Traffic'].sum()
busiest_stations = df_station_traffic.sort_values(ascending=False)
busiest_stations[:10]


Out[91]:
Station
34 ST-PENN STA     1915644
GRD CNTRL-42 ST    1734933
34 ST-HERALD SQ    1375632
23 ST              1287165
14 ST-UNION SQ     1244204
86 ST              1226992
TIMES SQ-42 ST     1151716
42 ST-PORT AUTH     995479
FULTON ST           891354
59 ST               882741
Name: Traffic, dtype: int64

In [92]:
# Total traffic per reading timestamp; show the ten busiest times of the week.
df_time_traffic = df.groupby(['Datetime'])['Traffic'].sum()
busiest_times = df_time_traffic.sort_values(ascending=False)
busiest_times[:10]


Out[92]:
Datetime
2016-03-15 16:00:00    1918310
2016-03-16 16:00:00    1917760
2016-03-17 16:00:00    1893481
2016-03-18 16:00:00    1871236
2016-03-14 16:00:00    1793062
2016-03-16 08:00:00    1653267
2016-03-15 08:00:00    1607212
2016-03-17 08:00:00    1578437
2016-03-18 08:00:00    1514451
2016-03-14 08:00:00    1469310
Name: Traffic, dtype: int64

In [93]:
# Total traffic per weekday (Monday=0 ... Sunday=6), busiest first.
df_day_of_wk_traffic = df.groupby(['Day Number'])['Traffic'].sum()
df_day_of_wk_traffic.sort_values(ascending=False)


Out[93]:
Day Number
3    10831769
2    10725979
1    10557245
0     9988195
4     9591498
5     6271657
6     4549419
Name: Traffic, dtype: int64

In [94]:
df.to_csv("output.csv")

In [ ]: