In [79]:
from __future__ import division
import csv
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np
import time
In [80]:
with open('turnstile_160319_mod.csv','rU') as f: #open the link to the data
reader = csv.reader(f) #read in the data
rows = [[cell.strip() for cell in row] for row in reader] #loop over the rows and witin the rows to extract the data and remove any whitespace from the beginning and end
In [81]:
assert rows.pop(0) == ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
'EXITS'] #check to make sure that we have the correct variable names
In [82]:
raw_readings = {} #an empty dictionary
for row in rows: #loop over the elements in the list rows
raw_readings.setdefault(tuple(row[:4]), []).append(tuple(row[4:])) #loop over the rows, add new keys to the dictionary when the row info is not already present, append to existing data when it is
In [83]:
#raw_readings.keys() #the dictionary raw_readings solves challenge 1
In [84]:
datetime_cumulative = {turnstile: [(datetime.strptime(date + time,'%m/%d/%y%H:%M:%S'),int(in_cumulative))
for _, _, date, time,_, in_cumulative, _ in rows]
for turnstile, rows in raw_readings.items()} #make a new dictionary; the keys will be the turnstiles, and each will have a list of tuples consisting of date and cumulative total
In [85]:
datetime_count_times = {turnstile: [[rows[i][0],
rows[i+1][1] - rows[i][1],
rows[i+1][0] - rows[i][0]]
for i in range(len(rows) - 1)]
for turnstile, rows in datetime_cumulative.items()} #make a new dictionary; keys will be the turnstiles, each will have a list of lists, each of which will give date, change in entries from last time, and elasped time
In [86]:
datetime_counts = {turnstile: [(time, count)
for (time, count, _) in rows
if 0 <= count <= 5000]
for turnstile, rows in datetime_count_times.items()} #make one more dictionary; keys will be turnstiles, each will have a list of lists with date and change in entries
In [87]:
# Convert dictionary to list
the_list = list(datetime_counts.iteritems())
In [88]:
# Build set of times
times_list = []
stations_list =[]
traffic_list = []
for the_station in the_list:
for values in the_station[1]:
times_list.append(values[0])
traffic_list.append(values[1])
stations_list.append(the_station[0][3])
# Get day of the week
day_num_list = []
for date_time in times_list:
day_num_list.append(date_time.weekday())
# Check to see if entries match up
for i in range(5):
print stations_list[i]
print times_list[i]
print day_num_list[i]
print traffic_list[i]
In [89]:
# Build dataframe from lists
df = pd.DataFrame({'Station': stations_list,\
'Datetime': times_list,\
'Day Number': day_num_list,\
'Traffic': traffic_list})
df.head()
Out[89]:
In [90]:
# Top ten busiest stations at any given time
df_top_ten = df.sort(['Traffic'], ascending=False)[:10]
df_top_ten
Out[90]:
In [91]:
# Top ten busiest stations
# Group by station, sum traffic over one week
df_station_traffic= df.groupby(['Station'])['Traffic'].sum()
df_station_traffic.sort_values(ascending=False)[:10] # Top ten busiest stations in the week
Out[91]:
In [92]:
# Busiest times of the week
# Group by time of day
df_time_traffic= df.groupby(['Datetime'])['Traffic'].sum()
df_time_traffic.sort_values(ascending=False)[:10]
Out[92]:
In [93]:
# Busiest days of the week (in descending order of traffic)
# Group by weekday
df_day_of_wk_traffic= df.groupby(['Day Number'])['Traffic'].sum()
df_day_of_wk_traffic.sort_values(ascending=False)
#the_days = ['Mon', 'Tue','Wed','Thu','Fri','Sat','Sun']
Out[93]:
In [94]:
df.to_csv("output.csv")
In [ ]: