In [2]:
import time
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gmaps
In [2]:
## GET REGIONAL DATA
def getData(filename):
    """Read a CSV file into a DataFrame, trimming whitespace after delimiters."""
    return pd.read_csv(filename, skipinitialspace=True)
# Load the historical (per-region time series) and current (region metadata /
# bounding boxes) Chicago Traffic Tracker extracts from local CSVs.
df_region_hist=getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region.csv')
df_region_current=getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Regions.csv')
%matplotlib inline
In [3]:
df_region_hist.head(2)  # quick sanity check of the first rows
Out[3]:
In [4]:
df_region_hist.tail(2)  # and of the last rows, to see the date range covered
Out[4]:
In [5]:
import datetime as dt

## Parse the TIME strings (e.g. '01/03/2015 08:30:00 PM') once and split into
## calendar date and time-of-day. A single vectorized pd.to_datetime call
## replaces the original per-row strptime, which parsed every value TWICE
## (once for DATE and once for TIMESTAMP).
TIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'
parsed_time = pd.to_datetime(df_region_hist['TIME'], format=TIME_FORMAT)
df_region_hist['DATE'] = parsed_time.dt.date
df_region_hist['TIMESTAMP'] = parsed_time.dt.time
In [7]:
## Extract Hour and Minute, and round to nearest 10 minute
# TIMESTAMP holds datetime.time objects at this point (set from .time() above),
# so plain attribute access is used rather than the pandas .dt accessor.
df_region_hist['HOUR'] = df_region_hist['TIMESTAMP'].map(lambda x: x.hour)
# Floor the minute to its 10-minute bucket (e.g. 37 -> 30).
df_region_hist['MINUTE'] = df_region_hist['TIMESTAMP'].map(lambda x: x.minute - x.minute % 10)
df_region_hist['MONTH'] = df_region_hist['DATE'].map(lambda x: x.month)
df_region_hist['YEAR'] = df_region_hist['DATE'].map(lambda x: x.year)
df_region_hist['WEEKDAY'] = df_region_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively
def traffictime(row):
    """Rebuild a time-of-day value from a row's HOUR and 10-minute-floored MINUTE.

    The year/month/day (100, 1, 1) are placeholders; only the .time() part of
    the constructed datetime is returned.
    """
    placeholder = dt.datetime(100, 1, 1, row['HOUR'], row['MINUTE'], 0)
    return placeholder.time()
# Overwrite TIMESTAMP with the bucketed (10-minute-floored) time of day.
df_region_hist['TIMESTAMP'] = df_region_hist.apply(traffictime, axis=1)
df_region_hist.head(10)
Out[7]:
In [10]:
## Filter out data with no GPS data and look at records with and without midnight timestamps
# The raw header contains a trailing space in 'NUMBER OF READS '; normalise it first.
df_region_hist = df_region_hist.rename(columns = {'NUMBER OF READS ':'NUMBER OF READS'})
df_region_hist = df_region_hist[(df_region_hist['NUMBER OF READS'] > 0)]
## Filter out data with a midnight timestamp because we know it is primarily duplicate values
df_region_nonmidnight = df_region_hist[(df_region_hist['TIMESTAMP'] < dt.time(23, 50, 00))]
## Filter for only January 2015
df_region_nonmidnight = df_region_nonmidnight[(df_region_nonmidnight['MONTH'] == 1) & (df_region_nonmidnight['YEAR'] == 2015)]
In [11]:
## Remove columns not needed for the time-series analysis.
## NOTE: the original used df.drop([...], 1), which passes axis positionally —
## that form was removed in pandas 2.0. A single drop(columns=...) is both
## current and clearer; the membership guard keeps the cell safe to re-run.
drop_cols = ['TIME', 'ID', 'BUS COUNT', 'NUMBER OF READS',
             'HOUR', 'MINUTE', 'YEAR', 'MONTH']
df_region_nonmidnight = df_region_nonmidnight.drop(
    columns=[c for c in drop_cols if c in df_region_nonmidnight.columns])
df_region_nonmidnight.head(5)
#df_region_nonmidnight.shape
Out[11]:
In [12]:
## Remove unnecessary columns (DESCRIPTION, LAST_UPDATED).
## NOTE: the original used df.drop([...], 1); the positional axis argument was
## removed in pandas 2.0 — use drop(columns=...). Guarded so the cell can be
## re-run without KeyError.
df_region_current = df_region_current.drop(
    columns=[c for c in ('DESCRIPTION', 'LAST_UPDATED')
             if c in df_region_current.columns])
## Approximate each region's centre as the midpoint of its bounding box.
df_region_current['AvgLong'] = (df_region_current['WEST'] + df_region_current['EAST']) / 2
df_region_current['AvgLat'] = (df_region_current['NORTH'] + df_region_current['SOUTH']) / 2
df_region_current.head(2)
#df_region_current.shape
Out[12]:
In [13]:
## Attach region metadata (name, centre coordinates) to each observation.
## left_on and right_on were identical, so a plain on= is the idiomatic form.
df_region_all_Jan2015 = pd.merge(df_region_nonmidnight, df_region_current,
                                 how='left', on='REGION_ID')
df_region_all_Jan2015 = df_region_all_Jan2015[['REGION_ID', 'SPEED', 'TIMESTAMP', 'DATE',
                                               'WEEKDAY', 'REGION', 'AvgLong', 'AvgLat']]
df_region_all_Jan2015.head(5)
Out[13]:
In [27]:
##### Merge with event data
from datetime import datetime
# Chicago Bulls home-game schedule; used below to flag game-time traffic windows.
event_data=getData('BullsGameData.csv')
##Extract game's date from the raw data
format = '%m/%d/%Y'
event_data['DATE'] = event_data['Date'].apply(lambda x:
dt.datetime.strptime(x,format).date())
## Filter for only January 2015
event_data['MONTH'] = event_data['DATE'].map(lambda x: x.month)
event_data['YEAR'] = event_data['DATE'].map(lambda x: x.year)
event_data = event_data[(event_data['MONTH'] == 1) & (event_data['YEAR'] == 2015)]
## Filter out null timestamps
# Games with no listed start time cannot be windowed, so drop them.
event_data = event_data[pd.notnull(event_data['Time'])]
event_data.head(25)
Out[27]:
In [28]:
##Extract game's start time from the raw data
# NOTE(review): the replacements imply raw values like '7:00p EST' — confirm
# against the CSV. 'p' -> 'PM' makes the value parseable by strptime's %p.
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', '')) #strptime doesn't work for EST
format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x:
dt.datetime.strptime(x,format).time())
from datetime import timedelta
def addHours(tm, numhour, numminutes):
    """Shift a time-of-day by the given hours/minutes, wrapping around midnight.

    A throwaway date (year 100, Jan 1) anchors the arithmetic so timedelta can
    be used; only the .time() part of the result is returned.
    """
    shifted = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    shifted += timedelta(hours=numhour, minutes=numminutes)
    return shifted.time()
## We need to convert EST to CST
# Game times are published in Eastern time; Chicago (Central) is one hour behind.
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
## Basketball game end time = 2:30 after start time with no OT. Each OT adds 15 minutes
## Add 1 hour pre- and post-game buffer
# Define game's start and end time
def overtime(row):
    """Extend a game's EndTime by 15 minutes per overtime period.

    The OT column holds 'OT', '2OT', '3OT' or '4OT' for overtime games; any
    other value (e.g. NaN for a regulation game) leaves EndTime unchanged.
    """
    extra_minutes = {'OT': 15, '2OT': 30, '3OT': 45, '4OT': 60}.get(row['OT'])
    if extra_minutes is None:
        return row['EndTime']
    return addHours(row['EndTime'], extra_minutes // 60, extra_minutes % 60)
## Event-traffic window: one hour before tip-off through the (overtime-adjusted)
## end of the game plus the post-game buffer.
event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
## Regulation game length (2:30) plus buffer. NOTE: the original assigned
## EndTime twice with the identical expression — the duplicate line is removed.
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
## Extend EndTime for overtime games (requires EndTime to exist already).
event_data['EndTime'] = event_data.apply(overtime, axis=1)
## Get rid of unnecessary columns; guarded so the cell can be re-run safely.
drop_cols = ['Time', 'AwayFlag', 'Date', 'GameNum', 'Opponent', 'MONTH', 'YEAR']
event_data = event_data.drop(columns=[c for c in drop_cols if c in event_data.columns])
event_data.head()
Out[28]:
In [29]:
## Left-join traffic observations with game records on calendar date; non-game
## days get NaN event fields. left_on/right_on were identical, so use on=.
df_region_Jan2015_event = pd.merge(df_region_all_Jan2015, event_data, how='left', on='DATE')
df_region_Jan2015_event.head()
Out[29]:
In [30]:
##Add HasGame flag if game is happening during this time, and distance to the event
def hasgame(row):
    """Return 1 if the row's TIMESTAMP falls inside the game window, else 0.

    Rows from non-game days have no StartTime (NaN after the left merge) and
    always yield 0.
    """
    if pd.isnull(row['StartTime']):
        return 0
    return 1 if row['StartTime'] <= row['TIMESTAMP'] <= row['EndTime'] else 0
import math
def eventdistance(row): ## calculates event distance in miles
    """Great-circle distance in miles from a region centre to the event venue.

    Uses the spherical law of cosines with an Earth radius of 3959 miles.
    Returns None for rows with no active game.
    """
    if row['HasGame'] == 0:
        return None
    lat1 = math.radians(row['Lat'])
    lat2 = math.radians(row['AvgLat'])
    dlon = math.radians(row['AvgLong']) - math.radians(row['Long'])
    central_angle = math.acos(
        math.cos(lat1) * math.cos(lat2) * math.cos(dlon)
        + math.sin(lat1) * math.sin(lat2))
    return 3959 * central_angle
df_region_Jan2015_event['HasGame'] = df_region_Jan2015_event.apply(hasgame, axis=1)
df_region_Jan2015_event['EventDistance'] = df_region_Jan2015_event.apply(eventdistance, axis=1)
df_region_Jan2015_event['POSTEDSPEED'] = 30  # default Posted Speed: same for all of Chicago
## Spot-check one game-time snapshot. NOTE: the original used
## dt.date(2015, 01, 03) — leading-zero integer literals are a SyntaxError in
## Python 3, so plain integers are required here.
game_data = df_region_Jan2015_event[
    (df_region_Jan2015_event['DATE'] == dt.date(2015, 1, 3))
    & (df_region_Jan2015_event['TIMESTAMP'] == dt.time(20, 30, 0))]
game_data.head()
Out[30]:
In [31]:
# Persist the merged January 2015 time series; the DataFrame index is written
# as an unnamed first column and comes back on re-read.
df_region_Jan2015_event.to_csv("Jan2015_TimeSeries.csv")
In [4]:
# NOTE(review): duplicate definition of getData (already defined near the top
# of the notebook) — apparently kept so this cell also works when the later
# cells are run in a fresh session. Consider removing one copy.
def getData(filename):
df=pd.read_csv(filename,skipinitialspace=True)
return df
df_formatted_data=getData('Jan2015_TimeSeries.csv')
df_formatted_data.head(5)
Out[4]:
In [8]:
## Combine Date and Time into DateTime
import datetime as dt

def getdatetime(row):
    """Combine a row's DATE and TIMESTAMP fields into a single datetime.

    NOTE(review): after the round-trip through CSV these columns are read back
    as strings, not date/time objects — confirm they are re-parsed upstream
    before this is applied.
    """
    # Use dt.datetime so this cell does not depend on a `from datetime import
    # datetime` executed in an unrelated earlier cell.
    return dt.datetime(row['DATE'].year, row['DATE'].month, row['DATE'].day,
                       row['TIMESTAMP'].hour, row['TIMESTAMP'].minute,
                       row['TIMESTAMP'].second)

## Filter for only one game day. NOTE: the original used dt.date(2015, 01, 03);
## leading-zero integer literals are a SyntaxError in Python 3.
df_formatted_data = df_formatted_data[df_formatted_data['DATE'] == dt.date(2015, 1, 3)]
## Calculate the required fields; .copy() prevents SettingWithCopyWarning on
## the column assignments below.
df_datasample = df_formatted_data[['TIMESTAMP', 'REGION_ID', 'SPEED']].copy()
## Congestion = posted speed (30 mph city-wide) minus observed speed.
df_datasample['Congestion'] = df_datasample['SPEED'].map(lambda x: 30 - x)
df_datasample['DateTime'] = df_datasample.apply(getdatetime, axis=1)
## Get rid of unnecessary columns; guarded so the cell can be re-run safely.
if 'SPEED' in df_datasample.columns:
    df_datasample.drop('SPEED', axis=1, inplace=True)
df_datasample.head(5)
In [ ]: