In [1]:
import time
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gmaps
In [2]:
## GET SEGMENT DATA
def getData(filename):
df=pd.read_csv(filename,skipinitialspace=True)
return df
df_segment_hist=getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Segment.csv')
df_segment_current=getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Segments.csv')
%matplotlib inline
In [3]:
# Preview the first two rows of the historical segment data
df_segment_hist.head(2)
Out[3]:
In [4]:
import datetime as dt

## TRANSLATE TIME STAMP INTO DATE
# Vectorized parse with pd.to_datetime instead of a per-row strptime apply
# (same result, much faster); also stops shadowing the builtin name `format`.
TIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'
df_segment_hist['DATE'] = pd.to_datetime(df_segment_hist['TIME'], format=TIME_FORMAT).dt.date
df_segment_hist['WEEKDAY'] = df_segment_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively
df_segment_hist.head(2)
Out[4]:
In [5]:
## Filter out records with no GPS data and look at records with and without midnight timestamps
## df_segment_hist.columns
# Keep only rows with a positive measured speed (zero means no GPS reads).
positive_speed = df_segment_hist['SPEED'] > 0
df_segment_gzero = df_segment_hist[positive_speed]
df_segment_gzero.head(2)
Out[5]:
In [22]:
# Report the covered date range and number of distinct dates (Python 2 print syntax)
print "max:", max(df_segment_gzero['DATE']), " min:", min(df_segment_gzero['DATE'])
print df_segment_gzero['DATE'].unique().shape
# Distinct segments reporting per day, exported for coverage inspection
datacoverage = df_segment_gzero.groupby('DATE').SEGMENTID.nunique()
datacoverage.to_csv("SegmentDataCoverage.csv")
In [6]:
## Remove unnecessary columns (TIME, ID, BUS COUNT, MESSAGE COUNT)
# Guard on 'TIME' so re-running the cell does not raise KeyError on already-dropped
# columns. One drop call with an explicit axis keyword replaces four chained drops
# using the deprecated positional axis argument.
# Note: 'BUS COUNT ' (trailing space) matches the raw CSV header exactly.
if 'TIME' in df_segment_gzero.columns:
    df_segment_gzero = df_segment_gzero.drop(
        ['TIME', 'ID', 'BUS COUNT ', 'MESSAGE COUNT'], axis=1)
df_segment_gzero.head(5)
Out[6]:
In [7]:
## Remove unnecessary columns from Current file (DESCRIPTION, LAST_UPDATED)
# Guard on 'STREET' so the cell is re-runnable; one named-axis drop replaces
# eight chained drops with the deprecated positional axis argument.
if 'STREET' in df_segment_current.columns:
    df_segment_current = df_segment_current.drop(
        ['STREET', 'FROM_STREET', 'TO_STREET', 'LENGTH', 'STREET_HEADING',
         'COMMENTS', 'CURRENT_SPEED', 'LAST_UPDATED'], axis=1)
## Take average Lat Long for each region
# Segment midpoint serves as its representative coordinate
df_segment_current['AvgLong'] = (df_segment_current['START_LONGITUDE']+df_segment_current['END_LONGITUDE'])/2
df_segment_current['AvgLat'] = (df_segment_current['START_LATITUDE']+df_segment_current['END_LATITUDE'])/2
df_segment_current.head(2)
#df_segment_current.shape
Out[7]:
In [8]:
## Group segment data by day and take average speed
import numpy as np
segmentid_groupby=df_segment_gzero.groupby(['SEGMENTID','DATE','WEEKDAY'], as_index=False)
# Aggregating a dict of function lists yields MultiIndex columns (SPEED, mean), ...
segment_speed=segmentid_groupby.agg({"SPEED": [np.mean, np.std, np.min, np.max, np.median]})
#segment_speed.head()
df_segment_all = pd.merge(left=segment_speed,right=df_segment_current, how='left', left_on='SEGMENTID', right_on='SEGMENTID')
# Flatten the merged MultiIndex columns by assigning names positionally.
# NOTE(review): this hard-coded list assumes the exact column order produced by
# the agg + merge above -- any upstream change silently mislabels columns; verify.
df_segment_all.columns = ['SEGMENTID', 'SEGMENTID2', 'DATE', 'WEEKDAY', 'SPEED', 'SPEED-SD','SPEED-MIN','SPEED-MAX','SPEED-MED','DIRECTION','START_LONGITUDE','START_LATITUDE','END_LONGITUDE','END_LATITUDE','AvgLong','AvgLat']
# Keep only the modeling columns (drops the duplicate key and raw endpoints)
df_segment_all = df_segment_all[['SEGMENTID','SPEED', 'SPEED-SD','SPEED-MIN','SPEED-MAX','SPEED-MED','DATE','WEEKDAY','DIRECTION','AvgLong','AvgLat']]
df_segment_all.head()
Out[8]:
In [34]:
## Get Chicago Bulls data
from datetime import datetime
bulls_data = getData('BullsGameData.csv')
## Extract game's date from the raw data
# Vectorized parse instead of a per-row strptime apply; avoids shadowing builtin `format`.
bulls_data['DATE'] = pd.to_datetime(bulls_data['Date'], format='%m/%d/%Y').dt.date
## Filter out null timestamps
bulls_data = bulls_data[pd.notnull(bulls_data['Time'])]
## Add GameType flag
bulls_data['GameType'] = 'Bulls'
## Take only required columns
bulls_data = bulls_data[['Date','Time','Result','OT','Lat','Long','DATE','GameType']]
bulls_data.head()
Out[34]:
In [57]:
## Get Chicago Bears game data
from datetime import datetime
bears_data = getData('BearsGameData.csv')
## Extract game's date from the raw data
# Vectorized parse instead of a per-row strptime apply; avoids shadowing builtin `format`.
bears_data['DATE'] = pd.to_datetime(bears_data['Date'], format='%m/%d/%Y').dt.date
## Code in regular game time (source has no kickoff times; assume the 1 PM EST slot)
bears_data['Time'] = '1:00p EST'
## Add GameType flag
bears_data['GameType'] = 'Bears'
## Take only required columns
bears_data = bears_data[['Date','Time','Result','OT','Lat','Long','DATE','GameType']]
bears_data.head()
Out[57]:
In [58]:
## Merge Bulls and Bears data
event_data = pd.concat([bears_data,bulls_data])
# Sanity check: merged row count should equal the sum of the two inputs (Python 2 print)
print "Bulls: ", bulls_data.shape
print "Bears: ", bears_data.shape
print "Merge: ", event_data.shape
In [59]:
from datetime import timedelta

##Extract game's start time from the raw data
# Normalize strings like '7:00p EST' -> '7:00PM' so strptime can parse them.
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', '')) #strptime doesn't work for EST
format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x: dt.datetime.strptime(x, format).time())

def addHours(tm, numhour, numminutes):
    """Shift a datetime.time by the given hours/minutes (wraps around midnight)."""
    # Anchor on an arbitrary date so timedelta arithmetic is available.
    fulldate = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    fulldate += timedelta(hours=numhour)
    fulldate += timedelta(minutes=numminutes)
    return fulldate.time()

## We need to convert EST to CST
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))

## Basketball game end time = 2:30 after start time with no OT. Each OT adds 15 minutes
## Add 1 hour pre- and post-game buffer
# Define game's start and end time
def overtime(row):
    """Extend a row's EndTime by 15 minutes per overtime period (OT..4OT)."""
    if row['OT'] == 'OT':
        return addHours(row['EndTime'], 0, 15)
    elif row['OT'] == '2OT':
        return addHours(row['EndTime'], 0, 30)
    elif row['OT'] == '3OT':
        return addHours(row['EndTime'], 0, 45)
    elif row['OT'] == '4OT':
        return addHours(row['EndTime'], 1, 0)
    else:
        return row['EndTime']

event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
# Fixed: the EndTime base was computed twice on consecutive lines; once is enough.
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data.apply(overtime, axis=1)

## Get rid of unnecessary columns
# Guard on 'Time' so re-running the cell does not raise KeyError.
if 'Time' in event_data.columns:
    event_data.drop(['Time', 'Date'], axis=1, inplace=True)
event_data.head()
Out[59]:
In [60]:
# Join daily segment speeds with game events on DATE (left join keeps non-game days)
df_segment_event=pd.merge(left=df_segment_all,right=event_data, how='left', left_on='DATE', right_on='DATE')
df_segment_event.head()
Out[60]:
In [61]:
##Add HasGame flag if game is happening during this time, and distance to the event
def hasgame(row):
    """Return 1 when the row joined to an event (StartTime present), else 0.

    Non-game dates have NaN StartTime after the left merge.
    """
    return 0 if pd.isnull(row['StartTime']) else 1
import math
def eventdistance(row): ## calculates event distance in miles
    """Great-circle distance (miles) from the segment midpoint to the event venue.

    Spherical law of cosines with an Earth radius of 3959 miles.
    Returns None for rows without a game.
    """
    if row['HasGame'] == 0:
        return None
    lat_event = math.radians(row['Lat'])
    lat_segment = math.radians(row['AvgLat'])
    dlong = math.radians(row['AvgLong']) - math.radians(row['Long'])
    return 3959 * math.acos(
        math.cos(lat_event) * math.cos(lat_segment) * math.cos(dlong)
        + math.sin(lat_event) * math.sin(lat_segment)
    )
# Derive the event features on the merged frame
df_segment_event['HasGame'] = df_segment_event.apply(hasgame, axis=1)
df_segment_event['EventDistance'] = df_segment_event.apply(eventdistance, axis=1)
df_segment_event['POSTEDSPEED']=30 # default Posted Speed: same for all of Chicago
df_segment_event.head()
# Quick look at game-day rows only
game_data = df_segment_event[(df_segment_event['HasGame'] == 1)]
game_data.head()
Out[61]:
In [62]:
# Export data set
# Persist the merged segment/event time series for downstream modeling
df_segment_event.to_csv("df_segment_event_TimeSeries.csv")
In [ ]:
In [ ]:
In [ ]:
In [23]:
## GET REGIONAL DATA
def getData(filename):
df=pd.read_csv(filename,skipinitialspace=True)
return df
df_region_hist=getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region.csv')
df_region_current=getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Regions.csv')
%matplotlib inline
In [3]:
# Preview the first two rows of the historical regional data
df_region_hist.head(2)
Out[3]:
In [4]:
# Preview the last two rows to check the end of the date range
df_region_hist.tail(2)
Out[4]:
In [24]:
import datetime as dt

# Parse the raw TIME string once (vectorized) and derive both date and time-of-day,
# instead of running strptime over every row twice; also avoids shadowing builtin `format`.
parsed_time = pd.to_datetime(df_region_hist['TIME'], format='%m/%d/%Y %I:%M:%S %p')
df_region_hist['DATE'] = parsed_time.dt.date
df_region_hist['TIMESTAMP'] = parsed_time.dt.time
In [26]:
## Filter out data with a midnight timestamp because we know it is primarily duplicate values
df_region_nonmidnight = df_region_hist[(df_region_hist['TIMESTAMP'] < dt.time(23, 50, 00))]
# Report the covered date range and number of distinct dates (Python 2 print syntax)
print "max:", max(df_region_nonmidnight['DATE']), " min:", min(df_region_nonmidnight['DATE'])
print df_region_nonmidnight['DATE'].unique().shape
# Distinct regions reporting per day, exported for coverage inspection
datacoveragereg = df_region_nonmidnight.groupby('DATE').REGION_ID.nunique()
datacoveragereg.to_csv("RegionDataCoverage.csv")
In [6]:
## Extract Hour and Minute, and round to nearest 10 minute
df_region_hist['HOUR'] = df_region_hist['TIMESTAMP'].map(lambda x: x.hour)
# Floor the minute to its 10-minute bucket (e.g. 37 -> 30)
df_region_hist['MINUTE'] = df_region_hist['TIMESTAMP'].map(lambda x: x.minute - x.minute % 10)
df_region_hist['MONTH'] = df_region_hist['DATE'].map(lambda x: x.month)
df_region_hist['YEAR'] = df_region_hist['DATE'].map(lambda x: x.year)
df_region_hist['WEEKDAY'] = df_region_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively
def traffictime(row):
    """Build the 10-minute-rounded time-of-day from a row's HOUR and MINUTE columns."""
    # Construct the time directly instead of going through a throwaway
    # dt.datetime(100, 1, 1, ...) just to call .time() on it.
    return dt.time(row['HOUR'], row['MINUTE'])
# Overwrite TIMESTAMP with the 10-minute-rounded time
df_region_hist['TIMESTAMP'] = df_region_hist.apply(traffictime, axis=1)
df_region_hist.head(10)
Out[6]:
In [7]:
## Filter out data with no GPS data and look at records with and without midnight timestamps
# Normalize the raw header (trailing space) and keep rows with at least one GPS read.
df_region_hist = df_region_hist.rename(columns={'NUMBER OF READS ': 'NUMBER OF READS'})
has_reads = df_region_hist['NUMBER OF READS'] > 0
df_region_hist = df_region_hist[has_reads]
## Filter out data with a midnight timestamp because we know it is primarily duplicate values
before_cutoff = df_region_hist['TIMESTAMP'] < dt.time(23, 50, 0)
df_region_nonmidnight = df_region_hist[before_cutoff]
## Filter for only January 2015
is_jan_2015 = (df_region_nonmidnight['MONTH'] == 1) & (df_region_nonmidnight['YEAR'] == 2015)
df_region_nonmidnight = df_region_nonmidnight[is_jan_2015]
In [8]:
## Remove unnecessary columns (TIME, ID, BUS COUNT, NUMBER OF READS)
# Guard on 'TIME' so re-running the cell does not raise KeyError; one drop call
# with an explicit axis keyword replaces eight chained drops using the
# deprecated positional axis argument.
if 'TIME' in df_region_nonmidnight.columns:
    df_region_nonmidnight = df_region_nonmidnight.drop(
        ['TIME', 'ID', 'BUS COUNT', 'NUMBER OF READS',
         'HOUR', 'MINUTE', 'YEAR', 'MONTH'], axis=1)
df_region_nonmidnight.head(5)
#df_region_nonmidnight.shape
Out[8]:
In [9]:
## Remove unnecessary columns (DESCRIPTION, LAST_UPDATED)
# Guard on 'DESCRIPTION' so the cell is re-runnable; single named-axis drop.
if 'DESCRIPTION' in df_region_current.columns:
    df_region_current = df_region_current.drop(['DESCRIPTION', 'LAST_UPDATED'], axis=1)
## Take average Lat Long for each region
# The bounding-box midpoint serves as the region's representative coordinate
df_region_current['AvgLong'] = (df_region_current['WEST']+df_region_current['EAST'])/2
df_region_current['AvgLat'] = (df_region_current['NORTH']+df_region_current['SOUTH'])/2
df_region_current.head(2)
#df_region_current.shape
Out[9]:
In [10]:
# Attach region metadata (name, midpoint coordinates) to each reading
df_region_all_Jan2015=pd.merge(left=df_region_nonmidnight,right=df_region_current, how='left', left_on='REGION_ID', right_on='REGION_ID')
df_region_all_Jan2015 = df_region_all_Jan2015[['REGION_ID','SPEED','TIMESTAMP','DATE','WEEKDAY','REGION','AvgLong','AvgLat']]
df_region_all_Jan2015.head(5)
Out[10]:
In [11]:
##### Merge with event data
from datetime import datetime
event_data = getData('BullsGameData.csv')
## Extract game's date from the raw data
# Vectorized parse instead of a per-row strptime apply; avoids shadowing builtin `format`.
event_data['DATE'] = pd.to_datetime(event_data['Date'], format='%m/%d/%Y').dt.date
## Filter for only January 2015
event_data['MONTH'] = event_data['DATE'].map(lambda x: x.month)
event_data['YEAR'] = event_data['DATE'].map(lambda x: x.year)
event_data = event_data[(event_data['MONTH'] == 1) & (event_data['YEAR'] == 2015)]
## Filter out null timestamps
event_data = event_data[pd.notnull(event_data['Time'])]
event_data.head(25)
Out[11]:
In [12]:
##Extract game's start time from the raw data
# Normalize strings like '7:00p EST' -> '7:00PM' so strptime can parse them.
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', '')) #strptime doesn't work for EST
format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x: dt.datetime.strptime(x, format).time())

from datetime import timedelta

def addHours(tm, numhour, numminutes):
    """Shift a datetime.time by the given hours/minutes (wraps around midnight)."""
    # Anchor on an arbitrary date so timedelta arithmetic is available.
    fulldate = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    fulldate += timedelta(hours=numhour)
    fulldate += timedelta(minutes=numminutes)
    return fulldate.time()

## We need to convert EST to CST
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))

## Basketball game end time = 2:30 after start time with no OT. Each OT adds 15 minutes
## Add 1 hour pre- and post-game buffer
# Define game's start and end time
def overtime(row):
    """Extend a row's EndTime by 15 minutes per overtime period (OT..4OT)."""
    if row['OT'] == 'OT':
        return addHours(row['EndTime'], 0, 15)
    elif row['OT'] == '2OT':
        return addHours(row['EndTime'], 0, 30)
    elif row['OT'] == '3OT':
        return addHours(row['EndTime'], 0, 45)
    elif row['OT'] == '4OT':
        return addHours(row['EndTime'], 1, 0)
    else:
        return row['EndTime']

event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
# Fixed: the EndTime base was computed twice on consecutive lines; once is enough.
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data.apply(overtime, axis=1)

## Get rid of unnecessary columns
# Guard on 'Time' so re-running the cell does not raise KeyError; one drop call
# replaces seven chained drops.
if 'Time' in event_data.columns:
    event_data.drop(['Time', 'AwayFlag', 'Date', 'GameNum', 'Opponent', 'MONTH', 'YEAR'],
                    axis=1, inplace=True)
event_data.head()
Out[12]:
In [13]:
# Join regional speeds with game events on DATE (left join keeps non-game days)
df_region_Jan2015_event=pd.merge(left=df_region_all_Jan2015,right=event_data, how='left', left_on='DATE', right_on='DATE')
df_region_Jan2015_event.head()
Out[13]:
In [14]:
##Add HasGame flag if game is happening during this time, and distance to the event
def hasgame(row):
    """Return 1 when the row's TIMESTAMP falls inside [StartTime, EndTime], else 0.

    Non-game dates have NaN StartTime after the left merge and always yield 0.
    """
    if pd.isnull(row['StartTime']):
        return 0
    in_window = row['StartTime'] <= row['TIMESTAMP'] <= row['EndTime']
    return 1 if in_window else 0
import math
def eventdistance(row): ## calculates event distance in miles
    """Great-circle distance (miles) from the region midpoint to the event venue.

    Spherical law of cosines with an Earth radius of 3959 miles.
    Returns None for rows without a game.
    """
    if row['HasGame'] == 0:
        return None
    lat_event = math.radians(row['Lat'])
    lat_region = math.radians(row['AvgLat'])
    dlong = math.radians(row['AvgLong']) - math.radians(row['Long'])
    return 3959 * math.acos(
        math.cos(lat_event) * math.cos(lat_region) * math.cos(dlong)
        + math.sin(lat_event) * math.sin(lat_region)
    )
# Derive the event features on the merged regional frame
df_region_Jan2015_event['HasGame'] = df_region_Jan2015_event.apply(hasgame, axis=1)
df_region_Jan2015_event['EventDistance'] = df_region_Jan2015_event.apply(eventdistance, axis=1)
df_region_Jan2015_event['POSTEDSPEED'] = 30  # default Posted Speed: same for all of Chicago
# Spot-check one game window: Jan 3 2015 at 8:30 PM.
# Fixed: dt.date(2015, 01, 03) used leading-zero integer literals, which are
# Python 2 octal syntax and a SyntaxError under Python 3.
game_data = df_region_Jan2015_event[(df_region_Jan2015_event['DATE'] == dt.date(2015, 1, 3)) & (df_region_Jan2015_event['TIMESTAMP'] == dt.time(20, 30, 0))]
game_data.head()
Out[14]:
In [31]:
# Persist the merged regional/event time series for downstream modeling
df_region_Jan2015_event.to_csv("Jan2015_TimeSeries.csv")
In [19]:
## Combine Date and Time into DateTime
def getdatetime(row):
    """Combine a row's DATE (datetime.date) and TIMESTAMP (datetime.time) into one datetime."""
    d, t = row['DATE'], row['TIMESTAMP']
    return dt.datetime(d.year, d.month, d.day, t.hour, t.minute, t.second)
## Filter for only one game day
# Fixed: dt.date(2015, 01, 05) used leading-zero integer literals (Python 2
# octal syntax, a SyntaxError under Python 3). Also take an explicit .copy()
# so the new DateTime column is added to an independent frame rather than a
# view of df_region_Jan2015_event (avoids SettingWithCopyWarning).
df_formatted_data = df_region_Jan2015_event[(df_region_Jan2015_event['DATE'] == dt.date(2015, 1, 5))].copy()
df_formatted_data['DateTime'] = df_formatted_data.apply(getdatetime, axis=1)
## Calculate the required fields
df_datasample = df_formatted_data[['DateTime','REGION_ID','SPEED']]
df_datasample.head(5)
Out[19]:
In [20]:
# Export the one-day sample consumed by the UI
df_datasample.to_csv("DataSampleForUI.csv")
In [ ]: