Getting Chicago Data


In [2]:
import time
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gmaps


:0: FutureWarning: IPython widgets are experimental and may change in the future.

Historical Data by Region


In [2]:
## GET REGIONAL DATA

def getData(filename):
    df=pd.read_csv(filename,skipinitialspace=True)
    return df

df_region_hist=getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region.csv')

df_region_current=getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Regions.csv')

%matplotlib inline

In [3]:
df_region_hist.head(2)


Out[3]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID
0 01/31/2015 11:50:26 PM 1 9 135 26.59 bf2f4db8f620658c9a084918c964883479c594f9
1 01/31/2015 11:50:26 PM 1 19 257 21.14 75455d6eb934e4ef4ba965e3a3f913e5ba1c16fe

In [4]:
df_region_hist.tail(2)


Out[4]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID
3179964 01/18/2013 11:50:26 PM 29 1 14 0.00 90c77f407f063ac107dc3396e429e9da4a080944
3179965 01/18/2013 11:50:26 PM 29 97 880 23.18 6ed9793b50e023386e7ef33effeb8a0a5fcb870b

In [5]:
import datetime as dt
format = '%m/%d/%Y %I:%M:%S %p'
df_region_hist['DATE'] = df_region_hist['TIME'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

df_region_hist['TIMESTAMP'] = df_region_hist['TIME'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).time())

In [7]:
## Extract Hour and Minute, and round to nearest 10 minute
df_region_hist['HOUR'] = df_region_hist['TIMESTAMP'].map(lambda x: x.hour)
df_region_hist['MINUTE'] = df_region_hist['TIMESTAMP'].map(lambda x: x.minute - x.minute % 10)
df_region_hist['MONTH'] = df_region_hist['DATE'].map(lambda x: x.month)
df_region_hist['YEAR'] = df_region_hist['DATE'].map(lambda x: x.year)
df_region_hist['WEEKDAY'] = df_region_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively

def traffictime(row):
    traffictimestamp = dt.datetime(100, 1, 1, row['HOUR'], row['MINUTE'], 0)
    return traffictimestamp.time()
    
df_region_hist['TIMESTAMP'] = df_region_hist.apply(traffictime, axis=1)


df_region_hist.head(10)


Out[7]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID DATE TIMESTAMP HOUR MINUTE MONTH YEAR WEEKDAY
0 01/31/2015 11:50:26 PM 1 9 135 26.59 bf2f4db8f620658c9a084918c964883479c594f9 2015-01-31 23:50:00 23 50 1 2015 5
1 01/31/2015 11:50:26 PM 1 19 257 21.14 75455d6eb934e4ef4ba965e3a3f913e5ba1c16fe 2015-01-31 23:50:00 23 50 1 2015 5
2 01/31/2015 11:50:26 PM 1 29 444 20.45 46044a944c48f049a9d909f1d7df5c772f10b8e3 2015-01-31 23:50:00 23 50 1 2015 5
3 01/31/2015 11:50:26 PM 1 21 314 22.50 e8f27a9bafc8eb535a059d689f6060602228c5d3 2015-01-31 23:50:00 23 50 1 2015 5
4 01/31/2015 11:50:26 PM 1 23 403 21.00 57bcf466f8ad54753b84f678f95a514700df0868 2015-01-31 23:50:00 23 50 1 2015 5
5 01/31/2015 11:50:26 PM 1 20 292 19.77 c48fb82bf84357e94c601c477dfb5cc627f93cee 2015-01-31 23:50:00 23 50 1 2015 5
6 01/31/2015 11:50:26 PM 1 4 41 0.00 4c21e0259230596f2e9f8c73ecefba5e18be4939 2015-01-31 23:50:00 23 50 1 2015 5
7 01/31/2015 11:50:26 PM 1 7 93 25.91 550563ab7a2c3050e53754191fe93c14d91c71b0 2015-01-31 23:50:00 23 50 1 2015 5
8 01/31/2015 11:50:26 PM 1 31 410 21.14 cd6b1fa1f3957103bdf93e8190b0f4c453c1a877 2015-01-31 23:50:00 23 50 1 2015 5
9 01/31/2015 11:50:26 PM 1 17 277 19.36 0a9e97fafc9bcd4d40b9f100240ac43301fb9396 2015-01-31 23:50:00 23 50 1 2015 5

In [10]:
## Filter out data with no GPS data and look at records with and without midnight timestamps
df_region_hist = df_region_hist.rename(columns = {'NUMBER OF READS                      ':'NUMBER OF READS'})
df_region_hist = df_region_hist[(df_region_hist['NUMBER OF READS'] > 0)]


## Filter out data with a midnight timestamp because we know it is primarily duplicate values
df_region_nonmidnight = df_region_hist[(df_region_hist['TIMESTAMP'] < dt.time(23, 50, 00))]

## Filter for only January 2015
df_region_nonmidnight = df_region_nonmidnight[(df_region_nonmidnight['MONTH'] == 1) & (df_region_nonmidnight['YEAR'] == 2015)]

In [11]:
## Remove unnecessary columns (TIME, ID, BUS COUNT, NUMBER OF READS)
if 'TIME' in df_region_nonmidnight.columns:
    df_region_nonmidnight = df_region_nonmidnight.drop(['TIME'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['ID'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['BUS COUNT'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['NUMBER OF READS'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['HOUR'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['MINUTE'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['YEAR'],1)
    df_region_nonmidnight = df_region_nonmidnight.drop(['MONTH'],1)
    
df_region_nonmidnight.head(5)
#df_region_nonmidnight.shape


Out[11]:
REGION_ID SPEED DATE TIMESTAMP WEEKDAY
4205 1 21.48 2015-01-31 23:40:00 5
4206 2 22.23 2015-01-31 23:40:00 5
4207 3 20.45 2015-01-31 23:40:00 5
4208 4 21.07 2015-01-31 23:40:00 5
4209 5 23.86 2015-01-31 23:40:00 5

Join with current data to get location


In [12]:
## Remove unnecessary columns (DESCRIPTION, LAST_UPDATED)
if 'DESCRIPTION' in df_region_current.columns:
    df_region_current = df_region_current.drop(['DESCRIPTION'],1)
    df_region_current = df_region_current.drop(['LAST_UPDATED'],1)
    
## Take average Lat Long for each region
df_region_current['AvgLong'] = (df_region_current['WEST']+df_region_current['EAST'])/2
df_region_current['AvgLat'] = (df_region_current['NORTH']+df_region_current['SOUTH'])/2

df_region_current.head(2)
#df_region_current.shape


Out[12]:
REGION REGION_ID WEST EAST SOUTH NORTH CURRENT_SPEED AvgLong AvgLat
0 Rogers Park - West Ridge 1 -87.709645 -87.654561 41.997946 42.026444 25.23 -87.682103 42.012195
1 Far North West 2 -87.846210 -87.747456 41.960669 42.019100 30.68 -87.796833 41.989884

In [13]:
df_region_all_Jan2015=pd.merge(left=df_region_nonmidnight,right=df_region_current, how='left', left_on='REGION_ID', right_on='REGION_ID')
df_region_all_Jan2015 = df_region_all_Jan2015[['REGION_ID','SPEED','TIMESTAMP','DATE','WEEKDAY','REGION','AvgLong','AvgLat']]
df_region_all_Jan2015.head(5)


Out[13]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat
0 1 21.48 23:40:00 2015-01-31 5 Rogers Park - West Ridge -87.682103 42.012195
1 2 22.23 23:40:00 2015-01-31 5 Far North West -87.796833 41.989884
2 3 20.45 23:40:00 2015-01-31 5 North Park-Albany-Linconl Sq -87.711023 41.979308
3 4 21.07 23:40:00 2015-01-31 5 Edge Water-Uptown -87.660514 41.979308
4 5 23.86 23:40:00 2015-01-31 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969

Load Chicago Bulls Game Data


In [27]:
##### Merge with event data
from datetime import datetime
event_data=getData('BullsGameData.csv')

##Extract game's date from the raw data
format = '%m/%d/%Y'
event_data['DATE'] = event_data['Date'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

## Filter for only January 2015
event_data['MONTH'] = event_data['DATE'].map(lambda x: x.month)
event_data['YEAR'] = event_data['DATE'].map(lambda x: x.year)
event_data = event_data[(event_data['MONTH'] == 1) & (event_data['YEAR'] == 2015)]

## Filter out null timestamps
event_data = event_data[pd.notnull(event_data['Time'])]


event_data.head(25)


Out[27]:
GameNum Date Time AwayFlag Opponent Result OT Lat Long DATE MONTH YEAR
55 33 1/1/2015 8:00p EST NaN Denver Nuggets W NaN 41.880556 -87.674167 2015-01-01 1 2015
56 34 1/3/2015 8:00p EST NaN Boston Celtics W OT 41.880556 -87.674167 2015-01-03 1 2015
57 35 1/5/2015 8:00p EST NaN Houston Rockets W NaN 41.880556 -87.674167 2015-01-05 1 2015
58 36 1/7/2015 8:00p EST NaN Utah Jazz L NaN 41.880556 -87.674167 2015-01-07 1 2015
59 38 1/10/2015 8:00p EST NaN Milwaukee Bucks W NaN 41.880556 -87.674167 2015-01-10 1 2015
60 39 1/12/2015 8:00p EST NaN Orlando Magic L NaN 41.880556 -87.674167 2015-01-12 1 2015
61 40 1/14/2015 8:00p EST NaN Washington Wizards L NaN 41.880556 -87.674167 2015-01-14 1 2015
62 42 1/17/2015 8:00p EST NaN Atlanta Hawks L NaN 41.880556 -87.674167 2015-01-17 1 2015
63 44 1/22/2015 8:00p EST NaN San Antonio Spurs W NaN 41.880556 -87.674167 2015-01-22 1 2015
64 46 1/25/2015 1:00p EST NaN Miami Heat L NaN 41.880556 -87.674167 2015-01-25 1 2015

In [28]:
##Extract game's start time from the raw data
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', '')) #strptime doesn't work for EST

format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).time())



from datetime import timedelta
def addHours(tm, numhour, numminutes):
    fulldate = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    fulldate += timedelta(hours=numhour)
    fulldate += timedelta(minutes=numminutes)
    return fulldate.time()


## We need to convert EST to CST
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))


## Basketball game end time = 2:30 after start time with no OT. Each OT adds 15 minutes
## Add 1 hour pre- and post-game buffer
# Define game's start and end time

def overtime(row):
    if row['OT'] == 'OT':
        return addHours(row['EndTime'], 0, 15)
    elif row['OT'] == '2OT':
        return addHours(row['EndTime'], 0, 30)
    elif row['OT'] == '3OT':
        return addHours(row['EndTime'], 0, 45)
    elif row['OT'] == '4OT':
        return addHours(row['EndTime'], 1, 0)
    else:
        return row['EndTime']

event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data.apply(overtime, axis=1)
    
    
## Get rid of unnecessary columns
if 'Time' in event_data.columns:
    event_data.drop('Time', axis=1, inplace=True)
    event_data.drop('AwayFlag', axis=1, inplace=True)
    event_data.drop('Date', axis=1, inplace=True)
    event_data.drop('GameNum', axis=1, inplace=True)
    event_data.drop('Opponent', axis=1, inplace=True)
    event_data.drop('MONTH', axis=1, inplace=True)
    event_data.drop('YEAR', axis=1, inplace=True)
    
    
event_data.head()


Out[28]:
Result OT Lat Long DATE GameTime StartTime EndTime
55 W NaN 41.880556 -87.674167 2015-01-01 19:00:00 18:00:00 21:30:00
56 W OT 41.880556 -87.674167 2015-01-03 19:00:00 18:00:00 21:45:00
57 W NaN 41.880556 -87.674167 2015-01-05 19:00:00 18:00:00 21:30:00
58 L NaN 41.880556 -87.674167 2015-01-07 19:00:00 18:00:00 21:30:00
59 W NaN 41.880556 -87.674167 2015-01-10 19:00:00 18:00:00 21:30:00

Merge Traffic with Event Data


In [29]:
df_region_Jan2015_event=pd.merge(left=df_region_all_Jan2015,right=event_data, how='left', left_on='DATE', right_on='DATE')
df_region_Jan2015_event.head()


Out[29]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat Result OT Lat Long GameTime StartTime EndTime
0 1 21.48 23:40:00 2015-01-31 5 Rogers Park - West Ridge -87.682103 42.012195 NaN NaN NaN NaN NaN NaN NaN
1 2 22.23 23:40:00 2015-01-31 5 Far North West -87.796833 41.989884 NaN NaN NaN NaN NaN NaN NaN
2 3 20.45 23:40:00 2015-01-31 5 North Park-Albany-Linconl Sq -87.711023 41.979308 NaN NaN NaN NaN NaN NaN NaN
3 4 21.07 23:40:00 2015-01-31 5 Edge Water-Uptown -87.660514 41.979308 NaN NaN NaN NaN NaN NaN NaN
4 5 23.86 23:40:00 2015-01-31 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969 NaN NaN NaN NaN NaN NaN NaN

In [30]:
##Add HasGame flag if game is happening during this time, and distance to the event

def hasgame(row):
    if pd.isnull(row['StartTime']):
        return 0
    elif (row['StartTime']<=row['TIMESTAMP']) & (row['EndTime']>=row['TIMESTAMP']):
        return 1
    else:
        return 0
    
import math
def eventdistance(row):    ## calculates event distance in miles
    if row['HasGame'] == 0:
        return None
    else:
        distance = (3959 * math.acos(math.cos(math.radians(row['Lat'])) * math.cos(math.radians(row['AvgLat']))*math.cos(math.radians(row['AvgLong']) - math.radians(row['Long'])) + math.sin(math.radians(row['Lat'])) * math.sin(math.radians(row['AvgLat'])))) 
        return distance
        
df_region_Jan2015_event['HasGame'] = df_region_Jan2015_event.apply(hasgame, axis=1)
df_region_Jan2015_event['EventDistance'] = df_region_Jan2015_event.apply(eventdistance, axis=1)
df_region_Jan2015_event['POSTEDSPEED']=30  # default Posted Speed: same for all of Chicago

game_data = df_region_Jan2015_event[(df_region_Jan2015_event['DATE'] == dt.date(2015, 01, 03)) & (df_region_Jan2015_event['TIMESTAMP'] == dt.time(20, 30, 00))]
game_data.head()


Out[30]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat Result OT Lat Long GameTime StartTime EndTime HasGame EventDistance POSTEDSPEED
116307 1 19.77 20:30:00 2015-01-03 5 Rogers Park - West Ridge -87.682103 42.012195 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 9.105076 30
116308 2 25.91 20:30:00 2015-01-03 5 Far North West -87.796833 41.989884 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 9.839918 30
116309 3 23.18 20:30:00 2015-01-03 5 North Park-Albany-Linconl Sq -87.711023 41.979308 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 7.081638 30
116310 4 20.45 20:30:00 2015-01-03 5 Edge Water-Uptown -87.660514 41.979308 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 6.859490 30
116311 5 23.86 20:30:00 2015-01-03 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 7.343467 30

Export Data Set


In [31]:
df_region_Jan2015_event.to_csv("Jan2015_TimeSeries.csv")

Create sample data for John


In [4]:
def getData(filename):
    df=pd.read_csv(filename,skipinitialspace=True)
    return df

df_formatted_data=getData('Jan2015_TimeSeries.csv')

df_formatted_data.head(5)


Out[4]:
Unnamed: 0 REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat Result OT Lat Long GameTime StartTime EndTime HasGame EventDistance POSTEDSPEED
0 0 1 21.48 23:40:00 2015-01-31 5 Rogers Park - West Ridge -87.682103 42.012195 NaN NaN NaN NaN NaN NaN NaN 0 NaN 30
1 1 2 22.23 23:40:00 2015-01-31 5 Far North West -87.796833 41.989884 NaN NaN NaN NaN NaN NaN NaN 0 NaN 30
2 2 3 20.45 23:40:00 2015-01-31 5 North Park-Albany-Linconl Sq -87.711023 41.979308 NaN NaN NaN NaN NaN NaN NaN 0 NaN 30
3 3 4 21.07 23:40:00 2015-01-31 5 Edge Water-Uptown -87.660514 41.979308 NaN NaN NaN NaN NaN NaN NaN 0 NaN 30
4 4 5 23.86 23:40:00 2015-01-31 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969 NaN NaN NaN NaN NaN NaN NaN 0 NaN 30

In [8]:
## Combine Date and Time into DateTime
import datetime as dt

def getdatetime(row):
    return datetime(row['DATE'].year, row['DATE'].month, row['DATE'].day, row['TIMESTAMP'].hour, row['TIMESTAMP'].minute, row['TIMESTAMP'].second)

## Filter for only one game day
df_formatted_data = df_formatted_data[(df_formatted_data['DATE'] == dt.date(2015, 01, 03))]

## Calculate the required fields
df_datasample = df_formatted_data[['TIMESTAMP','REGION_ID','SPEED']]
df_datasample['Congestion'] = df_datasample['SPEED'].map(lambda x: 30 - x)
df_datasample['DateTime'] = df_datasample.apply(getdatetime, axis=1)

## Get rid of unnecessary columns
if 'SPEED' in df_datasample.columns:
    df_datasample.drop('SPEED', axis=1, inplace=True)
    

df_datasample.head(5)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-268fb3d4cff9> in <module>()
     11 df_datasample = df_formatted_data[['TIMESTAMP','REGION_ID','SPEED']]
     12 df_datasample['Congestion'] = df_datasample['SPEED'].map(lambda x: 30 - x)
---> 13 df_datasample['DateTime'] = df_datasample.apply(overtime, axis=1)
     14 
     15 ## Get rid of unnecessary columns

NameError: name 'overtime' is not defined

In [ ]: