Getting Chicago Data


In [1]:
import time
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gmaps



Historical Data by Segment (Daily)


In [2]:
## GET SEGMENT DATA

def getData(filename):
    df = pd.read_csv(filename, skipinitialspace=True)
    return df

df_segment_hist = getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Segment.csv')

df_segment_current = getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Segments.csv')

%matplotlib inline
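
The historical extracts are large (the regional file alone runs past three million rows). If memory is tight, pd.read_csv can load just the columns used downstream; a sketch, assuming these header names match the CSV exactly:

## Sketch: read only the needed columns to cut memory
cols = ['TIME', 'SEGMENTID', 'SPEED']
df_small = pd.read_csv('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Segment.csv',
                       skipinitialspace=True, usecols=cols)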

In [3]:
df_segment_hist.head(2)


Out[3]:
TIME SEGMENTID BUS COUNT MESSAGE COUNT SPEED ID
0 12/30/2014 11:50:32 PM 1 0 0 -1 3a16f0807c8988ed7e6f7e536e05add4448c7acd
1 12/30/2014 11:50:32 PM 1 0 0 -1 3348553679debbce1bb653190f2a976892fcc078

In [4]:
import datetime as dt

## TRANSLATE TIME STAMP INTO DATE
format = '%m/%d/%Y %I:%M:%S %p'
df_segment_hist['DATE'] = df_segment_hist['TIME'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

df_segment_hist['WEEKDAY'] = df_segment_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively

df_segment_hist.head(2)


Out[4]:
TIME SEGMENTID BUS COUNT MESSAGE COUNT SPEED ID DATE WEEKDAY
0 12/30/2014 11:50:32 PM 1 0 0 -1 3a16f0807c8988ed7e6f7e536e05add4448c7acd 2014-12-30 1
1 12/30/2014 11:50:32 PM 1 0 0 -1 3348553679debbce1bb653190f2a976892fcc078 2014-12-30 1
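
Row-by-row strptime is slow at this scale. A vectorized sketch of the same parsing, using pd.to_datetime with the format string above:

## Sketch: parse TIME once, then derive DATE and WEEKDAY from the parsed column
parsed = pd.to_datetime(df_segment_hist['TIME'], format='%m/%d/%Y %I:%M:%S %p')
df_segment_hist['DATE'] = parsed.dt.date
df_segment_hist['WEEKDAY'] = parsed.dt.weekday   # Monday=0 ... Sunday=6, same as .weekday()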

In [5]:
## Keep only records with a real speed estimate (SPEED is -1 or 0 when no GPS estimate is available)

df_segment_gzero = df_segment_hist[(df_segment_hist['SPEED'] > 0)]

df_segment_gzero.head(2)


Out[5]:
TIME SEGMENTID BUS COUNT MESSAGE COUNT SPEED ID DATE WEEKDAY
2 12/30/2014 11:50:32 PM 1 1 2 25 438597c45a69b452c41cd94eb822f5ea98574ced 2014-12-30 1
3 12/30/2014 11:50:32 PM 1 1 6 25 737422daecadf090fd40901146053c442a970454 2014-12-30 1

In [22]:
print "max:", max(df_segment_gzero['DATE']), " min:", min(df_segment_gzero['DATE'])

print df_segment_gzero['DATE'].unique().shape

datacoverage = df_segment_gzero.groupby('DATE').SEGMENTID.nunique()
datacoverage.to_csv("SegmentDataCoverage.csv")


max: 2014-12-30  min: 2013-01-15
(81,)
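
Only 81 distinct dates appear between 2013-01-15 and 2014-12-30, so daily coverage is sparse. A sketch that lists the uncovered calendar days:

## Sketch: enumerate calendar days with no positive-speed readings
all_days = pd.date_range('2013-01-15', '2014-12-30').date
covered = set(df_segment_gzero['DATE'])
missing = [d for d in all_days if d not in covered]
print len(missing)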

In [6]:
## Remove unnecessary columns (TIME, ID, BUS COUNT, MESSAGE COUNT)
if 'TIME' in df_segment_gzero.columns:
    ## Note: the raw CSV header pads 'BUS COUNT' with trailing spaces
    df_segment_gzero = df_segment_gzero.drop(
        ['TIME', 'ID', 'BUS COUNT                ', 'MESSAGE COUNT'], axis=1)
    
df_segment_gzero.head(5)


Out[6]:
SEGMENTID SPEED DATE WEEKDAY
2 1 25 2014-12-30 1
3 1 25 2014-12-30 1
7 1 23 2014-12-30 1
8 1 27 2014-12-30 1
9 1 26 2014-12-30 1

In [7]:
## Keep only SEGMENTID, DIRECTION, and the endpoint coordinates from the Current file
if 'STREET' in df_segment_current.columns:
    df_segment_current = df_segment_current.drop(
        ['STREET', 'FROM_STREET', 'TO_STREET', 'LENGTH', 'STREET_HEADING',
         'COMMENTS', 'CURRENT_SPEED', 'LAST_UPDATED'], axis=1)
    
## Take the midpoint Lat/Long of each segment's endpoints
df_segment_current['AvgLong'] = (df_segment_current['START_LONGITUDE']+df_segment_current['END_LONGITUDE'])/2
df_segment_current['AvgLat'] = (df_segment_current['START_LATITUDE']+df_segment_current['END_LATITUDE'])/2

df_segment_current.head(2)
#df_segment_current.shape


Out[7]:
SEGMENTID DIRECTION START_LONGITUDE START_LATITUDE END_LONGITUDE END_LATITUDE AvgLong AvgLat
0 1 EB -87.723160 41.793067 -87.713607 41.793141 -87.718384 41.793104
1 10 EB -87.631128 41.794397 -87.625703 41.794506 -87.628415 41.794452

In [8]:
## Group segment data by day and take speed statistics (mean, std, min, max, median)
segmentid_groupby = df_segment_gzero.groupby(['SEGMENTID', 'DATE', 'WEEKDAY'], as_index=False)
segment_speed = segmentid_groupby.agg({"SPEED": [np.mean, np.std, np.min, np.max, np.median]})
#segment_speed.head()

df_segment_all = pd.merge(left=segment_speed, right=df_segment_current, how='left',
                          left_on='SEGMENTID', right_on='SEGMENTID')
df_segment_all.columns = ['SEGMENTID', 'SEGMENTID2', 'DATE', 'WEEKDAY', 'SPEED', 'SPEED-SD',
                          'SPEED-MIN', 'SPEED-MAX', 'SPEED-MED', 'DIRECTION', 'START_LONGITUDE',
                          'START_LATITUDE', 'END_LONGITUDE', 'END_LATITUDE', 'AvgLong', 'AvgLat']
df_segment_all = df_segment_all[['SEGMENTID', 'SPEED', 'SPEED-SD', 'SPEED-MIN', 'SPEED-MAX',
                                 'SPEED-MED', 'DATE', 'WEEKDAY', 'DIRECTION', 'AvgLong', 'AvgLat']]
df_segment_all.head()
df_segment_all.head()


Out[8]:
SEGMENTID SPEED SPEED-SD SPEED-MIN SPEED-MAX SPEED-MED DATE WEEKDAY DIRECTION AvgLong AvgLat
0 1 23.322581 4.762081 5 35 24 2013-01-15 1 EB -87.718384 41.793104
1 1 22.206522 4.205015 11 33 22 2013-01-16 2 EB -87.718384 41.793104
2 1 23.701149 4.289292 14 34 25 2013-01-17 3 EB -87.718384 41.793104
3 1 22.205479 4.166530 12 35 23 2013-01-18 4 EB -87.718384 41.793104
4 1 24.257143 4.598947 11 34 25 2013-01-19 5 EB -87.718384 41.793104
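
Hand-listing all 16 merged column names is brittle if the aggregation list ever changes. An alternative sketch that flattens the MultiIndex that agg produces, instead of renaming by position:

## Sketch: flatten ('SPEED', 'mean') -> 'SPEED_mean' rather than hard-coding names
segment_speed.columns = ['_'.join(col).strip('_') for col in segment_speed.columns.values]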

Load Chicago Bulls and Bears Game Data


In [34]:
## Get Chicago Bulls data
from datetime import datetime
bulls_data=getData('BullsGameData.csv')

## Extract game's date from the raw data
format = '%m/%d/%Y'
bulls_data['DATE'] = bulls_data['Date'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

## Drop games with no listed start time
bulls_data = bulls_data[pd.notnull(bulls_data['Time'])]

## Add GameType flag
bulls_data['GameType'] = 'Bulls'

## Take only required columns
bulls_data = bulls_data[['Date','Time','Result','OT','Lat','Long','DATE','GameType']]

bulls_data.head()


Out[34]:
Date Time Result OT Lat Long DATE GameType
0 1/1/2015 8:00p EST W NaN 41.880556 -87.674167 2015-01-01 Bulls
1 1/1/2016 8:00p EST W NaN 41.880556 -87.674167 2016-01-01 Bulls
2 1/10/2015 8:00p EST W NaN 41.880556 -87.674167 2015-01-10 Bulls
4 1/11/2016 8:00p EST L NaN 41.880556 -87.674167 2016-01-11 Bulls
5 1/12/2015 8:00p EST L NaN 41.880556 -87.674167 2015-01-12 Bulls

In [57]:
## Get Chicago Bears game data
from datetime import datetime
bears_data=getData('BearsGameData.csv')

## Extract game's date from the raw data
format = '%m/%d/%Y'
bears_data['DATE'] = bears_data['Date'].apply(lambda x: 
                                   dt.datetime.strptime(x,format).date())

## Code all games with the regular 1:00 PM kickoff (actual start times are not in this data)
bears_data['Time'] = '1:00p EST'

## Add GameType flag
bears_data['GameType'] = 'Bears'

## Take only required columns
bears_data = bears_data[['Date','Time','Result','OT','Lat','Long','DATE','GameType']]

bears_data.head()


Out[57]:
Date Time Result OT Lat Long DATE GameType
0 9/8/2013 1:00p EST W NaN 41.8625 -87.616667 2013-09-08 Bears
1 9/15/2013 1:00p EST W NaN 41.8625 -87.616667 2013-09-15 Bears
2 10/6/2013 1:00p EST L NaN 41.8625 -87.616667 2013-10-06 Bears
3 10/10/2013 1:00p EST W NaN 41.8625 -87.616667 2013-10-10 Bears
4 11/10/2013 1:00p EST L NaN 41.8625 -87.616667 2013-11-10 Bears

In [58]:
## Merge Bulls and Bears data
event_data = pd.concat([bears_data,bulls_data])

print "Bulls: ", bulls_data.shape
print "Bears: ", bears_data.shape
print "Merge: ", event_data.shape


Bulls:  (123, 8)
Bears:  (56, 8)
Merge:  (179, 8)
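
A quick sanity check (sketch) that the concatenation preserved every row from both sources:

assert event_data.shape[0] == bulls_data.shape[0] + bears_data.shape[0]   # 123 + 56 = 179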

In [59]:
from datetime import datetime, timedelta

## Extract the game's start time from the raw data
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', ''))  # strptime's %Z does not reliably parse 'EST', so strip it

format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).time())



def addHours(tm, numhour, numminutes):
    ## Attach a dummy date so timedelta arithmetic works on a bare time
    fulldate = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    fulldate += timedelta(hours=numhour)
    fulldate += timedelta(minutes=numminutes)
    return fulldate.time()


## We need to convert EST to CST
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))


## Game end time = 2:30 after start time with no OT; each OT adds 15 minutes
## Open the window 1 hour before the start for pre-game traffic
# Define game's start and end time

def overtime(row):
    if row['OT'] == 'OT':
        return addHours(row['EndTime'], 0, 15)
    elif row['OT'] == '2OT':
        return addHours(row['EndTime'], 0, 30)
    elif row['OT'] == '3OT':
        return addHours(row['EndTime'], 0, 45)
    elif row['OT'] == '4OT':
        return addHours(row['EndTime'], 1, 0)
    else:
        return row['EndTime']

event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data.apply(overtime, axis=1)
    
    
## Get rid of unnecessary columns
if 'Time' in event_data.columns:
    event_data.drop('Time', axis=1, inplace=True)
    event_data.drop('Date', axis=1, inplace=True)
    
event_data.head()


Out[59]:
Result OT Lat Long DATE GameType GameTime StartTime EndTime
0 W NaN 41.8625 -87.616667 2013-09-08 Bears 12:00:00 11:00:00 14:30:00
1 W NaN 41.8625 -87.616667 2013-09-15 Bears 12:00:00 11:00:00 14:30:00
2 L NaN 41.8625 -87.616667 2013-10-06 Bears 12:00:00 11:00:00 14:30:00
3 W NaN 41.8625 -87.616667 2013-10-10 Bears 12:00:00 11:00:00 14:30:00
4 L NaN 41.8625 -87.616667 2013-11-10 Bears 12:00:00 11:00:00 14:30:00
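
As a worked check of the window arithmetic (a sketch using the addHours helper above): an 8:00p EST tip-off becomes 7:00 PM CST, so the window opens at 18:00 and, with one overtime, closes at 21:45 (2:30 plus 15 minutes):

tip = dt.datetime.strptime('8:00PM', '%I:%M%p').time()
tip_cst = addHours(tip, -1, 0)        # EST -> CST: 19:00
print addHours(tip_cst, -1, 0)        # 18:00:00, window start
print addHours(tip_cst, 2, 45)        # 21:45:00, window end with one OT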

Merge Traffic with Segment Event Data


In [60]:
df_segment_event=pd.merge(left=df_segment_all,right=event_data, how='left', left_on='DATE', right_on='DATE')
df_segment_event.head()


Out[60]:
SEGMENTID SPEED SPEED-SD SPEED-MIN SPEED-MAX SPEED-MED DATE WEEKDAY DIRECTION AvgLong AvgLat Result OT Lat Long GameType GameTime StartTime EndTime
0 1 23.322581 4.762081 5 35 24 2013-01-15 1 EB -87.718384 41.793104 NaN NaN NaN NaN NaN NaN NaN NaN
1 1 22.206522 4.205015 11 33 22 2013-01-16 2 EB -87.718384 41.793104 NaN NaN NaN NaN NaN NaN NaN NaN
2 1 23.701149 4.289292 14 34 25 2013-01-17 3 EB -87.718384 41.793104 NaN NaN NaN NaN NaN NaN NaN NaN
3 1 22.205479 4.166530 12 35 23 2013-01-18 4 EB -87.718384 41.793104 NaN NaN NaN NaN NaN NaN NaN NaN
4 1 24.257143 4.598947 11 34 25 2013-01-19 5 EB -87.718384 41.793104 L OT 41.880556 -87.616667 Bulls 18:00:00 17:00:00 20:45:00

In [61]:
## Flag days on which a game took place (daily segment data, so no time-of-day check), and compute distance to the event

def hasgame(row):
    if pd.isnull(row['StartTime']):
        return 0
    else:
        return 1
    
import math
def eventdistance(row):    ## great-circle distance to the event in miles (spherical law of cosines)
    if row['HasGame'] == 0:
        return None
    else:
        ## 3959 = Earth's radius in miles
        distance = 3959 * math.acos(
            math.cos(math.radians(row['Lat'])) * math.cos(math.radians(row['AvgLat']))
            * math.cos(math.radians(row['AvgLong']) - math.radians(row['Long']))
            + math.sin(math.radians(row['Lat'])) * math.sin(math.radians(row['AvgLat'])))
        return distance
        
df_segment_event['HasGame'] = df_segment_event.apply(hasgame, axis=1)
df_segment_event['EventDistance'] = df_segment_event.apply(eventdistance, axis=1)
df_segment_event['POSTEDSPEED']=30  # default Posted Speed: same for all of Chicago

df_segment_event.head()
game_data = df_segment_event[(df_segment_event['HasGame'] == 1)]
game_data.head()


Out[61]:
SEGMENTID SPEED SPEED-SD SPEED-MIN SPEED-MAX SPEED-MED DATE WEEKDAY DIRECTION AvgLong ... OT Lat Long GameType GameTime StartTime EndTime HasGame EventDistance POSTEDSPEED
4 1 24.257143 4.598947 11 34 25 2013-01-19 5 EB -87.718384 ... OT 41.880556 -87.616667 Bulls 18:00:00 17:00:00 20:45:00 1 7.995952 30
6 1 24.164948 3.724022 16 35 25 2013-01-21 0 EB -87.718384 ... NaN 41.880556 -87.616667 Bulls 19:30:00 18:30:00 22:00:00 1 7.995952 30
8 1 22.787879 3.946770 11 35 23 2013-01-23 2 EB -87.718384 ... NaN 41.880556 -87.616667 Bulls 18:00:00 17:00:00 20:30:00 1 7.995952 30
10 1 22.371134 4.837612 10 33 23 2013-01-25 4 EB -87.718384 ... NaN 41.880556 -87.616667 Bulls 18:00:00 17:00:00 20:30:00 1 7.995952 30
13 1 22.744444 4.315015 5 34 23 2013-01-28 0 EB -87.718384 ... NaN 41.880556 -87.616667 Bulls 18:00:00 17:00:00 20:30:00 1 7.995952 30

5 rows × 22 columns
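
The EventDistance column can be spot-checked by feeding eventdistance a hand-built row with the coordinates from the first row of the table above:

## Spot check (sketch): reproduces EventDistance for segment 1 on 2013-01-19
row = {'HasGame': 1, 'Lat': 41.880556, 'Long': -87.616667,
       'AvgLat': 41.793104, 'AvgLong': -87.718384}
print eventdistance(row)   # ~7.996 miles, matching the table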


In [62]:
# Export data set
df_segment_event.to_csv("df_segment_event_TimeSeries.csv")

Historical Data by Region


In [23]:
## GET REGIONAL DATA

def getData(filename):
    df = pd.read_csv(filename, skipinitialspace=True)
    return df

df_region_hist = getData('Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region.csv')

df_region_current = getData('Chicago_Traffic_Tracker_-_Congestion_Estimates_by_Regions.csv')

%matplotlib inline

In [3]:
df_region_hist.head(2)


Out[3]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID
0 01/31/2015 11:50:26 PM 1 9 135 26.59 bf2f4db8f620658c9a084918c964883479c594f9
1 01/31/2015 11:50:26 PM 1 19 257 21.14 75455d6eb934e4ef4ba965e3a3f913e5ba1c16fe

In [4]:
df_region_hist.tail(2)


Out[4]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID
3179964 01/18/2013 11:50:26 PM 29 1 14 0.00 90c77f407f063ac107dc3396e429e9da4a080944
3179965 01/18/2013 11:50:26 PM 29 97 880 23.18 6ed9793b50e023386e7ef33effeb8a0a5fcb870b

In [24]:
import datetime as dt
format = '%m/%d/%Y %I:%M:%S %p'
df_region_hist['DATE'] = df_region_hist['TIME'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

df_region_hist['TIMESTAMP'] = df_region_hist['TIME'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).time())

In [26]:
## Filter out data with a midnight timestamp because we know it is primarily duplicate values
df_region_nonmidnight = df_region_hist[(df_region_hist['TIMESTAMP'] < dt.time(23, 50, 00))]


print "max:", max(df_region_nonmidnight['DATE']), " min:", min(df_region_nonmidnight['DATE'])

print df_region_nonmidnight['DATE'].unique().shape

datacoveragereg = df_region_nonmidnight.groupby('DATE').REGION_ID.nunique()
datacoveragereg.to_csv("RegionDataCoverage.csv")


max: 2015-01-31  min: 2015-01-01
(31,)

In [6]:
## Extract Hour and Minute, flooring minutes to the start of each 10-minute interval
df_region_hist['HOUR'] = df_region_hist['TIMESTAMP'].map(lambda x: x.hour)
df_region_hist['MINUTE'] = df_region_hist['TIMESTAMP'].map(lambda x: x.minute - x.minute % 10)
df_region_hist['MONTH'] = df_region_hist['DATE'].map(lambda x: x.month)
df_region_hist['YEAR'] = df_region_hist['DATE'].map(lambda x: x.year)
df_region_hist['WEEKDAY'] = df_region_hist['DATE'].map(lambda x: x.weekday()) ## Sat and Sun are 5 and 6 respectively

def traffictime(row):
    ## Rebuild a clean time of day from the floored HOUR/MINUTE (dummy date, seconds zeroed)
    traffictimestamp = dt.datetime(100, 1, 1, row['HOUR'], row['MINUTE'], 0)
    return traffictimestamp.time()
    
df_region_hist['TIMESTAMP'] = df_region_hist.apply(traffictime, axis=1)


df_region_hist.head(10)


Out[6]:
TIME REGION_ID BUS COUNT NUMBER OF READS SPEED ID DATE TIMESTAMP HOUR MINUTE MONTH YEAR WEEKDAY
0 01/31/2015 11:50:26 PM 1 9 135 26.59 bf2f4db8f620658c9a084918c964883479c594f9 2015-01-31 23:50:00 23 50 1 2015 5
1 01/31/2015 11:50:26 PM 1 19 257 21.14 75455d6eb934e4ef4ba965e3a3f913e5ba1c16fe 2015-01-31 23:50:00 23 50 1 2015 5
2 01/31/2015 11:50:26 PM 1 29 444 20.45 46044a944c48f049a9d909f1d7df5c772f10b8e3 2015-01-31 23:50:00 23 50 1 2015 5
3 01/31/2015 11:50:26 PM 1 21 314 22.50 e8f27a9bafc8eb535a059d689f6060602228c5d3 2015-01-31 23:50:00 23 50 1 2015 5
4 01/31/2015 11:50:26 PM 1 23 403 21.00 57bcf466f8ad54753b84f678f95a514700df0868 2015-01-31 23:50:00 23 50 1 2015 5
5 01/31/2015 11:50:26 PM 1 20 292 19.77 c48fb82bf84357e94c601c477dfb5cc627f93cee 2015-01-31 23:50:00 23 50 1 2015 5
6 01/31/2015 11:50:26 PM 1 4 41 0.00 4c21e0259230596f2e9f8c73ecefba5e18be4939 2015-01-31 23:50:00 23 50 1 2015 5
7 01/31/2015 11:50:26 PM 1 7 93 25.91 550563ab7a2c3050e53754191fe93c14d91c71b0 2015-01-31 23:50:00 23 50 1 2015 5
8 01/31/2015 11:50:26 PM 1 31 410 21.14 cd6b1fa1f3957103bdf93e8190b0f4c453c1a877 2015-01-31 23:50:00 23 50 1 2015 5
9 01/31/2015 11:50:26 PM 1 17 277 19.36 0a9e97fafc9bcd4d40b9f100240ac43301fb9396 2015-01-31 23:50:00 23 50 1 2015 5
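
On pandas 0.18 or later, the HOUR/MINUTE/traffictime steps above can be collapsed into one vectorized call; a sketch, assuming that pandas version:

## Sketch: floor full timestamps to 10-minute buckets in a single call
ts = pd.to_datetime(df_region_hist['TIME'], format='%m/%d/%Y %I:%M:%S %p')
df_region_hist['TIMESTAMP'] = ts.dt.floor('10min').dt.time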

In [7]:
## Keep only records with at least one GPS read (rename the padded 'NUMBER OF READS' header first)
df_region_hist = df_region_hist.rename(columns = {'NUMBER OF READS                      ':'NUMBER OF READS'})
df_region_hist = df_region_hist[(df_region_hist['NUMBER OF READS'] > 0)]


## Filter out data with a midnight timestamp because we know it is primarily duplicate values
df_region_nonmidnight = df_region_hist[(df_region_hist['TIMESTAMP'] < dt.time(23, 50, 00))]

## Filter for only January 2015
df_region_nonmidnight = df_region_nonmidnight[(df_region_nonmidnight['MONTH'] == 1) & (df_region_nonmidnight['YEAR'] == 2015)]

In [8]:
## Remove columns no longer needed (TIME, ID, BUS COUNT, NUMBER OF READS, plus the helper HOUR/MINUTE/MONTH/YEAR fields)
if 'TIME' in df_region_nonmidnight.columns:
    df_region_nonmidnight = df_region_nonmidnight.drop(
        ['TIME', 'ID', 'BUS COUNT', 'NUMBER OF READS', 'HOUR', 'MINUTE', 'YEAR', 'MONTH'], axis=1)
    
df_region_nonmidnight.head(5)
#df_region_nonmidnight.shape


Out[8]:
REGION_ID SPEED DATE TIMESTAMP WEEKDAY
4205 1 21.48 2015-01-31 23:40:00 5
4206 2 22.23 2015-01-31 23:40:00 5
4207 3 20.45 2015-01-31 23:40:00 5
4208 4 21.07 2015-01-31 23:40:00 5
4209 5 23.86 2015-01-31 23:40:00 5

Join with current data to get location


In [9]:
## Remove unnecessary columns (DESCRIPTION, LAST_UPDATED)
if 'DESCRIPTION' in df_region_current.columns:
    df_region_current = df_region_current.drop(['DESCRIPTION', 'LAST_UPDATED'], axis=1)

## Take the midpoint Lat/Long of each region's bounding box
df_region_current['AvgLong'] = (df_region_current['WEST']+df_region_current['EAST'])/2
df_region_current['AvgLat'] = (df_region_current['NORTH']+df_region_current['SOUTH'])/2

df_region_current.head(2)
#df_region_current.shape


Out[9]:
REGION REGION_ID WEST EAST SOUTH NORTH CURRENT_SPEED AvgLong AvgLat
0 Rogers Park - West Ridge 1 -87.709645 -87.654561 41.997946 42.026444 25.23 -87.682103 42.012195
1 Far North West 2 -87.846210 -87.747456 41.960669 42.019100 30.68 -87.796833 41.989884

In [10]:
df_region_all_Jan2015=pd.merge(left=df_region_nonmidnight,right=df_region_current, how='left', left_on='REGION_ID', right_on='REGION_ID')
df_region_all_Jan2015 = df_region_all_Jan2015[['REGION_ID','SPEED','TIMESTAMP','DATE','WEEKDAY','REGION','AvgLong','AvgLat']]
df_region_all_Jan2015.head(5)


Out[10]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat
0 1 21.48 23:40:00 2015-01-31 5 Rogers Park - West Ridge -87.682103 42.012195
1 2 22.23 23:40:00 2015-01-31 5 Far North West -87.796833 41.989884
2 3 20.45 23:40:00 2015-01-31 5 North Park-Albany-Linconl Sq -87.711023 41.979308
3 4 21.07 23:40:00 2015-01-31 5 Edge Water-Uptown -87.660514 41.979308
4 5 23.86 23:40:00 2015-01-31 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969

Load Chicago Bulls Game Data


In [11]:
## Load Bulls game data to merge with the regional traffic data
from datetime import datetime
event_data=getData('BullsGameData.csv')

##Extract game's date from the raw data
format = '%m/%d/%Y'
event_data['DATE'] = event_data['Date'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).date())

## Filter for only January 2015
event_data['MONTH'] = event_data['DATE'].map(lambda x: x.month)
event_data['YEAR'] = event_data['DATE'].map(lambda x: x.year)
event_data = event_data[(event_data['MONTH'] == 1) & (event_data['YEAR'] == 2015)]

## Drop games with no listed start time
event_data = event_data[pd.notnull(event_data['Time'])]


event_data.head(25)


Out[11]:
GameNum Date Time AwayFlag Opponent Result OT Lat Long DATE MONTH YEAR
55 33 1/1/2015 8:00p EST NaN Denver Nuggets W NaN 41.880556 -87.674167 2015-01-01 1 2015
56 34 1/3/2015 8:00p EST NaN Boston Celtics W OT 41.880556 -87.674167 2015-01-03 1 2015
57 35 1/5/2015 8:00p EST NaN Houston Rockets W NaN 41.880556 -87.674167 2015-01-05 1 2015
58 36 1/7/2015 8:00p EST NaN Utah Jazz L NaN 41.880556 -87.674167 2015-01-07 1 2015
59 38 1/10/2015 8:00p EST NaN Milwaukee Bucks W NaN 41.880556 -87.674167 2015-01-10 1 2015
60 39 1/12/2015 8:00p EST NaN Orlando Magic L NaN 41.880556 -87.674167 2015-01-12 1 2015
61 40 1/14/2015 8:00p EST NaN Washington Wizards L NaN 41.880556 -87.674167 2015-01-14 1 2015
62 42 1/17/2015 8:00p EST NaN Atlanta Hawks L NaN 41.880556 -87.674167 2015-01-17 1 2015
63 44 1/22/2015 8:00p EST NaN San Antonio Spurs W NaN 41.880556 -87.674167 2015-01-22 1 2015
64 46 1/25/2015 1:00p EST NaN Miami Heat L NaN 41.880556 -87.674167 2015-01-25 1 2015

In [12]:
## Extract the game's start time from the raw data
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace('p', 'PM'))
event_data['Time'] = event_data['Time'].map(lambda x: str(x).replace(' EST', ''))  # strptime's %Z does not reliably parse 'EST', so strip it

format = '%I:%M%p'
event_data['GameTime'] = event_data['Time'].apply(lambda x: 
                                    dt.datetime.strptime(x,format).time())



from datetime import timedelta
def addHours(tm, numhour, numminutes):
    ## Attach a dummy date so timedelta arithmetic works on a bare time
    fulldate = datetime(100, 1, 1, tm.hour, tm.minute, tm.second)
    fulldate += timedelta(hours=numhour)
    fulldate += timedelta(minutes=numminutes)
    return fulldate.time()


## We need to convert EST to CST
event_data['GameTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))


## Basketball game end time = 2:30 after start time with no OT; each OT adds 15 minutes
## Open the window 1 hour before tip-off for pre-game traffic
# Define game's start and end time

def overtime(row):
    if row['OT'] == 'OT':
        return addHours(row['EndTime'], 0, 15)
    elif row['OT'] == '2OT':
        return addHours(row['EndTime'], 0, 30)
    elif row['OT'] == '3OT':
        return addHours(row['EndTime'], 0, 45)
    elif row['OT'] == '4OT':
        return addHours(row['EndTime'], 1, 0)
    else:
        return row['EndTime']

event_data['StartTime'] = event_data['GameTime'].map(lambda x: addHours(x, -1, 0))
event_data['EndTime'] = event_data['GameTime'].map(lambda x: addHours(x, 2, 30))
event_data['EndTime'] = event_data.apply(overtime, axis=1)
    
    
## Get rid of unnecessary columns
if 'Time' in event_data.columns:
    event_data.drop('Time', axis=1, inplace=True)
    event_data.drop('AwayFlag', axis=1, inplace=True)
    event_data.drop('Date', axis=1, inplace=True)
    event_data.drop('GameNum', axis=1, inplace=True)
    event_data.drop('Opponent', axis=1, inplace=True)
    event_data.drop('MONTH', axis=1, inplace=True)
    event_data.drop('YEAR', axis=1, inplace=True)
    
    
event_data.head()


Out[12]:
Result OT Lat Long DATE GameTime StartTime EndTime
55 W NaN 41.880556 -87.674167 2015-01-01 19:00:00 18:00:00 21:30:00
56 W OT 41.880556 -87.674167 2015-01-03 19:00:00 18:00:00 21:45:00
57 W NaN 41.880556 -87.674167 2015-01-05 19:00:00 18:00:00 21:30:00
58 L NaN 41.880556 -87.674167 2015-01-07 19:00:00 18:00:00 21:30:00
59 W NaN 41.880556 -87.674167 2015-01-10 19:00:00 18:00:00 21:30:00

Merge Traffic with Event Data


In [13]:
df_region_Jan2015_event=pd.merge(left=df_region_all_Jan2015,right=event_data, how='left', left_on='DATE', right_on='DATE')
df_region_Jan2015_event.head()


Out[13]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat Result OT Lat Long GameTime StartTime EndTime
0 1 21.48 23:40:00 2015-01-31 5 Rogers Park - West Ridge -87.682103 42.012195 NaN NaN NaN NaN NaN NaN NaN
1 2 22.23 23:40:00 2015-01-31 5 Far North West -87.796833 41.989884 NaN NaN NaN NaN NaN NaN NaN
2 3 20.45 23:40:00 2015-01-31 5 North Park-Albany-Linconl Sq -87.711023 41.979308 NaN NaN NaN NaN NaN NaN NaN
3 4 21.07 23:40:00 2015-01-31 5 Edge Water-Uptown -87.660514 41.979308 NaN NaN NaN NaN NaN NaN NaN
4 5 23.86 23:40:00 2015-01-31 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969 NaN NaN NaN NaN NaN NaN NaN

In [14]:
## Add HasGame flag when a game is in progress (within the event window) at this timestamp, and distance to the event

def hasgame(row):
    if pd.isnull(row['StartTime']):
        return 0
    elif (row['StartTime']<=row['TIMESTAMP']) & (row['EndTime']>=row['TIMESTAMP']):
        return 1
    else:
        return 0
    
import math
def eventdistance(row):    ## great-circle distance to the event in miles (spherical law of cosines)
    if row['HasGame'] == 0:
        return None
    else:
        ## 3959 = Earth's radius in miles
        distance = 3959 * math.acos(
            math.cos(math.radians(row['Lat'])) * math.cos(math.radians(row['AvgLat']))
            * math.cos(math.radians(row['AvgLong']) - math.radians(row['Long']))
            + math.sin(math.radians(row['Lat'])) * math.sin(math.radians(row['AvgLat'])))
        return distance
        
df_region_Jan2015_event['HasGame'] = df_region_Jan2015_event.apply(hasgame, axis=1)
df_region_Jan2015_event['EventDistance'] = df_region_Jan2015_event.apply(eventdistance, axis=1)
df_region_Jan2015_event['POSTEDSPEED']=30  # default Posted Speed: same for all of Chicago

game_data = df_region_Jan2015_event[(df_region_Jan2015_event['DATE'] == dt.date(2015, 1, 3)) &
                                    (df_region_Jan2015_event['TIMESTAMP'] == dt.time(20, 30, 0))]
game_data.head()


Out[14]:
REGION_ID SPEED TIMESTAMP DATE WEEKDAY REGION AvgLong AvgLat Result OT Lat Long GameTime StartTime EndTime HasGame EventDistance POSTEDSPEED
116307 1 19.77 20:30:00 2015-01-03 5 Rogers Park - West Ridge -87.682103 42.012195 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 9.105076 30
116308 2 25.91 20:30:00 2015-01-03 5 Far North West -87.796833 41.989884 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 9.839918 30
116309 3 23.18 20:30:00 2015-01-03 5 North Park-Albany-Linconl Sq -87.711023 41.979308 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 7.081638 30
116310 4 20.45 20:30:00 2015-01-03 5 Edge Water-Uptown -87.660514 41.979308 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 6.859490 30
116311 5 23.86 20:30:00 2015-01-03 5 Dunning-Portage-Belmont Cragn -87.796833 41.934969 W OT 41.880556 -87.674167 19:00:00 18:00:00 21:45:00 1 7.343467 30
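
A spot check (sketch) of the window logic against the 2015-01-03 OT game above, whose window runs 18:00 to 21:45:

row = {'StartTime': dt.time(18, 0), 'EndTime': dt.time(21, 45), 'TIMESTAMP': dt.time(20, 30)}
print hasgame(row)   # 1: 20:30 falls inside the window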

Export Data Set


In [31]:
df_region_Jan2015_event.to_csv("Jan2015_TimeSeries.csv")

Create sample data for UI


In [19]:
## Combine Date and Time into DateTime

def getdatetime(row):
    ## Combine DATE and the floored TIMESTAMP into a single datetime
    return dt.datetime(row['DATE'].year, row['DATE'].month, row['DATE'].day,
                       row['TIMESTAMP'].hour, row['TIMESTAMP'].minute, row['TIMESTAMP'].second)

## Filter for only one game day; .copy() so adding a column does not trigger SettingWithCopyWarning
df_formatted_data = df_region_Jan2015_event[(df_region_Jan2015_event['DATE'] == dt.date(2015, 1, 5))].copy()
df_formatted_data['DateTime'] = df_formatted_data.apply(getdatetime, axis=1)

## Keep only the fields the UI needs
df_datasample = df_formatted_data[['DateTime','REGION_ID','SPEED']]

df_datasample.head(5)


Out[19]:
DateTime REGION_ID SPEED
107486 2015-01-05 23:40:00 1 19.09
107487 2015-01-05 23:40:00 2 21.82
107488 2015-01-05 23:40:00 3 19.09
107489 2015-01-05 23:40:00 4 17.05
107490 2015-01-05 23:40:00 5 21.14
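
If the UI prefers one speed column per region rather than this long format (an assumption about the UI, not something established above), pivot_table can reshape the sample, averaging any duplicate readings within a bucket:

## Sketch: one row per timestamp, one column per region
df_wide = df_datasample.pivot_table(index='DateTime', columns='REGION_ID', values='SPEED', aggfunc='mean')
df_wide.head()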

In [20]:
df_datasample.to_csv("DataSampleForUI.csv")
