In [1]:
# Library imports
import dask.dataframe as dd  # out-of-core dataframe engine for very large CSVs
import pandas as pd
import numpy as np

# Lazily load the full station file, declaring the column dtypes up front
filename = 'Road_Weather_Information_Stations.csv'
df = dd.read_csv(
    filename,
    dtype={
        'StationName': str,
        'StationLocation': object,
        'RecordId': int,
        'RoadSurfaceTemperature': float,
        'AirTemperature': float,
    },
    parse_dates=['DateTime'],
)

# Peek at the first five rows
df.head(n=5)
Out[1]:
In [2]:
# Inspect the last five of the 708,891 rows
df.tail(n=5)
Out[2]:
In [3]:
# Inspect the inferred column dtypes (object = string).
# Use the public `dtypes` accessor instead of reaching into the private
# `_meta` attribute; on a dask DataFrame it reports the same metadata.
df.dtypes
Out[3]:
Python offers powerful tools for interacting with very large files, such as the dask package, but one significant constraint is the machine on which it operates. Machines with limited resources require long computation times, slowing down the working pipeline. For this reason, the City of Seattle Open Data Portal, which lets users quickly query and slice the dataset online, was used instead. This approach was followed in order to preserve computational resources — an option that should always be considered whenever it is available.
The hottest month on average is August, so the online query covers the period 1 August 2016 – 31 August 2016.
In [22]:
import pandas as pd
import datetime as DT

# Load the pre-sliced August 2016 extract.
# `parse_dates` is required so the DateTime column holds datetime objects:
# a later cell calls .date()/.time() on its values, which would raise
# AttributeError on raw strings. This also matches how the full dataset
# was loaded with dask above.
filename = 'August2016_Road_Weather_Information_Stations.csv'
august = pd.read_csv(
    filename,
    dtype={
        'StationName': str,
        'StationLocation': object,
        'RecordId': int,
        'RoadSurfaceTemperature': float,
        'AirTemperature': float,
    },
    parse_dates=['DateTime'],
)
In [8]:
# First five rows of the August extract
august.head(n=5)
Out[8]:
In [23]:
# Inspect the column dtypes of the August frame (object = string)
august.dtypes
Out[23]:
In [26]:
# Split the timestamp into separate Date and Time columns.
# pd.to_datetime is applied first so this works whether or not the CSV load
# parsed the DateTime column (raw strings would otherwise raise
# AttributeError on .date()/.time()); the vectorized .dt accessor replaces
# the original per-element list comprehensions.
august['DateTime'] = pd.to_datetime(august['DateTime'])
august['Date'] = august['DateTime'].dt.date
august['Time'] = august['DateTime'].dt.time
# Check the result
august.head(n=5)
Out[26]:
In [49]:
# Drop columns that are no longer needed (the timestamp was split out above;
# the analysis below uses road-surface temperature only), then sort by
# station and index on StationName while keeping it as a column.
# Method chaining replaces the two separate drops and inplace mutations.
august = (august
          .drop(columns=['DateTime', 'AirTemperature'])
          .sort_values(by='StationName')
          .set_index('StationName', drop=False))
# List of distinct station identifiers
stations = august['StationName'].unique().tolist()
# One sub-frame per station, in the fixed display order used by the plots
# below (replaces ten copy-pasted .loc selections).
station_names = [
    'AlaskanWayViaduct_KingSt',
    'AlbroPlaceAirportWay',
    'AuroraBridge',
    'HarborAveUpperNorthBridge',
    'JoseRizalBridgeNorth',
    'MagnoliaBridge',
    '35thAveSW_SWMyrtleSt',
    'NE45StViaduct',
    'RooseveltWay_NE80thSt',
    'SpokaneSwingBridge',
]
station_list = [august.loc[august.StationName == name] for name in station_names]
In [94]:
# For each station, compute the average road-surface temperature per
# time-of-day (a daily profile over August 2016) and plot the ten profiles.
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 18))
plt.subplots_adjust(hspace=0.3)
# Collected per station: time of the daily peak, and the station name
peak = []
names = []
for i, station in enumerate(station_list):
    # Mean road-surface temperature for each time-of-day (Fahrenheit)
    mean_t = station.groupby('Time').RoadSurfaceTemperature.mean()
    # Vectorized Fahrenheit -> Celsius conversion (replaces per-element apply)
    mean_t = (mean_t - 32) * 5 / 9
    # Record when the profile peaks and for which station.
    peak.append(mean_t.idxmax().strftime("%H:%M:%S"))
    # .iloc[0] reads the first row's station name explicitly; the original
    # `StationName[1]` relied on the deprecated positional fallback of []
    # on a string-indexed Series (every row holds the same name anyway).
    station_name = str(station.StationName.iloc[0])
    names.append(station_name)
    # One subplot per station in a 5x2 grid
    ax = fig.add_subplot(5, 2, i + 1)
    ax.scatter(mean_t.index, mean_t)
    ax.set_title(station_name)
    ax.set_xlabel("Time")
    ax.set_ylabel("degrees Celsius")
    ax.set_ylim(15, 40)
plt.show()
In [99]:
# Tabulate the peak time per station, with the columns in a fixed order
peak_df = pd.DataFrame({'Station Name': names, 'Peak Time': peak})[['Station Name', 'Peak Time']]
peak_df
Out[99]:
In [171]:
# Average road-surface temperature over the 3:50PM-4:10PM window per station.
# Station name -> mean Celsius temperature in the window
peak_temp = {}
for station in station_list:
    mean_t = station.groupby('Time').RoadSurfaceTemperature.mean()
    # Vectorized Fahrenheit -> Celsius conversion
    mean_t = (mean_t - 32) * 5 / 9
    # Select the window by time-of-day instead of the positional slice
    # [950:971], which silently assumed exactly one reading per minute
    # starting at midnight.
    in_window = (mean_t.index >= DT.time(15, 50)) & (mean_t.index <= DT.time(16, 10))
    station_name = str(station.StationName.iloc[0])
    peak_temp[station_name] = round(mean_t[in_window].mean(), 3)
# Tabulate, with the columns in a fixed order
max_t = pd.DataFrame({'Station Name': list(peak_temp.keys()),
                      'Max Temperature': list(peak_temp.values())})
max_t = max_t[['Station Name', 'Max Temperature']]
max_t
Out[171]:
In [204]:
# Load the station coordinates and attach the peak temperatures to them
sensors = pd.read_csv('sensors.csv')
complete = sensors.merge(max_t, on='Station Name', how='outer')
complete
Out[204]:
In [281]:
import numpy as np
import pandas as pd
import folium
import branca
import matplotlib.pyplot as plt

# Map centre
SEATTLE_COORDINATES = (47.59844, -122.33561)
# Base map zoomed in on Seattle. Renamed from `map`, which shadowed the
# Python builtin.
# NOTE(review): 'Stamen Terrain' tiles have been retired and now require a
# Stadia Maps key; switch to tiles='OpenStreetMap' if the basemap fails to load.
seattle_map = folium.Map(location=SEATTLE_COORDINATES, zoom_start=11.48,
                         tiles='Stamen Terrain')

# Shared green-yellow-red colour scale for road-surface temperature
cm = branca.colormap.LinearColormap(['green', 'yellow', 'red'], vmin=25, vmax=41)
cm.caption = 'Road Surface Temperature [degrees Celsius]'
seattle_map.add_child(cm)

# Colour-coded circles: radius and fill colour both encode the max
# temperature (the original built a separate `colors` array that merely
# duplicated `sizes`, plus an unused `popup` list).
circles = folium.map.FeatureGroup()
for lat, lng, size in zip(complete['Lat'], complete['Lon'], complete['Max Temperature']):
    circles.add_child(folium.features.CircleMarker(
        [lat, lng],
        radius=size,
        color=None,
        fill_color=cm(size)))
seattle_map.add_child(circles)

# Markers with station-name / max-temperature popups for the ten stations
for _, row in complete.head(10).iterrows():
    folium.Marker(
        [row['Lat'], row['Lon']],
        popup='Bridge: ' + row['Station Name'] + '; Max Temp: ' + str(row['Max Temperature'])
    ).add_to(seattle_map)

# Display the map inline
seattle_map
Out[281]:
In [ ]: