In [3]:
#import the necessary Python modules
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import datetime

plt.rcParams['figure.figsize'] = (15, 6)
We will use three different data sources from the Open Data portal of WU Vienna for this experiment.
The All Course-Events during WS15 at WU Vienna dataset contains information about lecture events. We will use it to get the lecture rooms, the buildings and the end times of the courses.
In [4]:
course_events='http://data.wu.ac.at/dataset/812a45f5-3f69-457a-b496-dbd598456829/resource/bfe6cda0-258f-47b2-876d-2913e5459a66/download/allcoursesandevents15w.csv'
The All Campus Rooms at WU Vienna dataset contains information about lecture rooms. We will use it to get the capacity of each lecture room.
In [5]:
room_info='http://data.wu.ac.at/dataset/fed3bae6-397c-4f4c-9c14-15aa8443d268/resource/d17a0d32-562a-4b37-9f32-ce06c4482583/download/allcampusrooms.csv'
The Mapping of campus rooms and main entrances at WU Vienna dataset maps the campus rooms to the main entrance of their building. We will use it to get the geo-locations of the entrances of the lecture rooms.
In [6]:
entrance_coord='http://data.wu.ac.at/dataset/cc76ca38-a904-4909-b621-1f7be63b821b/resource/8b67906a-7459-46eb-b4e0-ee312150a330/download/entrancesroomsmapping.csv'
In [7]:
#load the course events
d_courseEvents = pd.read_csv(course_events, parse_dates=['start', 'end'], dtype={'course_id': object})
d_courseEvents.head()
Out[7]:
In [8]:
#get room information
d_roomInfo=pd.read_csv(room_info)
d_roomInfo.head()
Out[8]:
In [9]:
#load data about entrance coordinates for the lecture rooms
d_entrCoords= pd.read_csv(entrance_coord, dtype={'course_id': object})
d_entrCoords.head()
Out[9]:
In [10]:
#merge lecture events and room information on roomcode
merge = d_roomInfo.merge(d_courseEvents, on='roomcode')
#select the necessary columns
merge = merge[['roomcode', 'location_id', 'end', 'capacity']].copy()
merge.head()
Out[10]:
In [11]:
#combine the location information data based on the location_id
data= pd.merge(merge, d_entrCoords[['location_id', 'latitude', 'longitude']], on=['location_id'])
data.head()
Out[11]:
In [12]:
# We assume that a lecture room is occupied to around 75% of its capacity
ratio_of_students_in_room = 0.75
# Further, we assume that only 50% of the students in a lecture room will go to the mensa
ratio_of_students_going_to_mensa = 0.5
data['est_students'] = data['capacity'] * ratio_of_students_in_room * ratio_of_students_going_to_mensa
data.sort_values('est_students', ascending=False).head()
Out[12]:
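As a quick illustration of these two assumptions (the 180-seat room below is a made-up figure, not taken from the data): a full 180-seat lecture hall would contribute roughly 68 potential mensa visitors.
In [ ]:
# illustrative example: a hypothetical 180-seat room under the two assumptions above
180 * ratio_of_students_in_room * ratio_of_students_going_to_mensa  # -> 67.5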
In [13]:
time_idx = pd.DatetimeIndex(data.end)
data['date'] = time_idx.normalize()
data['weekday'] = time_idx.weekday
data['hour'] = time_idx.hour
data['min'] = time_idx.minute
data.head()
Out[13]:
In [113]:
d_idx = data.set_index('end')
hourly = d_idx.resample('h').sum(numeric_only=True)
daily = hourly.resample('D').sum()
weekly = d_idx.resample('W').sum(numeric_only=True)
weekly[['est_students']].plot(kind='bar')
Out[113]:
In [115]:
days = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
daily['dayofweek'] = daily.index.dayofweek
daily['hour'] = daily.index.hour
grouped = daily.groupby(['dayofweek'])['est_students'].mean()
grouped.index = days
grouped.plot(kind='bar')
plt.title("Average Estimated Number of Students by Day")
plt.ylabel("Average number");
In [130]:
hourly['hour'] = hourly.index.hour
hourly['weekday'] = hourly.index.weekday
grouped = hourly.groupby(['hour'])['est_students'].mean()
grouped.plot(kind='bar')
plt.title("Average Estimated Number of Students by Hour")
plt.ylabel("Average number");
Out[130]:
In [188]:
g_h = hourly.groupby(['hour', 'weekday']).mean().reset_index()
g_h['avoid'] = g_h['est_students'] > 200
ax = g_h.plot(kind='scatter', x='hour', y='weekday', s=g_h['est_students'], c='est_students');
g_h[g_h['est_students'] > 200].head()
Out[188]:
In [192]:
from bokeh.plotting import figure, gridplot, output_notebook, show
from bokeh.charts import Scatter
output_notebook()
In [197]:
p = Scatter(g_h, x='hour', y='weekday', title="Estimated number of students by hour and weekday",
            color="est_students", xlabel="hour of day", ylabel="day of week")
show(p)
In [ ]:
#get current date and weekday
today = datetime.date.today()
weekday = today.weekday()
#get start and end of the working week (Monday to Saturday)
start_of_week = today - datetime.timedelta(days=weekday)
end_of_week = start_of_week + datetime.timedelta(days=5)
#generate filter for lectures that end this week
mask = (data['end'] > pd.Timestamp(start_of_week)) & (data['end'] <= pd.Timestamp(end_of_week))
cur_week = data.loc[mask].copy()
cur_week.head()
In [ ]:
#filter for all lectures ending between 11:00 and 15:59
mask = (cur_week['hour'] >= 11) & (cur_week['hour'] <= 15)
filtered_week= cur_week.loc[mask].copy()
filtered_week.head(5)
In [ ]:
#group by weekday, hour and minute and aggregate
g_wh = filtered_week.groupby(['weekday', 'hour', 'min']).sum(numeric_only=True)
g_wh = g_wh[['est_students']]
#the top 10 worst times to go to the mensa
g_wh.sort_values('est_students', ascending=False).head(10)
In [ ]:
#sort by the estimated number of students
ax = g_wh.sort_values('est_students', ascending=False).plot(y=['est_students'], kind='bar', legend=False, title="Maximum number of students")
ax.set_xlabel("day and hour of week",fontsize=26)
ax.set_ylabel("#students",fontsize=22)
plt.show()
The mensa building is D1 and we take its coordinates from the All main entrances at WU Vienna dataset (http://data.wu.ac.at/dataset/entrances/resource/f92ecfd9-3f0a-4dfd-b6ff-1b0acf0b5340).
In [ ]:
mensa_lat=48.21330080810941
mensa_long=16.409165877046618
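As a quick plausibility check (not part of the original analysis), the hardcoded mensa coordinates should fall inside the bounding box spanned by the lecture-room entrance coordinates already loaded above in d_entrCoords.
In [ ]:
# plausibility check: the mensa entrance should lie within the bounding box
# of the room-entrance coordinates loaded earlier (d_entrCoords)
lat_ok = d_entrCoords['latitude'].min() <= mensa_lat <= d_entrCoords['latitude'].max()
long_ok = d_entrCoords['longitude'].min() <= mensa_long <= d_entrCoords['longitude'].max()
lat_ok and long_ok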
In [ ]:
import math
#from http://www.johndcook.com/blog/python_longitude_latitude/
def distance_on_unit_sphere(lat1, long1, lat2, long2):
    """Return the walking distance in metres between two (lat, long) points,
    including a fixed 100 m offset for walking inside the buildings."""
    # Convert latitude and longitude to spherical coordinates in radians.
    degrees_to_radians = math.pi/180.0
    # phi = 90 - latitude
    phi1 = (90.0 - lat1)*degrees_to_radians
    phi2 = (90.0 - lat2)*degrees_to_radians
    # theta = longitude
    theta1 = long1*degrees_to_radians
    theta2 = long2*degrees_to_radians
    # Compute the spherical distance from the spherical coordinates.
    # For two locations (1, theta, phi) and (1, theta', phi') on the unit sphere:
    #   cosine(arc length) = sin phi sin phi' cos(theta - theta') + cos phi cos phi'
    #   distance = rho * arc length
    cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
           math.cos(phi1)*math.cos(phi2))
    arc = math.acos(cos)
    # Multiply the arc by the radius of the earth (6373 km = 6,373,000 m)
    # to convert it to metres.
    m = arc*6373*1000
    # Add 100 m to account for walking up and down inside the buildings.
    return m + 100
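A quick plausibility check of the helper (the coordinates below are illustrative, not taken from any of the datasets): two points that differ by 0.001 degrees of latitude are about 111 m apart, so with the fixed 100 m offset the function should return roughly 211 m.
In [ ]:
# illustrative check: ~111 m of latitude difference plus the 100 m offset -> ~211 m
distance_on_unit_sphere(48.2133, 16.4092, 48.2143, 16.4092)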
In [ ]:
filtered_week['distance'] = filtered_week.apply(lambda row: distance_on_unit_sphere(row['latitude'], row['longitude'], mensa_lat, mensa_long), axis=1)
filtered_week.head()
In [ ]:
#compute the additional delay assuming an average walking speed of 1 m/s (https://www.google.at/search?client=safari&rls=en&q=how+fast+do+we+walk+on+average&ie=UTF-8&oe=UTF-8&gfe_rd=cr&ei=NDqWVrHkNcGF8QfC0L7oDw#q=how+fast+do+we+walk+on+average+meters+per+second)
avg_walking_speed = 1  # 1 m/s

def estimated_arrival(end, distance, avg_walking_speed):
    """Estimate the arrival time at the mensa based on the lecture end time
    and the time needed to walk from the lecture room to the mensa."""
    walking_time = distance / avg_walking_speed  # seconds, since distance is in metres and speed in m/s
    return end + pd.Timedelta(seconds=walking_time)

filtered_week['avg_arrival_time'] = filtered_week.apply(lambda row: estimated_arrival(row['end'], row['distance'], avg_walking_speed), axis=1)
est_data=filtered_week[['est_students','weekday','hour','min','distance','avg_arrival_time']].copy()
#build again an index on the arrival time
time_idx = pd.DatetimeIndex(est_data.avg_arrival_time)
est_data['arr_min'] = time_idx.minute
est_data.head()
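A quick check of estimated_arrival with a made-up end time and distance: a lecture ending at 13:00 in a room 300 m away from the mensa yields an estimated arrival time of 13:05 at 1 m/s.
In [ ]:
# made-up example: lecture ends at 13:00, room is 300 m away -> arrival around 13:05
estimated_arrival(pd.Timestamp('2015-11-16 13:00'), 300, avg_walking_speed)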
In [ ]:
g_data = est_data.sort_values(['weekday', 'hour', 'arr_min'], ascending=True)
g_data = g_data.groupby(['weekday', 'hour', 'min']).sum(numeric_only=True)
g_data['avoid'] = g_data['est_students'] > 100
g_data.head()
In [ ]:
#plot the estimated number of students per time slot, highlighting slots to avoid in red
g_sorted = g_data.sort_values('est_students', ascending=False)
ax = g_sorted.plot(y=['est_students'],
                   kind='bar', legend=False,
                   title="Maximum number of students",
                   color=g_sorted.avoid.map({True: 'r', False: 'k'}),
                   alpha=0.75)
plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=14)
plt.setp(ax.get_yticklabels(), fontsize=14)
ax.set_xlabel("weekday, hour and minute", fontsize=22)
ax.set_ylabel("estimated #students", fontsize=22)
plt.show()