In [1]:
import requests as rq
import pandas as pd
import matplotlib.pyplot as mpl
import bs4
import os
from tqdm import tqdm_notebook
from datetime import time
In [2]:
%matplotlib inline
In [3]:
base_url = "https://pydata.org"
r = rq.get(base_url + "/berlin2018/schedule/")
bs = bs4.BeautifulSoup(r.text, "html.parser")
Let's query every talk description:
In [4]:
data = {}
for ahref in tqdm_notebook(bs.find_all("a")):
if 'schedule/presentation' in ahref.get("href"):
url = ahref.get("href")
else:
continue
data[url] = {}
resp = bs4.BeautifulSoup(rq.get(base_url + url).text, "html.parser")
title = resp.find("h2").text
resp = resp.find_all(attrs={'class':"container"})[1]
when, who = resp.find_all("h4")
date_info = when.string.split("\n")[1:]
day_info = date_info[0].strip()
time_inf = date_info[1].strip()
room_inf = date_info[3].strip()[3:]
speaker = who.find("a").text
level = resp.find("dd").text
abstract = resp.find(attrs={'class':'abstract'}).text
description = resp.find(attrs={'class':'description'}).text
data[url] = {
'day_info': day_info,
'title': title,
'time_inf': time_inf,
'room_inf': room_inf,
'speaker': speaker,
'level': level,
'abstract': abstract,
'description': description
}
Okay, make a dataframe and add some helpful columns:
In [5]:
df = pd.DataFrame.from_dict(data, orient='index')
df.reset_index(drop=True, inplace=True)
# Tutorials on Friday
df.loc[df.day_info=='Friday', 'tutorial'] = True
df['tutorial'].fillna(False, inplace=True)
# time handling
df['time_from'], df['time_to'] = zip(*df.time_inf.str.split(u'\u2013'))
df.time_from = pd.to_datetime(df.time_from).dt.time
df.time_to = pd.to_datetime(df.time_to).dt.time
del df['time_inf']
df.to_json('./data.json')
df.head(3)
Out[5]:
In [6]:
# Example: Let's query all non-novice talks on sunday, starting at 4 pm
tmp = df.query("(level!='Novice') & (day_info=='Sunday')")
tmp[tmp.time_from >= time(16)]
Out[6]:
In [7]:
plt.style.use('seaborn-darkgrid')#'seaborn-darkgrid')
In [8]:
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 10, 5
plt.rcParams['axes.labelsize'] = 17
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.serif'] = "cm"
plt.rcParams['text.latex.preamble'] = "\\usepackage{subdepth}, \\usepackage{type1cm}"
plt.rcParams['text.usetex'] = True
In [9]:
ax = df.level.value_counts().plot.bar(rot=0)
ax.set_ylabel("number of talks")
ax.set_title("levels of the talks where:")
plt.show()
In [11]:
ax = df.rename(columns={'day_info': 'dayinfo'}).groupby("dayinfo")['level'].value_counts(normalize=True).round(2).unstack(level=0).plot.bar(rot=0)
ax.set_xlabel('')
ax.set_title('So the last day is more kind of "fade-out"?')
plt.show()
In [12]:
ax = df.groupby("tutorial")['level'].value_counts(normalize=True).round(2).unstack(level=0).T.plot.bar(rot=0)
ax.set_title('the percentage of experienced slots is higher for tutorials!\n\\small{So come on fridays for experienced level ;-)}')
plt.show()