The first step in creating a talk recommender for PyCon: scraping the talk and keynote schedule.
The rest of the project can be found on GitHub: https://github.com/mikecunha/pycon_reco
Version info from watermark and pip freeze is at the end of the notebook.
In [2]:
from datetime import datetime
from time import sleep
import re
from pprint import PrettyPrinter
from urllib.request import urlopen
from bs4 import BeautifulSoup # http://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4.element import NavigableString
from markdown import markdown
import pyprind # progress bar, e.g. here: http://nbviewer.ipython.org/github/rasbt/pyprind/blob/master/examples/pyprind_demo.ipynb
import pandas as pd
In [7]:
sched_html = urlopen("https://us.pycon.org/2015/schedule/")
if sched_html.status != 200:
    print('Error:', sched_html.status)
else:
    sched_soup = BeautifulSoup( sched_html.read() )
In [8]:
to_scrape = []
talk_links = sched_soup.select("td.slot-talk span.title a")
tut_links = sched_soup.select("td.slot-tutorial span.title a")
for t in talk_links + tut_links:
    to_scrape.append( t.attrs.get('href') )

list(enumerate(to_scrape))[-5:]
Out[8]:
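If a talk ever appeared in more than one schedule cell, its href would be collected twice. An order-preserving dedupe is cheap insurance (a sketch only; the 2015 schedule may well contain no duplicates):

seen = set()
deduped = []
for url in to_scrape:
    if url not in seen:  # keep only the first occurrence
        seen.add(url)
        deduped.append(url)
to_scrape = deduped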
In [9]:
# Scrape all the talk html pages
soups = {}
perc = pyprind.ProgPercent( len(to_scrape) )
for relative_url in to_scrape:
    perc.update()
    uri = "https://us.pycon.org" + relative_url
    talk_html = urlopen( uri )
    soups[uri] = BeautifulSoup( talk_html.read() )
    sleep(0.5)  # Be nice.
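The loop above assumes every request succeeds. A more defensive variant (a sketch, not the code that was actually run) would catch network errors and carry on:

from urllib.error import HTTPError, URLError

for relative_url in to_scrape:
    perc.update()
    uri = "https://us.pycon.org" + relative_url
    try:
        talk_html = urlopen( uri )
        soups[uri] = BeautifulSoup( talk_html.read() )
    except (HTTPError, URLError) as err:
        print( "failed to fetch", uri, ":", err )
    sleep(0.5)  # Be nice.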
In [17]:
talks = []
for uri, soup in soups.items():
    talk = {}
    content = soup.find(attrs={"class": "box-content"})

    elements = content.find_all("dd")
    talk['level'], talk['category'] = [ e.get_text(strip=True) for e in elements ]

    elements = content.find_all("h4")
    talk['str_time'], talk['author'] = [ e.get_text(strip=True) for e in elements ]

    talk['desc'] = soup.find(attrs={"class": "description"}).get_text(strip=True)

    # Abstracts contain some unparsed markdown; render it to HTML,
    # then strip the tags to get plain text
    abstract = soup.find(attrs={"class": "abstract"}).get_text(strip=True)
    html = markdown( abstract )
    abstract = ''.join(BeautifulSoup(html).findAll(text=True))
    talk['abstract'] = abstract.replace("\n", " ")

    talk['title'] = content.find("h2").get_text(strip=True)
    talks.append( talk )

talks = pd.DataFrame( talks )
talks.head()
Out[17]:
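A quick sanity check can catch scraping problems early, for example (a hedged sketch):

# every scraped page should have produced one row, with no missing fields
assert len(talks) == len(soups)
print( talks.isnull().sum() )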
In [19]:
day_to_date = {'Wednesday': 'Apr 8 2015 ',
               'Thursday': 'Apr 9 2015 ',
               'Friday': 'Apr 10 2015 ',
               'Saturday': 'Apr 11 2015 ',
               'Sunday': 'Apr 12 2015 ',
               }

def parse_dt( dt ):
    """ Convert a schedule time string to (weekday, start datetime, end datetime) """
    day, t = [ x.strip() for x in dt.split('\n') ]
    start, end = [ x.replace('.', '').replace(' ', '').upper() for x in t.split("–") ]
    if end == "NOON":
        end = "12:00PM"
    elif end.find(':') < 0:
        # e.g. "3PM" -> "3:00PM"; slice off the AM/PM suffix so
        # double-digit hours like "11AM" are handled correctly too
        end = end[:-2] + ":00" + end[-2:]
    if start.find(':') < 0:
        start = start[:-2] + ":00" + start[-2:]
    try:
        start = datetime.strptime( day_to_date[day] + start + ' EDT', '%b %d %Y %I:%M%p %Z' )
    except ValueError:
        print("error converting start time: ", start)
    try:
        end = datetime.strptime( day_to_date[day] + end + ' EDT', '%b %d %Y %I:%M%p %Z' )
    except ValueError:
        print("error converting end time: ", end)
    return day, start, end
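For example, on a made-up but representative string in the format the schedule pages use (note that strptime's %Z only matches 'EDT' on a machine whose local timezone is US Eastern; elsewhere the except branch fires):

parse_dt("Friday\n12:10 p.m.–12:55 p.m.")
# ('Friday', datetime(2015, 4, 10, 12, 10), datetime(2015, 4, 10, 12, 55))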
In [20]:
talks["weekday"], talks["start_dt"], talks["end_dt"] = zip(*talks["str_time"].map(parse_dt))
del talks["str_time"]
In [21]:
talks.head()
Out[21]:
In [3]:
# grab html from the keynote page with bios of the speakers:
key_html = urlopen( "https://us.pycon.org/2015/events/keynotes/" )
key_soup = BeautifulSoup( key_html.read() )
In [23]:
auth_info = {}

# There aren't many unique tags for BeautifulSoup to key on, so find the
# speakers with a regex on the markdown that is present in the page
for author in key_soup.findAll(text=re.compile('.*##[^\n]*')):
    start_tag = author.find_next('div')
    # the bio text is between these two tags
    stop_tag = author.find_next('p')
    desc = ''
    for elem in start_tag.next_elements:
        if elem == stop_tag:
            break
        elif isinstance(elem, NavigableString):
            desc += elem.string
    talk_name, desc = desc.strip().split("\n", 1)
    author = author.strip("\r\n #")
    # Deal with the unique format of one author
    if author == "Gabriella Coleman":
        talk_name, desc = desc.strip().split("\n", 1)
    auth_info[ author ] = { 'desc': desc.strip(),
                            'title': talk_name, }

pp = PrettyPrinter(indent=4)
pp.pprint( auth_info )
In [29]:
# Get datetimes to go along with the keynote bios
weekdays = {0: 'Monday',
            1: 'Tuesday',
            2: 'Wednesday',
            3: 'Thursday',
            4: 'Friday',
            5: 'Saturday',
            6: 'Sunday',
            }

key_talks = []
for day_soup in sched_soup.findAll("h3"):
    day = day_soup.get_text(strip=True)
    day = day.replace(',', '').replace('April', 'Apr')
    days_table = day_soup.findNext("table")
    keynotes = days_table.select("td.slot-lightning")
    for key in keynotes:
        key_title = key.get_text()
        if key_title.find('Keynote') > -1 or key_title.find('Opening') > -1:
            start_t = key.findPrevious("td").get_text(strip=True)
            start_t = datetime.strptime( day + ' ' + start_t + ' EDT', '%b %d %Y %I:%M%p %Z' )
            end_t = key.findNext("td").get_text(strip=True)
            end_t = datetime.strptime( day + ' ' + end_t + ' EDT', '%b %d %Y %I:%M%p %Z' )
            dow = weekdays[ start_t.weekday() ]
            category, author = key_title.strip().split(' - ', 1)
            author = author.split('- ', 1)[0].strip()
            talk = {'start_dt': start_t,
                    'end_dt': end_t,
                    'weekday': dow,
                    'author': author,
                    'category': category,
                    }
            # Add in keynote titles and descriptions
            # (use `k`, not `key`, to avoid shadowing the loop variable)
            for k, val in auth_info[author].items():
                talk[k] = val
            key_talks.append( talk )

key_talks = pd.DataFrame( key_talks )
key_talks
Out[29]:
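As a design note, the weekdays lookup dict above could be replaced by strftime, which formats the weekday name directly:

dow = start_t.strftime('%A')   # e.g. 'Friday'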
In [30]:
# specifying columns at the end preserves column order
combined_talks = pd.concat([key_talks, talks], ignore_index=True, )[talks.columns]
combined_talks.tail()
Out[30]:
In [31]:
combined_talks.to_csv( 'data/pycon_talks_2015.csv', sep="\t", index=False )
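to_csv assumes the data/ directory already exists. If it might not, create it first; a minimal sketch using the standard library:

import os
os.makedirs('data', exist_ok=True)  # no-op if the directory already exists
combined_talks.to_csv( 'data/pycon_talks_2015.csv', sep="\t", index=False )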
In [33]:
try:
    %load_ext watermark
except ImportError:
    %install_ext https://raw.githubusercontent.com/rasbt/python_reference/master/ipython_magic/watermark.py
    %load_ext watermark
%watermark
In [1]:
import pip
sorted(["%s==%s" % (i.key, i.version) for i in pip.get_installed_distributions()])
Out[1]:
In [ ]: