Second step in creating a talk recommender for PyCon: get last year's talk data to build a bigger corpus.
The rest of the project can be found on GitHub: https://github.com/mikecunha/pycon_reco
Version info from watermark and pip freeze is at the end of the notebook.
In [1]:
from datetime import datetime
from time import sleep
import re
from pprint import PrettyPrinter
from urllib.request import urlopen
from bs4 import BeautifulSoup # http://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4.element import NavigableString
from markdown import markdown
import pyprind # progress bar, e.g. here: http://nbviewer.ipython.org/github/rasbt/pyprind/blob/master/examples/pyprind_demo.ipynb
import pandas as pd
In [5]:
talk_sched_html = urlopen("https://us.pycon.org/2014/schedule/talks/")
tut_sched_html = urlopen("https://us.pycon.org/2014/schedule/tutorials/")
talk_sched_soup = BeautifulSoup( talk_sched_html.read() )
tut_sched_soup = BeautifulSoup( tut_sched_html.read() )
In [6]:
to_scrape = []
talk_links = talk_sched_soup.select("td.slot-talk span.title a")
tut_links = tut_sched_soup.select("td.slot-tutorial span.title a")
for t in talk_links + tut_links:
    to_scrape.append( t.attrs.get('href') )
list(enumerate(to_scrape))[-5:]
Out[6]:
In [7]:
# Scrape all the talk html pages
soups = {}
perc = pyprind.ProgPercent( len(to_scrape) )
for relative_url in to_scrape:
    perc.update()
    uri = "https://us.pycon.org" + relative_url
    talk_html = urlopen( uri )
    soups[uri] = BeautifulSoup( talk_html.read() )
    sleep(0.5) # Be nice.
In [8]:
talks = []
for uri, soup in soups.items():
    talk = {}
    content = soup.find(attrs={"class":"box-content"})
    elements = content.find_all("dd")
    talk['level'], talk['category'] = [ e.get_text(strip=True) for e in elements ]
    elements = content.find_all("h4")
    talk['str_time'], talk['author'] = [ e.get_text(strip=True) for e in elements ]
    talk['desc'] = soup.find(attrs={"class":"description"}).get_text(strip=True)
    # Abstracts contain some unparsed markdown
    abstract = soup.find(attrs={"class":"abstract"}).get_text(strip=True)
    html = markdown( abstract )
    abstract = ''.join(BeautifulSoup(html).findAll(text=True))
    talk['abstract'] = abstract.replace("\n"," ")
    talk['title'] = content.find("h2").get_text(strip=True)
    talks.append( talk )
talks = pd.DataFrame( talks )
talks.head()
Out[8]:
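A quick optional sanity check: the number of scraped pages should match the number of rows in the DataFrame.
len(soups), talks.shape  # expect one row per scraped talk/tutorial page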
In [11]:
day_to_date = {'Wednesday': 'Apr 9 2014 ',
               'Thursday': 'Apr 10 2014 ',
               'Friday':   'Apr 11 2014 ',
               'Saturday': 'Apr 12 2014 ',
               'Sunday':   'Apr 13 2014 ',
               }
def parse_dt( dt ):
""" Convert string to datetime """
day, t = [ x.strip() for x in dt.split('\n') ]
start, end = [ x.replace('.', '').replace(' ','').upper() for x in t.split("–") ]
if end == "NOON":
end = "12:00PM"
elif end.find(':') < 0:
end = end[0] + ":00" + end[-2:]
if start.find(':') < 0:
start = start[0] + ":00" + start[-2:]
try:
start = datetime.strptime( day_to_date[day] + start + ' EDT', '%b %d %Y %I:%M%p %Z' )
except ValueError:
print ("error converting start time: ", start)
try:
end = datetime.strptime( day_to_date[day] + end + ' EDT', '%b %d %Y %I:%M%p %Z' )
except ValueError:
print ("error converting end time: ", end)
return day, start, end
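A hedged spot-check of parse_dt (the time string below is a made-up example of the schedule's format, and the 'EDT' in strptime only parses if the local timezone is US/Eastern):
day, start, end = parse_dt("Friday\n1:55 p.m. – 2:40 p.m.")
print(day, start, end)  # should print: Friday 2014-04-11 13:55:00 2014-04-11 14:40:00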
In [12]:
talks["weekday"], talks["start_dt"], talks["end_dt"] = zip(*talks["str_time"].map(parse_dt))
del talks["str_time"]
In [13]:
talks.head()
Out[13]:
In [33]:
talks.to_csv( 'data/pycon_talks_2014.csv', sep="\t", index=False )
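To pick the data back up in the next notebook, the tab-separated file can be read back with pandas (a sketch; without parse_dates the datetime columns come back as plain strings):
talks_2014 = pd.read_csv( 'data/pycon_talks_2014.csv', sep="\t", parse_dates=["start_dt", "end_dt"] )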
In [33]:
try:
    %load_ext watermark
except ImportError as e:
    %install_ext https://raw.githubusercontent.com/rasbt/python_reference/master/ipython_magic/watermark.py
    %load_ext watermark
%watermark
In [1]:
import pip
sorted(["%s==%s" % (i.key, i.version) for i in pip.get_installed_distributions()])
Out[1]: