In [1]:
    
import requests, lxml.html
import pandas as pd
# Let DataFrame cells display up to 100 characters before pandas truncates them.
pd.options.display.max_colwidth = 100
    
In [2]:
    
# Download the index of debates and collect one record per table row:
# the date cell, the title cell, and the transcript link (if the title has one).
# The timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get("http://www.presidency.ucsb.edu/debates.php", timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# into an empty table.
response.raise_for_status()
doc = lxml.html.fromstring(response.content)
rows = []
for el in doc.cssselect("td.doctext"):
    # The date cell is a sibling of the title cell, so work from the parent row.
    table_row = el.getparent()
    date = table_row.cssselect("td.docdate")[0].text_content()
    debate = table_row.cssselect("td.doctext")[0].text_content()
    # Rows whose title cell contains no anchor keep an empty link.
    anchors = table_row.cssselect("td.doctext a")
    link = anchors[0].get('href') if anchors else ""
    rows.append([date, debate, link])
df = pd.DataFrame(rows, columns=['date', 'debate', 'link'])
df['date'] = pd.to_datetime(df.date)
df['year'] = df.date.dt.year
# Preview only the rows that carry a transcript link.
df[df.link != ""].head()
    
    Out[2]:
In [3]:
    
# Earliest debate held in 2015 or later. Sorting by the full date with a stable
# algorithm makes the displayed row deterministic; sorting by year alone leaves
# the order of same-year debates to pandas' default (unstable) quicksort.
df[df.year >= 2015].sort_values(by='date', kind='mergesort').head(1)
    
    Out[3]:
In [4]:
    
# Earliest debate held in 2012 or later. Sorting by the full date with a stable
# algorithm makes the displayed row deterministic; sorting by year alone leaves
# the order of same-year debates to pandas' default (unstable) quicksort.
df[df.year >= 2012].sort_values(by='date', kind='mergesort').head(1)
    
    Out[4]:
In [5]:
    
# Earliest debate held in 2011 or later. Sorting by the full date with a stable
# algorithm makes the displayed row deterministic; sorting by year alone leaves
# the order of same-year debates to pandas' default (unstable) quicksort.
df[df.year >= 2011].sort_values(by='date', kind='mergesort').head(1)
    
    Out[5]:
In [6]:
    
# Earliest debate held in 2008 or later. Sorting by the full date with a stable
# algorithm makes the displayed row deterministic; sorting by year alone leaves
# the order of same-year debates to pandas' default (unstable) quicksort.
df[df.year >= 2008].sort_values(by='date', kind='mergesort').head(1)
    
    Out[6]:
In [7]:
    
class Utterance(object):
    """One speaker turn in a transcript: who spoke and what was said."""

    def __init__(self, speaker, text):
        # speaker may be None when the text was not attributed to anyone
        # (parse_debate starts with no current speaker).
        self.speaker = speaker
        self.text = text

    def __str__(self):
        # Plain concatenation raises TypeError for a None speaker, so
        # unattributed text is rendered without the "speaker: " prefix.
        if self.speaker is None:
            return self.text
        return self.speaker + ": " + self.text

    def __repr__(self):
        # Rendered as the (speaker, text) tuple.
        return str((self.speaker, self.text))
def parse_debate(url):
    """Download a debate transcript page and split it into speaker turns.

    The transcript is read from the first ``span.displaytext`` element of the
    page. Its ``<p>`` children are walked in order: a paragraph containing a
    ``<b>`` tag starts a turn for the speaker named in that tag, and a
    paragraph without one continues the current speaker. Consecutive
    paragraphs by the same speaker are merged into a single utterance, joined
    with newlines.

    Parameters
    ----------
    url : str
        Address of the transcript page.

    Returns
    -------
    list of Utterance
        Turns in page order. The speaker is None for paragraphs that precede
        the first bold speaker label.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``span.displaytext`` element, or that element has
        no ``<p>`` child.
    AssertionError
        If a non-``<p>`` element follows the first paragraph.
    """
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being parsed as a transcript.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)
    containers = doc.cssselect("span.displaytext")
    if not containers:
        raise ValueError("no span.displaytext element found at {}".format(url))
    # list(element) yields the direct children (replaces the deprecated getchildren()).
    elements = list(containers[0])
    first_p_tag_index = next(
        (i for i, el in enumerate(elements) if el.tag == "p"), None)
    if first_p_tag_index is None:
        raise ValueError("no <p> element found in the transcript at {}".format(url))
    # The parser relies on everything from the first paragraph onwards being a paragraph.
    assert all(el.tag == "p" for el in elements[first_p_tag_index:]), \
        "unexpected non-<p> element after the first paragraph"
    # Skip a leading paragraph that mentions MODERATOR (presumably the
    # participants/moderators listing rather than speech -- TODO confirm).
    if "MODERATOR" in elements[first_p_tag_index].text_content():
        first_p_tag_index += 1
    p_tags = elements[first_p_tag_index:]
    utterances = []
    current_speaker = None
    for p_tag in p_tags:
        b_tags = p_tag.cssselect("b")
        if b_tags:
            # A bold label opens a new turn: remove its first occurrence from
            # the paragraph text and drop the trailing colon from the name.
            label = b_tags[0].text_content()
            current_text = p_tag.text_content().replace(label, "", 1).strip()
            current_speaker = label.rstrip(":")
        else:
            # No label: the paragraph belongs to whoever spoke last.
            current_text = p_tag.text_content()
        if utterances and utterances[-1].speaker == current_speaker:
            utterances[-1].text += "\n" + current_text
        else:
            utterances.append(Utterance(current_speaker, current_text))

    return utterances
    
In [8]:
    
# April 14, 2016 - Democratic Candidates Debate in Brooklyn, New York
# Spot-check the parser: show the first three utterances, each followed by a rule.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=116995")
for utterance in utterances[0:3]:
    print("\n".join([repr(utterance), "------------------"]))
    
    
In [9]:
    
# 2015-08-06 - Republican Candidates "Undercard" Debate in Cleveland, Ohio
# Spot-check the parser: show the first three utterances, each followed by a rule.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=110757")
for utterance in utterances[0:3]:
    print("\n".join([repr(utterance), "------------------"]))
    
    
In [10]:
    
# 2012-01-07 - http://www.presidency.ucsb.edu/ws/index.php?pid=98813
# Spot-check the parser: show the first three utterances, each followed by a rule.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=98813")
for utterance in utterances[0:3]:
    print("\n".join([repr(utterance), "------------------"]))
    
    
In [11]:
    
# 2011-06-13 - Republican Candidates Debate in Manchester, New Hampshire
# Spot-check the parser: show the first three utterances, each followed by a rule.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=90513")
for utterance in utterances[0:3]:
    print("\n".join([repr(utterance), "------------------"]))
    
    
In [12]:
    
# 2008-01-05 - Republican Candidates Debate in Manchester, New Hampshire
# Spot-check the parser: show the first three utterances, each followed by a rule.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=76223")
for utterance in utterances[0:3]:
    print("\n".join([repr(utterance), "------------------"]))