In [1]:
import requests, lxml.html
import pandas as pd
# Let text columns show up to 100 characters before pandas truncates them.
pd.options.display.max_colwidth = 100
In [2]:
# Download the debates index page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops the cell on an HTTP error
# instead of silently scraping an error page.
response = requests.get("http://www.presidency.ucsb.edu/debates.php", timeout=30)
response.raise_for_status()
doc = lxml.html.fromstring(response.content)

# Build one [date, debate, link] record per td.doctext cell found on the page.
rows = []
for el in doc.cssselect("td.doctext"):
    # The cell's parent element is queried for the date and title cells.
    table_row = el.getparent()
    date = table_row.cssselect("td.docdate")[0].text_content()
    debate = table_row.cssselect("td.doctext")[0].text_content()
    # Entries without an anchor (or whose anchor has no href) get an empty link
    # so that the `link != ""` filter below treats them uniformly.
    anchors = table_row.cssselect("td.doctext a")
    link = anchors[0].get('href', "") if anchors else ""
    rows.append([date, debate, link])

df = pd.DataFrame(rows, columns=['date', 'debate', 'link'])
df['date'] = pd.to_datetime(df.date)
df['year'] = df.date.dt.year

# Preview only the entries that carry a link.
df[df.link != ""].head()
Out[2]:
In [3]:
# Show one debate from the earliest year on or after 2015.
df.loc[df["year"] >= 2015].sort_values(by="year").head(1)
Out[3]:
In [4]:
# Show one debate from the earliest year on or after 2012.
df.loc[df["year"] >= 2012].sort_values(by="year").head(1)
Out[4]:
In [5]:
# Show one debate from the earliest year on or after 2011.
df.loc[df["year"] >= 2011].sort_values(by="year").head(1)
Out[5]:
In [6]:
# Show one debate from the earliest year on or after 2008.
df.loc[df["year"] >= 2008].sort_values(by="year").head(1)
Out[6]:
In [7]:
class Utterance(object):
    """A piece of transcript text attributed to a single speaker."""

    def __init__(self, speaker, text):
        """Store the speaker label and the text spoken.

        Args:
            speaker: Label identifying who is speaking, or None when the
                speaker is not known.
            text: The words attributed to that speaker.
        """
        self.speaker = speaker
        self.text = text

    def __str__(self):
        """Return "speaker: text", or just the text when speaker is None."""
        # Concatenating None with a string would raise TypeError, so an
        # utterance without a speaker is rendered as its bare text.
        if self.speaker is None:
            return self.text
        return self.speaker + ": " + self.text

    def __repr__(self):
        """Return the string form of the (speaker, text) tuple."""
        return str((self.speaker, self.text))
def parse_debate(url):
    """Download a debate transcript page and split it into utterances.

    The transcript is read from the first ``span.displaytext`` element of the
    page. Its child elements, from the first ``<p>`` onward, are walked in
    order: a paragraph containing a ``<b>`` tag is attributed to the speaker
    named by that tag, and a paragraph without one is attributed to the most
    recent speaker. Consecutive paragraphs by the same speaker are merged into
    a single utterance, joined by newlines.

    Args:
        url: Address of the transcript page to fetch.

    Returns:
        A list of ``Utterance`` objects in document order. The speaker is
        ``None`` for paragraphs that come before the first ``<b>`` label.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has no ``span.displaytext`` element.
        AssertionError: If the transcript has no ``<p>`` element, or a
            non-``<p>`` element follows the first ``<p>``.
    """
    # The timeout keeps a stalled connection from hanging the caller, and
    # raise_for_status() avoids parsing an HTTP error page as a transcript.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)

    containers = doc.cssselect("span.displaytext")
    if not containers:
        raise IndexError("no span.displaytext element found at " + url)
    elements = containers[0].getchildren()

    # Locate the first paragraph; anything before it is ignored.
    first_p_tag_index = None
    for i, el in enumerate(elements):
        if el.tag == "p":
            first_p_tag_index = i
            break

    # Structure checks are raised explicitly (rather than via `assert`) so
    # they still run when Python is started with -O.
    if first_p_tag_index is None:
        raise AssertionError("transcript contains no <p> elements: " + url)
    if not all(el.tag == "p" for el in elements[first_p_tag_index:]):
        raise AssertionError("non-<p> element after the first <p>: " + url)

    # A leading paragraph that mentions MODERATOR is skipped
    # (presumably a participant listing rather than speech - TODO confirm).
    if "MODERATOR" in elements[first_p_tag_index].text_content():
        first_p_tag_index += 1
    p_tags = elements[first_p_tag_index:]

    utterances = []
    current_speaker = None
    for p_tag in p_tags:
        b_tags = p_tag.cssselect("b")
        if b_tags:
            label = b_tags[0].text_content()
            # Remove only the first occurrence of the label so that a later
            # mention of the same name inside the text is preserved.
            current_text = p_tag.text_content().replace(label, "", 1).strip()
            # Normalise whitespace around the label and its trailing colon so
            # that "NAME:", "NAME: " and " NAME :" all compare equal below.
            current_speaker = label.strip().rstrip(":").strip()
        else:
            # Continuation paragraph: same speaker as before. Stripped for
            # consistency with labelled paragraphs.
            current_text = p_tag.text_content().strip()

        if utterances and utterances[-1].speaker == current_speaker:
            # Same speaker as the previous paragraph: extend that utterance.
            utterances[-1].text += "\n" + current_text
        else:
            utterances.append(Utterance(current_speaker, current_text))
    return utterances
In [8]:
# 2016-04-14 - Democratic Candidates Debate in Brooklyn, New York
# Parse the transcript and print its first three utterances as a spot check.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=116995")
for utterance in utterances[:3]:
    print(repr(utterance) + "\n" + "------------------")
In [9]:
# 2015-08-06 - Republican Candidates "Undercard" Debate in Cleveland, Ohio
# Parse the transcript and print its first three utterances as a spot check.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=110757")
for utterance in utterances[:3]:
    print(repr(utterance) + "\n" + "------------------")
In [10]:
# 2012-01-07 - http://www.presidency.ucsb.edu/ws/index.php?pid=98813
# Parse the transcript and print its first three utterances as a spot check.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=98813")
for utterance in utterances[:3]:
    print(repr(utterance) + "\n" + "------------------")
In [11]:
# 2011-06-13 - Republican Candidates Debate in Manchester, New Hampshire
# Parse the transcript and print its first three utterances as a spot check.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=90513")
for utterance in utterances[:3]:
    print(repr(utterance) + "\n" + "------------------")
In [12]:
# 2008-01-05 - Republican Candidates Debate in Manchester, New Hampshire
# Parse the transcript and print its first three utterances as a spot check.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=76223")
for utterance in utterances[:3]:
    print(repr(utterance) + "\n" + "------------------")