scraping_debates



In [1]:
import requests, lxml.html
import pandas as pd

# Widen displayed columns to 100 characters so long cell values are not cut at the default width.
pd.options.display.max_colwidth = 100

In [2]:
# Scrape the debate index page into a DataFrame with one record per
# `td.doctext` cell: the date text, the debate title and the transcript link.
response = requests.get("http://www.presidency.ucsb.edu/debates.php")
# Stop on an HTTP error instead of parsing an error page into an empty table.
response.raise_for_status()
doc = lxml.html.fromstring(response.content)
rows = []
for el in doc.cssselect("td.doctext"):
    # The enclosing table row holds both the date cell and the title cell.
    tr = el.getparent()
    date = tr.cssselect("td.docdate")[0].text_content()
    debate = tr.cssselect("td.doctext")[0].text_content()
    # Query the anchors once; rows without one keep an empty link and are
    # filtered out of the preview below.
    anchors = tr.cssselect("td.doctext a")
    link = anchors[0].get('href') if anchors else ""
    row = [date, debate, link]
    rows.append(row)
df = pd.DataFrame(rows, columns=['date', 'debate', 'link'])
# Parse the date strings so the year can be derived with the .dt accessor.
df['date'] = pd.to_datetime(df.date)
df['year'] = df.date.dt.year
# Preview only the debates that have a transcript link.
df[df.link != ""].head()


Out[2]:
date debate link year
4 2016-04-14 Democratic Candidates Debate in Brooklyn, New York http://www.presidency.ucsb.edu/ws/index.php?pid=116995 2016
5 2016-03-09 Democratic Candidates Debate in Miami, Florida http://www.presidency.ucsb.edu/ws/index.php?pid=112719 2016
6 2016-03-06 Democratic Candidates Debate in Flint, Michigan http://www.presidency.ucsb.edu/ws/index.php?pid=112718 2016
7 2016-02-11 Democratic Candidates Debate in Milwaukee, Wisconsin http://www.presidency.ucsb.edu/ws/index.php?pid=111520 2016
8 2016-02-04 Democratic Candidates Debate in Durham, New Hampshire http://www.presidency.ucsb.edu/ws/index.php?pid=111471 2016

In [3]:
# First row after ordering the debates from 2015 onward by year.
df.loc[df['year'] >= 2015].sort_values('year').head(1)


Out[3]:
date debate link year
31 2015-08-06 Republican Candidates "Undercard" Debate in Cleveland, Ohio http://www.presidency.ucsb.edu/ws/index.php?pid=110757 2015

In [4]:
# First row after ordering the debates from 2012 onward by year.
df.loc[df['year'] >= 2012].sort_values('year').head(1)


Out[4]:
date debate link year
42 2012-01-07 Republican Candidates Debate in Manchester, New Hampshire http://www.presidency.ucsb.edu/ws/index.php?pid=98813 2012

In [5]:
# First row after ordering the debates from 2011 onward by year.
df.loc[df['year'] >= 2011].sort_values('year').head(1)


Out[5]:
date debate link year
55 2011-06-13 Republican Candidates Debate in Manchester, New Hampshire http://www.presidency.ucsb.edu/ws/index.php?pid=90513 2011

In [6]:
# First row after ordering the debates from 2008 onward by year.
df.loc[df['year'] >= 2008].sort_values('year').head(1)


Out[6]:
date debate link year
83 2008-01-05 Republican Candidates Debate in Manchester, New Hampshire http://www.presidency.ucsb.edu/ws/index.php?pid=76223 2008

Keep scrolling below at your own peril!


In [7]:
class Utterance(object):
    """One speaker turn in a debate transcript.

    Holds two plain, mutable attributes: ``speaker`` and ``text``.
    parse_debate appends to ``text`` in place when consecutive paragraphs
    belong to the same speaker.
    """

    def __init__(self, speaker, text):
        # speaker: label of whoever is talking. parse_debate passes None when
        #          a paragraph appears before any bold speaker label.
        # text:    what was said; paragraphs are joined with "\n".
        self.speaker = speaker
        self.text = text

    def __str__(self):
        """Return the turn formatted as ``"<speaker>: <text>"``."""
        # format() rather than "+" so a None speaker renders as "None"
        # instead of raising TypeError.
        return "{}: {}".format(self.speaker, self.text)

    def __repr__(self):
        """Return the turn as the string form of a (speaker, text) tuple."""
        return str((self.speaker, self.text))

def parse_debate(url):
    """Download a debate transcript page and split it into speaker turns.

    The transcript is read from the first ``span.displaytext`` element.
    Everything from its first ``<p>`` child onward must be ``<p>`` elements.
    A paragraph containing a ``<b>`` tag starts a turn for the speaker named
    in that tag; a paragraph without one continues the current speaker.
    Consecutive paragraphs by the same speaker are merged into a single
    Utterance, joined with newlines.

    Args:
        url: address of the transcript page.

    Returns:
        A list of Utterance objects in document order. The speaker is None
        for text that appears before any bold speaker label.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page does not have the expected structure.
    """
    response = requests.get(url)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)

    containers = doc.cssselect("span.displaytext")
    if not containers:
        raise ValueError("no span.displaytext element found at {!r}".format(url))
    elements = containers[0].getchildren()

    # Position of the first <p> child; anything before it is ignored.
    first_p_tag_index = next(
        (i for i, el in enumerate(elements) if el.tag == "p"), None
    )
    if first_p_tag_index is None:
        raise ValueError("no <p> elements in the transcript at {!r}".format(url))

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not all(el.tag == "p" for el in elements[first_p_tag_index:]):
        raise ValueError(
            "unexpected non-<p> element in the transcript at {!r}".format(url)
        )

    # Skip a leading paragraph that mentions "MODERATOR"
    # (presumably the participants list -- TODO confirm).
    if "MODERATOR" in elements[first_p_tag_index].text_content():
        first_p_tag_index += 1

    p_tags = elements[first_p_tag_index:]

    utterances = []
    current_speaker = None

    for p_tag in p_tags:
        b_tags = p_tag.cssselect("b")
        if b_tags:
            # The first bold tag holds the speaker label, e.g. "SANDERS:".
            label = b_tags[0].text_content()
            # Remove only the first occurrence of the label so later mentions
            # of the same text inside the paragraph are kept.
            current_text = p_tag.text_content().replace(label, "", 1).strip()
            current_speaker = label.rstrip(":")
        else:
            # Continuation paragraph: same speaker, text kept unstripped.
            current_text = p_tag.text_content()

        if utterances and utterances[-1].speaker == current_speaker:
            utterances[-1].text += "\n" + current_text
        else:
            utterances.append(Utterance(current_speaker, current_text))

    return utterances

In [8]:
# 2016-04-14 - Democratic Candidates Debate in Brooklyn, New York
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=116995")
# Show the first three speaker turns, each followed by a separator line.
for utterance in utterances[:3]:
    print(repr(utterance), "------------------", sep="\n")


('BLITZER', "Secretary Clinton and Senator Sanders, you can now move to your lecterns while I explain a few ground rules. As moderator, I'll guide the discussion, asking questions and follow-ups. You'll also get questions from Dana Bash and Errol Louis. You'll each have one minute and 15 seconds to answer questions, 30 seconds for follow- ups. Timing lights will signal when your time is up. Both candidates have agreed to these rules now. Opening statements, you'll each have two minutes.\nLet's begin with Senator Sanders. [applause] ")
------------------
('SANDERS', "Wolf, thank you very much. CNN, thank you very much. Secretary Clinton, thank you very much.\nWhen we began this campaign almost a year ago, we started off at 3 percent in the polls. We were about 70 points behind Secretary Clinton. In the last couple of weeks, there were two polls out there that had us ahead. [applause] \nOf the last nine caucuses and primaries, we have won eight of them, many of them by landslide victories. [applause] \nOver the last year, we have received almost 7 million individual campaign contributions, averaging — guess what — $27 apiece, more individual campaign contributions than any candidate in American history at this point in a campaign.\nThe reason that our campaign has done so well is because we're doing something very radical: We're telling the American people the truth. And the truth is that this country is not going to move forward in a significant way for working people unless we overturn this disastrous Citizens United Supreme Court decision...[applause]... and unless we have real campaign reform so that billionaires and super PACs cannot buy elections. [applause] \nThis campaign is also determined to end a rigged economy where the rich get richer and everybody else get poorer, and create an economy that works for all of us, not just the 1 percent.\nThank you.")
------------------
('BLITZER', 'Secretary Clinton?')
------------------

In [9]:
# 2015-08-06 - Republican Candidates "Undercard" Debate in Cleveland, Ohio
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=110757")
# Show the first three speaker turns, each followed by a separator line.
for utterance in utterances[:3]:
    print(repr(utterance), "------------------", sep="\n")


('HEMMER', "This is first official event in the campaign for the Republican nomination for president. Welcome to Cleveland Ohio. It is debate night.\nI'm Bill Hemmer. ")
------------------
('MacCALLUM', "And I'm Martha MacCallum.\nIt all starts here. We are ready, the candidates are ready. We're live at the Quicken Loans Arena, where we have partnered with Facebook to bring you, the voter, into today's debate. ")
------------------
('HEMMER', "So you will hear from all 17 candidates tonight, and you'll meet seven of them right now, starting with three-time governor in the state of Texas, Rick Perry. [applause]")
------------------

In [10]:
# 2012-01-07 - Republican Candidates Debate in Manchester, New Hampshire
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=98813")
# Show the first three speaker turns, each followed by a separator line.
for utterance in utterances[:3]:
    print(repr(utterance), "------------------", sep="\n")


('SAWYER', 'And good evening to all of you. Welcome to Saint Anselm College and the first debate of the year, 2012. The voting is underway. And, George, those eight votes in Iowa reminded us on Tuesday every vote counts.')
------------------
('STEPHANOPOULOS', "No question about it, we are off and running. Great to be here with you, Josh. And now let's introduce the candidates: former Governor Jon Huntsman; Texas Congressman Ron Paul; former Governor of Massachusetts Mitt Romney; former Senator from Pennsylvania Rick Santorum; the former speaker of the House, Newt Gingrich; and Texas Governor Rick Perry.")
------------------
('SAWYER', "And it is time to remind everyone again of the rules, which are pretty straightforward, and we remind you again, they were negotiated and agreed to by the candidates themselves. So let's take you through them.\nOne-minute responses to the question, with 30 seconds for rebuttal. And we're showing everybody at home that the candidates will see green, and then when there's 15 seconds left, it will turn yellow and red when the time is up. \nOur audience was chosen by Saint Anselm College and WMUR. And all of you at home can watch on abcnews.com and yahoo.com. You can even join the discussion by downloading Yahoo's IntoNow app on your iPhone. You can pitch in your opinions during the debate.\nSo lets the -- let the debate begin.\nAnd, Governor Romney, we'll begin with you. We just saw 200,000 new jobs created last month, and there are optimists who say this is the signal that this economy is finally turning around. Are you with those optimists? ")
------------------

In [11]:
# 2011-06-13 - Republican Candidates Debate in Manchester, New Hampshire
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=90513")
# Show the first three speaker turns, each followed by a separator line.
for utterance in utterances[:3]:
    print(repr(utterance), "------------------", sep="\n")


('JOHN KING', "Welcome to Saint Anselm College in Manchester, New Hampshire, and the first Republican presidential debate in this first-in-the-nation primary state. Behind me on this stage, the Republican candidates for president appearing together on the same stage for the first time tonight.\nAnd tonight's debate will be different than any presidential debate you've ever seen. Over the course of the next two hours, in addition to questions from myself and journalists from our partners, WMUR and the New Hampshire Union Leader, the candidates will take questions directly from voters right here in Manchester, as well as from voters at town meetings taking place tonight all across New Hampshire. \nSo let's get right to it and meet the candidates. Now, we've asked for no opening statements. However, we will continue a tradition from our past New Hampshire debates, to ask each candidate in one short sentence -- hopefully, five, maybe six or seven seconds -- to introduce themselves to the voters of New Hampshire and the United States of America. \nLet me begin with an example. I'm John King with CNN. I am honored to be your moderator tonight, and I am thrilled to be back in Red Sox nation. \n[Applause]\nNow, let's start at the edge of the stage with Senator Rick Santorum. ")
------------------
('SANTORUM', "Hello, New Hampshire. I'm Rick Santorum. I served 12 years representing Pennsylvania in the United States Senate, but I also have substantial executive experience making the tough decisions and balancing budgets and cutting spending. Karen and I are the parents of seven children.\n[Applause]")
------------------
('KING', 'Congresswoman?')
------------------

In [12]:
# 2008-01-05 - Republican Candidates Debate in Manchester, New Hampshire
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=76223")
# Show the first three speaker turns, each followed by a separator line.
for utterance in utterances[:3]:
    print(repr(utterance), "------------------", sep="\n")


('DIANE SAWYER', "Well, it is time for the great debate to begin. Let's go to Charlie Gibson at St. Anselm College in Manchester, New Hampshire.\nCharlie? ")
------------------
('CHARLIE GIBSON', 'Thanks very much, Diane.\nAnd we have been joined on the stage by the six leading Republican candidates for the Republican nomination for president. And I want to introduce them to you from left to right. \nThe positions in which they sit were drawn by lot, and so let me introduce them from left to right: Senator John McCain from the state of Arizona, former Senator Fred Thompson from Tennessee, Congressman Ron Paul from Texas, former Governor Mitt Romney of Massachusetts, former Governor Mike Huckabee from Arkansas, and former Mayor Rudy Giuliani from the city of New York. \nAnd, gentlemen, just at the risk of being repetitive, I hope you will take the questions posed in these first 45 minutes and I hope, to the extent we can, discuss them among yourselves. ')
------------------
('MR. GIBSON', 'This is not about me asking questions, as I told the national audience a few moments ago, but about you talking to one another, pointing up the differences between yourselves.\nI hope you will think of this as sort of a semi-circular dining room table. We were a little chintzy on the food, but I hope you will look at it that way. \n(LAUGHTER) \nI thank you all for being here and I genuinely look forward to this, so let us begin. And I\'ll start the stop-watch. \nPresident Bush said in his end-of-the-year news conference, "During the primaries and during the general election I suspect my name may come up a lot." So let\'s bring it up. \nI want to start with foreign policy. Just to set some context, we\'ve got a little background here from ABC\'s Jonathan Karl. \n(BEGIN VIDEOTAPE) ')
------------------