Analysis of Presidential speech and election data

This notebook scrapes The American Presidency Project and downloads the campagin speeches of all 2016 presidential candidates. It then builds a markov chain out each president's data capable of generating sentences in the style of their campaign speeches.

Scrape Campaign Speeches


In [6]:
import pandas as pd
import numpy as np
import requests
from lxml import html
from bs4 import BeautifulSoup
import markovify
import os.path
from datetime import datetime, timedelta
import calendar
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import seaborn as sns
import re

In [7]:
def getCandidateSpeechLinks(url):
    allCandidatePage = requests.get(url)
    allCandidatePageSoup = BeautifulSoup(allCandidatePage.text, 'lxml')
    links={}
    table = allCandidatePageSoup.find('table', width=680)
    for area in table.findAll('td', class_='doctext'):
        for a in area.findAll('a'):
            if ('campaign' in a.text.lower()):
                links[area.find('span', class_='roman').text] = a['href']
    return links

def scrapeCampaignSpeechesToFile(url, path):
    allSpeechPages = requests.get(url)
    allSpeechSoup=BeautifulSoup(allSpeechPages.text, 'lxml')
    root = 'http://www.presidency.ucsb.edu/'
    table = allSpeechSoup.find('table', width=700)
    links = []
    for link in table.findAll('a'):
        if('interview' not in link.text.lower()):
            links.append(root+(link['href'])[3:])

    speechPages = [requests.get(link , 'lxml')for link in links]
    speechesSoup = [BeautifulSoup(speechPage.text, 'lxml') for speechPage in speechPages]

    with open(path, "w+", encoding='utf-8') as outFile:
        outFile.seek(0)
        for i,speech in enumerate(speechesSoup):            
            text = speechesSoup[i].find('span', class_='displaytext').text.replace('.','. ')
            text = re.sub('\[[a-zA-Z]*\]', ' ', text)
            text = re.sub('[A-Z]+ [A-Z]+:', ' ', text)
            text = re.sub('\w+:', ' ', text)
            text = re.sub(r'[^\x00-\x7F]+',' ', text)
            
            outFile.write(text +'\n')


def campaignLinkToFiles(url, year):
    
    dataFolder = './Campaign Speeches/'+ str(year) +'/'
    
    if not os.path.exists(dataFolder):
        os.makedirs(dataFolder)
    
    #Create the dictionary of each candidate's name and link to their campaign speech page    
    campaignSpeechLinkDict = getCandidateSpeechLinks(url)
    
    root = 'http://www.presidency.ucsb.edu/'
    new=0
    existing=0
    newpaths=[]
    #Loops through the campagin speech links, puts each candidate's campagin speeches into individual files
    for name, url in campaignSpeechLinkDict.items():
        path = dataFolder + name.replace(' ', '-') + '.txt'
        if not os.path.isfile(path):
            new+=1
            newpaths.append(path)
            scrapeCampaignSpeechesToFile(root + url, path)
        else:
            existing+=1
    
    
    
    #print some statistics
    print(str(existing), ' files already existed, ignoring.')
    print(str(new), ' files created successfully:')
    for p in newpaths:
        print(p)

In [8]:
campaignLinkToFiles('http://www.presidency.ucsb.edu/2016_election.php', 2016)


17  files already existed, ignoring.
4  files created successfully:
./Campaign Speeches/2016/Donald-Trump.txt
./Campaign Speeches/2016/Ted-Cruz.txt
./Campaign Speeches/2016/Bernie-Sanders.txt
./Campaign Speeches/2016/Carly-Fiorina.txt

In [ ]:
campaignLinkToFiles('http://www.presidency.ucsb.edu/2008_election.php', 2008)

In [5]:
import pandas as pd
with open('./EDA/tweet_activity_metrics_reaIDonaldTrmp_20170425_20170509_en.csv', mode='r+') as f:
    twitter = f.read()
    df = pd.DataFrame(data=twitter.)
df.head()


---------------------------------------------------------------------------
PandasError                               Traceback (most recent call last)
<ipython-input-5-b92d887ed0de> in <module>()
      2 with open('./EDA/tweet_activity_metrics_reaIDonaldTrmp_20170425_20170509_en.csv', mode='r+') as f:
      3     twitter = f.read()
----> 4     df = pd.DataFrame(data=twitter)
      5 df.head()

C:\Users\nick\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
    343                                          copy=False)
    344             else:
--> 345                 raise PandasError('DataFrame constructor not properly called!')
    346 
    347         NDFrame.__init__(self, mgr, fastpath=True)

PandasError: DataFrame constructor not properly called!

In [ ]: