Analysis of Presidential speech and election data
This notebook scrapes The American Presidency Project and downloads the campaign speeches of all 2016 presidential candidates. It then builds a Markov chain out of each candidate's data, capable of generating sentences in the style of their campaign speeches.
Scrape Campaign Speeches
In [6]:
import pandas as pd
import numpy as np
import requests
from lxml import html
from bs4 import BeautifulSoup
import markovify
import os.path
from datetime import datetime, timedelta
import calendar
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import seaborn as sns
import re
In [7]:
def getCandidateSpeechLinks(url):
    """Return a dict mapping each candidate's name to the link for their campaign speech page."""
    allCandidatePage = requests.get(url)
    allCandidatePageSoup = BeautifulSoup(allCandidatePage.text, 'lxml')
    links = {}
    table = allCandidatePageSoup.find('table', width='680')
    for area in table.findAll('td', class_='doctext'):
        for a in area.findAll('a'):
            if 'campaign' in a.text.lower():
                links[area.find('span', class_='roman').text] = a['href']
    return links
def scrapeCampaignSpeechesToFile(url, path):
    """Scrape every speech linked from a candidate's page and write the cleaned text to one file."""
    allSpeechPages = requests.get(url)
    allSpeechSoup = BeautifulSoup(allSpeechPages.text, 'lxml')
    root = 'http://www.presidency.ucsb.edu/'
    table = allSpeechSoup.find('table', width='700')
    links = []
    for link in table.findAll('a'):
        if 'interview' not in link.text.lower():
            links.append(root + link['href'][3:])  # strip the leading '../' from relative links
    speechPages = [requests.get(link) for link in links]
    speechesSoup = [BeautifulSoup(speechPage.text, 'lxml') for speechPage in speechPages]
    with open(path, "w+", encoding='utf-8') as outFile:
        outFile.seek(0)
        for speech in speechesSoup:
            # Ensure a space after each period so sentences tokenize cleanly
            text = speech.find('span', class_='displaytext').text.replace('.', '. ')
            text = re.sub(r'\[[a-zA-Z]*\]', ' ', text)   # stage directions, e.g. [applause]
            text = re.sub(r'[A-Z]+ [A-Z]+:', ' ', text)  # all-caps speaker labels, e.g. JOHN SMITH:
            text = re.sub(r'\w+:', ' ', text)            # remaining single-word speaker labels
            text = re.sub(r'[^\x00-\x7F]+', ' ', text)   # non-ASCII characters
            outFile.write(text + '\n')
def campaignLinkToFiles(url, year):
    """Scrape all candidates' campaign speeches for an election year into per-candidate files."""
    dataFolder = './Campaign Speeches/' + str(year) + '/'
    if not os.path.exists(dataFolder):
        os.makedirs(dataFolder)
    # Create the dictionary of each candidate's name and link to their campaign speech page
    campaignSpeechLinkDict = getCandidateSpeechLinks(url)
    root = 'http://www.presidency.ucsb.edu/'
    new = 0
    existing = 0
    newpaths = []
    # Loop through the campaign speech links, writing each candidate's campaign speeches to an individual file
    for name, url in campaignSpeechLinkDict.items():
        path = dataFolder + name.replace(' ', '-') + '.txt'
        if not os.path.isfile(path):
            new += 1
            newpaths.append(path)
            scrapeCampaignSpeechesToFile(root + url, path)
        else:
            existing += 1
    # Print some statistics
    print(str(existing), 'files already existed, ignoring.')
    print(str(new), 'files created successfully:')
    for p in newpaths:
        print(p)
In [8]:
campaignLinkToFiles('http://www.presidency.ucsb.edu/2016_election.php', 2016)
In [ ]:
campaignLinkToFiles('http://www.presidency.ucsb.edu/2008_election.php', 2008)
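With the speech files on disk, the Markov-chain step described in the introduction can be run. A minimal sketch using markovify follows; the filename here is an assumption, so substitute any per-candidate file printed above.
In [ ]:
# Minimal sketch of the generation step from the introduction.
# The filename is an assumption; use any per-candidate file created above.
with open('./Campaign Speeches/2016/Donald-Trump.txt', encoding='utf-8') as f:
    model = markovify.Text(f.read())
# make_sentence() may return None if no valid sentence is found
print(model.make_sentence())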
In [5]:
import pandas as pd
# Load the tweet-activity export straight into a DataFrame
df = pd.read_csv('./EDA/tweet_activity_metrics_reaIDonaldTrmp_20170425_20170509_en.csv')
df.head()
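As a quick look at the loaded metrics, here is a hedged sketch of a time-series plot. It assumes the export contains Twitter's standard 'time' and 'impressions' columns; adjust the names to whatever df.head() actually shows.
In [ ]:
# Column names are assumptions based on Twitter's analytics export format
df['time'] = pd.to_datetime(df['time'])
plt.plot(df['time'], df['impressions'])
plt.xlabel('Date')
plt.ylabel('Impressions')
plt.title('Tweet impressions over time')
plt.show()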