In [ ]:
%matplotlib inline
In [ ]:
# pip install wordcloud nltk matplotlib pillow ipython[notebook]
# >>> nltk.download()
#
import os
import json
from os import path
from wordcloud import WordCloud
import nltk
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import matplotlib.pyplot as plt
In [ ]:
# Fetch the NYT transcript of the 2015-11-10 Republican debate and pull out
# the article-body <p> tags that hold the spoken text (class/itemprop filters
# match the NYT story-body markup of that era).
url = 'http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html'
response = requests.get(url)
raw = BeautifulSoup(response.text, "html.parser")
paragraphs = raw.find_all('p', {'class': "story-body-text story-content", 'itemprop': "articleBody"})
# Everyone tagged as a speaker in the transcript: the eight candidates first
# (the driver loop at the bottom slices speakers[:8]), then the moderators and
# the transcript's own "(UNKNOWN)" attribution.
speakers = [
'TRUMP',
'CARSON',
'RUBIO',
'CRUZ',
'BUSH',
'FIORINA',
'KASICH',
'PAUL',
'CAVUTO',
'BAKER',
'BARTIROMO',
'(UNKNOWN)',
]
# Transcript paragraphs that are stage directions or pagination markers rather
# than spoken words; these are dropped before grouping text by speaker.
IGNORE_PARAGRAPHS = [
'(APPLAUSE)',
'(LAUGHTER)',
'(CHEERING)',
'(CHEERING AND APPLAUSE)',
'(AUDIENCE REACTION)',
'(BOOING)',
'(CROSSTALK)',
'(COMMERCIAL BREAK)',
'(BELL RINGING)',
'(AUDIO GAP)',
'(MUSIC)',
'MORE',
]
# Group the paragraphs by speaker: a paragraph starting with "NAME:" opens that
# speaker's turn, and subsequent untagged paragraphs belong to whoever spoke last.
pgs = defaultdict(list)
last_speaker = None
for p in paragraphs:
    # Skip stage directions / pagination markers.
    if p.text in IGNORE_PARAGRAPHS:
        continue
    p_has_speaker = False
    for speaker in speakers:
        if p.text.startswith('{}:'.format(speaker)):
            last_speaker = speaker
            p_has_speaker = True
            pgs[speaker].append(p)
            # A paragraph names at most one speaker — stop scanning the rest.
            # (The original `continue` only continued this inner loop, so it
            # pointlessly kept testing the remaining speaker prefixes.)
            break
    # Continuation paragraph: attribute it to the most recent speaker. The
    # guard avoids filing leading untagged paragraphs under a None key.
    if not p_has_speaker and last_speaker is not None:
        pgs[last_speaker].append(p)
In [ ]:
def pretty_print(common_words):
    """Render (word, count) pairs as one comma-separated summary string."""
    formatted = []
    for word, count in common_words:
        formatted.append(' {word}: {count}'.format(word=word, count=count))
    return ', '.join(formatted)
In [ ]:
def generate_cloud(candidate):
    """Build, display, and save a word cloud for one debate participant.

    Reads the module-level ``pgs`` mapping (speaker -> paragraph tags) and
    writes ``<candidate>_word_cloud.png`` under ``~/Desktop/wordcloud/``.
    """
    all_text = ' '.join([p.text for p in pgs[candidate]])
    # Drop the speaker's own tag and glue multi-word terms together so they
    # survive tokenization as single tokens.
    all_text = all_text.replace(candidate, '').replace('Dodd-Frank', 'DoddFrank').replace('Middle East', 'MiddleEast')
    all_text = all_text.replace("don't", 'dont')
    tokens = nltk.wordpunct_tokenize(all_text)
    tagged_tokens = nltk.pos_tag([token.lower() for token in tokens])
    fd = nltk.FreqDist(tagged_tokens)
    # Keep nouns, verbs, adjectives, and numbers; every other POS class is
    # treated as a stop word.
    interesting_classes = ['NN', 'NNP', 'NNS', 'VB', 'VBP', 'JJ', 'JJS', 'JJR', 'CD', 'NNPS', 'VBG']
    stop_words = [tok[0][0] for tok in fd.most_common(10000) if tok[0][1] not in interesting_classes]
    # Hand-curated leftovers the POS filter misses: contraction fragments,
    # smart quotes/dashes, and stray digits.
    stop_words.extend(['be', 'u', 'going', 'have', 'are'])
    stop_words.extend(['have', '’', 'don', 're', '—', 't', 'do'])
    stop_words.extend(['1', '2', '5', 'm', 'i', 've', 's', '“', 'll'])
    most_common_words = [(tok[0][0], tok[1]) for tok in fd.most_common(10000) if tok[0][0] not in stop_words]
    most_common_words = most_common_words[:8]
    wordcloud = WordCloud(stopwords=stop_words, width=1600, height=800).generate(all_text)
    plt.figure(figsize=(24,12))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.suptitle('{0}\'s Word Cloud'.format(candidate.title()), fontsize=24, fontweight='bold')
    plt.title(pretty_print(most_common_words), fontsize=14)
    # Create the output directory up front: savefig raises FileNotFoundError
    # if ~/Desktop/wordcloud does not already exist.
    output_dir = os.path.join(os.path.expanduser('~'), 'Desktop', 'wordcloud')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_file = os.path.join(output_dir, '{0}_word_cloud.png'.format(candidate.lower()))
    # Save BEFORE plt.show(): with interactive/inline backends show() can
    # release the current figure, which made the saved PNG come out blank.
    plt.savefig(output_file, bbox_inches='tight')
    plt.show()
# Render one cloud per candidate — the first eight entries of `speakers` are
# the candidates; the remainder are moderators and the "(UNKNOWN)" tag.
for name in speakers[:8]:
    generate_cloud(name)