In [ ]:
%matplotlib inline

In [ ]:
# pip install wordcloud nltk matplotlib pillow ipython[notebook]
# >>> nltk.download()
#
import os
import json
from os import path
from wordcloud import WordCloud
import nltk
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import matplotlib.pyplot as plt

In [ ]:
# Fetch the NYT transcript of the Nov 10, 2015 Republican debate and group
# its paragraphs by speaker.
url = 'http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html'
response = requests.get(url)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
raw = BeautifulSoup(response.text, "html.parser")
paragraphs = raw.find_all('p', {'class': "story-body-text story-content", 'itemprop': "articleBody"})

# Participants (candidates first, then moderators), in the ALL-CAPS form the
# transcript uses to prefix each turn, e.g. "TRUMP: ...".
speakers = [
    'TRUMP',
    'CARSON',
    'RUBIO',
    'CRUZ',
    'BUSH',
    'FIORINA',
    'KASICH',
    'PAUL',
    'CAVUTO',
    'BAKER',
    'BARTIROMO',
    '(UNKNOWN)',
]

# Stage directions and filler paragraphs that carry no spoken content.
IGNORE_PARAGRAPHS = [
    '(APPLAUSE)',
    '(LAUGHTER)',
    '(CHEERING)',
    '(CHEERING AND APPLAUSE)',
    '(AUDIENCE REACTION)',
    '(BOOING)',
    '(CROSSTALK)',
    '(COMMERCIAL BREAK)',
    '(BELL RINGING)',
    '(AUDIO GAP)',
    '(MUSIC)',
    'MORE',
]

# speaker name -> list of <p> tags attributed to that speaker
pgs = defaultdict(list)

# An answer spans multiple paragraphs; only the first carries the
# "SPEAKER:" prefix, so remember who spoke last.
last_speaker = None

for p in paragraphs:
    if p.text in IGNORE_PARAGRAPHS:
        continue

    p_has_speaker = False
    for speaker in speakers:
        if p.text.startswith('{}:'.format(speaker)):
            last_speaker = speaker
            p_has_speaker = True
            pgs[speaker].append(p)
            # BUG FIX: was `continue`, which only advanced the inner speaker
            # loop; once a prefix matches there is nothing left to check.
            break
    # Continuation paragraph: attribute it to whoever spoke last.  Guard
    # against leading paragraphs before any speaker tag (previously these
    # were silently binned under pgs[None]).
    if not p_has_speaker and last_speaker is not None:
        pgs[last_speaker].append(p)

In [ ]:
def pretty_print(common_words):
    """Format (word, count) pairs as one comma-separated display string."""
    formatted = []
    for word, count in common_words:
        formatted.append('  {word}: {count}'.format(word=word, count=count))
    return ', '.join(formatted)

In [ ]:
def generate_cloud(candidate):
    """Render, display, and save a word cloud for one speaker.

    Pulls the speaker's paragraphs from the module-level `pgs` dict,
    POS-tags the text to build a stop-word list (keeping nouns, verbs,
    adjectives, and numbers), then draws a WordCloud subtitled with the
    speaker's eight most common interesting words.  The figure is saved
    to ~/Desktop/wordcloud/<speaker>_word_cloud.png.

    Args:
        candidate: speaker key as it appears in `pgs` (e.g. 'TRUMP').
    """
    SPEAKER = candidate
    all_text = ' '.join([p.text for p in pgs[SPEAKER]])
    # Strip the speaker's own name prefix and glue multi-word terms together
    # so they survive tokenization as a single token.
    all_text = all_text.replace(SPEAKER, '').replace('Dodd-Frank', 'DoddFrank').replace('Middle East', 'MiddleEast')
    all_text = all_text.replace("don't", 'dont')

    tokens = nltk.wordpunct_tokenize(all_text)
    tagged_tokens = nltk.pos_tag([token.lower() for token in tokens])
    fd = nltk.FreqDist(tagged_tokens)

    # Keep content words (nouns/verbs/adjectives/numbers); every other POS
    # class becomes a stop word, plus hand-picked filler tokens and
    # punctuation fragments left by the tokenizer.  (Consolidated from three
    # extend() calls that duplicated 'have'.)
    interesting_classes = ['NN', 'NNP', 'NNS', 'VB', 'VBP', 'JJ', 'JJS', 'JJR', 'CD', 'NNPS', 'VBG']
    stop_words = [tok[0][0] for tok in fd.most_common(10000) if tok[0][1] not in interesting_classes]
    stop_words.extend(['be', 'u', 'going', 'have', 'are',
                       '\u2019', 'don', 're', '\u2014', 't', 'do',
                       '1', '2', '5', 'm', 'i', 've', 's', '\u201c', 'll'])

    most_common_words = [(tok[0][0], tok[1]) for tok in fd.most_common(10000) if tok[0][0] not in stop_words]
    most_common_words = most_common_words[:8]

    wordcloud = WordCloud(stopwords=stop_words, width=1600, height=800).generate(all_text)
    plt.figure(figsize=(24, 12))
    plt.imshow(wordcloud)
    plt.axis("off")

    plt.suptitle('{0}\'s Word Cloud'.format(SPEAKER.title()), fontsize=24, fontweight='bold')
    plt.title(pretty_print(most_common_words), fontsize=14)

    # BUG FIX: save BEFORE show() — with the inline backend show() finalizes
    # the current figure, so a later savefig() wrote a blank image.
    home_dir = os.path.expanduser('~')
    output_dir = os.path.join(home_dir, 'Desktop', 'wordcloud')
    # BUG FIX: savefig raises if the target directory does not exist.
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, '{0}_word_cloud.png'.format(SPEAKER.lower()))
    plt.savefig(output_file, bbox_inches='tight')

    plt.show()

# Only the first eight entries of `speakers` are candidates; the rest are
# moderators, who get no cloud.
candidates = speakers[:8]
for debate_candidate in candidates:
    generate_cloud(debate_candidate)