In [1]:
import os
import re
from collections import defaultdict
import nltk
from wordcloud import WordCloud
from imageio import imread  # scipy.misc.imread was removed in SciPy 1.2; imageio is the standard replacement
import matplotlib.pyplot as plt
import random
import requests
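
nltk.word_tokenize and nltk.pos_tag rely on downloadable NLTK data; a one-time setup cell (a no-op if the packages are already present):

In [ ]:
# One-time download of the NLTK models used in the cells below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')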

In [2]:
def get_transcripts(output_dir):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # TNG episodes are numbered 101 to 277 on chakoteya.net.
    for episode_num in range(101, 278):
        url = 'http://www.chakoteya.net/NextGen/{}.htm'.format(episode_num)
        # The site serves latin-1 pages; store them locally as UTF-8.
        html = requests.get(url).content.decode('latin-1')
        with open(os.path.join(output_dir, 'script{}.htm'.format(episode_num)), 'wb') as f:
            f.write(html.encode('utf-8'))
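
A one-off call populates the directory the later cells read from (assuming the site is reachable and "script_dir" matches the name used at the end of the notebook):

In [ ]:
# Download every transcript into script_dir; only needs to run once.
get_transcripts("script_dir")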

In [3]:
def get_all_lines(episode):
    lines = []
    # skip the airdate and other metadata
    for line in episode.split('<br>')[3:]:
        new_line = re.sub(r'\r\n', ' ', line)
        new_line = re.sub(r'&nbsp;', ' ', new_line)
        # Remove HTML tags
        new_line = re.sub(r'<[^>]+>', '', new_line)
        # Remove scene descriptions (in parens and brackets)
        new_line = re.sub(r'\([^)]+\)', '', new_line)
        new_line = re.sub(r'\s*\[[^\]]+\]', '', new_line)
        new_line = re.sub(r'&lt;.*', '', new_line)
        new_line = new_line.strip()
        if len(new_line) > 0:
            lines.append(new_line)
    return lines

def get_lines(script_dir, parse_file_function=get_all_lines):
    all_lines = []
    # 102 is skipped: the two-part pilot appears to share the file for 101.
    for episode_num in [101] + list(range(103, 278)):
        try:
            with open(os.path.join(script_dir, 'script{}.htm'.format(episode_num)), 'r', encoding='utf-8') as f:
                html = f.read()
                all_lines += parse_file_function(html)
        except OSError:
            print("could not open episode", episode_num)
    return all_lines
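
To see what the cleanup regexes do, here is a made-up fragment in the transcript site's markup style:

In [ ]:
# Hypothetical fragment: metadata, a scene heading, a spoken line with a
# stage direction, and a stray &nbsp; -- only the dialogue survives.
sample = ("title<br>airdate<br>stardate<br>"
          "[Bridge]\r\n<br>PICARD: Engage. (He points.)<br>&nbsp;<br>")
print(get_all_lines(sample))
# ['PICARD: Engage.']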

In [4]:
def get_two_word_phrases(lines, output_filename="two_word_phrases.txt"):
    phrase = defaultdict(lambda: defaultdict(int))
    previous = ""
    for line in lines:
        try:
            for word, tag in nltk.pos_tag(nltk.word_tokenize(line)):
                word = word.lower()
                # Re-attach contraction fragments ("'s", "n't", ...) to the previous word.
                if word in ["'s", "n't", "'ll", "'ve", "'m", "'re", "'d"]:
                    previous += word
                    continue
                # Skip punctuation tokens; count the (previous, current) bigram.
                if tag not in [',', '.', ':']:
                    phrase[previous][word] += 1
                    previous = word
        except Exception:
            print(line)

    sayings = []
    for first_word, second_dict in phrase.items():
        for second_word in second_dict:
            if second_dict[second_word] > 10:
                sayings.append([second_dict[second_word], " ".join([first_word, second_word])])
    sayings.sort(reverse=True)
    with open(output_filename, 'w') as f:
        for v, k in sayings:
            f.write(k + ' ' + str(v) + '\n')
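
For intuition, a toy run: because previous carries across lines, repeating one line a dozen times pushes its bigrams past the >10 cutoff:

In [ ]:
# Toy input; a real corpus is needed for anything interesting.
get_two_word_phrases(["Make it so, Number One."] * 12,
                     output_filename="toy_phrases.txt")
print(open("toy_phrases.txt").read())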

In [5]:
def get_word_counts(lines):
    all_words = defaultdict(int)
    for line in lines:
        try:
            tokens = nltk.pos_tag(nltk.word_tokenize(line))
            # TODO: I had tried to use stemming here to collate things like weapon and weapons,
            #       but couldn't really get it to work well.
            for word, tag in tokens:
                if tag not in [',', '.', ':']:
                    word_stem = word.lower()
                    all_words[word_stem] += 1
        except Exception:
            print(line)
    return all_words
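
The TODO above is about collapsing inflected forms. One alternative to stemming (not what this notebook ends up using) is NLTK's WordNet lemmatizer; a minimal sketch, assuming nltk.download('wordnet') has been run:

In [ ]:
# Sketch only: lemmatizing nouns would let 'weapon' and 'weapons' share a count.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("weapons"))  # -> weapon
print(lemmatizer.lemmatize("ships"))    # -> ship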

In [6]:
def write_sorted_words(word_dict, out_filename="sorted_word_list.txt"):
    sorted_words = [(v, k) for (k, v) in word_dict.items()]
    sorted_words.sort(reverse=True)
    words = [(k,v) for (v,k) in sorted_words]
    with open(out_filename, 'w') as f:
        for k,v in words:
            f.write(k+' '+str(v)+'\n')
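
The output format, one "word count" pair per line, is exactly what make_wordcloud parses back in below; a toy round trip:

In [ ]:
# Toy round trip of the file format consumed by make_wordcloud.
write_sorted_words({"engage": 42, "make it so": 7}, out_filename="toy_words.txt")
print(open("toy_words.txt").read())
# engage 42
# make it so 7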

In [7]:
def make_wordcloud(word_filename, mask, font, show_image=True, show_mask=False):

    # This function chooses colors randomly from the given palette.
    def tng_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        # Color palette for the ship's computer displays.
        palette = ["rgb(255, 153, 0)", "rgb(204, 153, 204)", "rgb(153, 153, 204)",
                   "rgb(204, 102, 102)", "rgb(255, 204, 153)", "rgb(153, 153, 255)",
                   "rgb(255, 153, 102)", "rgb(204, 102, 153)"]
        return random.choice(palette)

    tng_mask = imread(mask)
    # The cloud takes its dimensions from the mask when one is supplied.
    # (The old ranks_only argument is gone from newer wordcloud releases.)
    wc = WordCloud(background_color="black", max_words=250, mask=tng_mask, color_func=tng_color_func,
                   font_path=font, max_font_size=80, scale=4,
                   prefer_horizontal=0.5)

    # Read the "word count" lines back into a word -> frequency dict,
    # which is what generate_from_frequencies expects.
    word_freqs = {}
    with open(word_filename, 'r') as f:
        for line in f:
            tokens = line.strip().split(' ')
            word = " ".join(tokens[:-1])
            count = int(tokens[-1])
            word_freqs[word] = count

    # generate word cloud
    wc.generate_from_frequencies(word_freqs)

    # store to file
    wc.to_file("test.png")

    # show the resulting word cloud
    if show_image:
        plt.imshow(wc)
        plt.axis("off")
        if show_mask:
            plt.figure()
            plt.imshow(tng_mask, cmap=plt.cm.gray)
            plt.axis("off")
        plt.show()

In [ ]:
all_lines = get_lines("script_dir", get_all_lines)
word_counts = get_word_counts(all_lines)
write_sorted_words(word_counts, out_filename="all_cast_words.txt")
get_two_word_phrases(all_lines, output_filename="two_word_phrases.txt")

In [14]:
make_wordcloud("sorted_cloud_words.txt", "tng_emblem.png", "TNG_Title.ttf")
