In [1]:
import os
import re
from collections import defaultdict
import nltk
from wordcloud import WordCloud
from scipy.misc import imread
import matplotlib.pyplot as plt
import random
import requests
In [2]:
def get_transcripts(output_dir, first_episode=101, last_episode=277):
    """Download TNG episode transcripts from chakoteya.net into output_dir.

    Each episode page is fetched, decoded from the site's latin-1 encoding,
    and saved as UTF-8 to script<episode>.htm.

    Args:
        output_dir: directory to write the .htm files into (created if absent)
        first_episode: first episode number to fetch (default 101)
        last_episode: last episode number to fetch, inclusive (default 277,
            matching the original hard-coded range(101, 278))
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for episode_num in range(first_episode, last_episode + 1):
        url = 'http://www.chakoteya.net/NextGen/{}.htm'.format(episode_num)
        # Pages are served as latin-1; transcode to UTF-8 on disk.
        html = requests.get(url).content.decode('latin-1')
        # 'wb' (was 'wb+'): we only write, never read back.
        out_path = os.path.join(output_dir, 'script{}.htm'.format(episode_num))
        with open(out_path, 'wb') as f:
            f.write(html.encode('utf-8'))
In [3]:
def get_all_lines(episode):
    """Extract the spoken dialogue lines from one episode's raw HTML.

    Splits on <br> tags, drops the first three segments (title/airdate
    metadata), strips HTML tags and stage directions, and returns the
    remaining non-empty lines.

    Args:
        episode: the full HTML text of one episode transcript

    Returns:
        list of cleaned dialogue strings
    """
    lines = []
    # skip the airdate and other metadata
    for line in episode.split('<br>')[3:]:
        # Raw strings: the original non-raw '\(' / '\s*\[' escapes are
        # deprecated and become errors in newer Python versions.
        new_line = re.sub(r'\r\n', ' ', line)
        # NOTE(review): this looks like a no-op (space -> space); possibly the
        # original pattern was a non-breaking space (&nbsp;/U+00A0) -- confirm.
        new_line = re.sub(' ', ' ', new_line)
        # Remove HTML tags
        new_line = re.sub(r'<[^>]+>', '', new_line)
        # Remove scene descriptions (in parens and brackets)
        new_line = re.sub(r'\([^)]+\)', '', new_line)
        new_line = re.sub(r'\s*\[[^\]]+\]', '', new_line)
        # Remove a trailing unclosed tag, if any
        new_line = re.sub(r'<.*', '', new_line)
        new_line = new_line.strip()
        if len(new_line) > 0:
            lines.append(new_line)
    return lines
def get_lines(script_dir, parse_file_function=get_all_lines):
    """Parse every downloaded episode script and return all dialogue lines.

    Args:
        script_dir: directory containing script<episode>.htm files
        parse_file_function: function mapping raw HTML -> list of lines

    Returns:
        concatenated list of lines from all episodes that could be read
    """
    all_lines = []
    # 101 then 103..277; episode 102 is skipped -- presumably 101 covers a
    # double-length episode stored as one file (confirm against the files).
    # Python 3 fix: `[101] + range(...)` only worked on Python 2;
    # range() must be materialized before concatenation.
    for episode_num in [101] + list(range(103, 278)):
        try:
            path = os.path.join(script_dir, 'script{}.htm'.format(episode_num))
            with open(path, 'r') as f:
                html = f.read()
            all_lines += parse_file_function(html)
        except OSError:
            # Narrowed from a bare except: only missing/unreadable files are
            # expected here; parse errors should surface, not be hidden.
            print("could not open episode", episode_num)
    return all_lines
In [4]:
def get_two_word_phrases(lines, output_filename="two_word_phrases.txt"):
    """Count consecutive word pairs (bigrams) and write the common ones.

    Tokenizes each line with nltk, re-attaches contraction fragments
    ('s, n't, 'll, ...) to the preceding word, skips punctuation tokens,
    and counts consecutive word pairs.  Pairs seen more than 10 times are
    written to output_filename, most frequent first, one
    "first second count" entry per line.

    Args:
        lines: iterable of dialogue strings
        output_filename: path of the report file to write
    """
    # phrase[first][second] -> number of times the bigram occurred.
    # BUG FIX: the original used defaultdict(defaultdict) and, on first
    # sight of a new first-word, created the inner dict WITHOUT counting
    # the bigram -- losing one count per first-word.  An int-valued inner
    # defaultdict lets us count unconditionally.
    phrase = defaultdict(lambda: defaultdict(int))
    previous = ""
    for line in lines:
        try:
            for word, tag in nltk.pos_tag(nltk.word_tokenize(line)):
                word = word.lower()
                # Fold contraction fragments into the previous word
                if word in ["'s", "n't", "'ll", "'ve", "'m", "'re", "'d"]:
                    previous += word
                    continue
                # Skip punctuation tokens (tagged ',', '.', ':')
                if tag not in [',', '.', ':']:
                    phrase[previous][word] += 1
                    previous = word
        except Exception:
            # tokenizer/tagger failures: log the offending line and continue
            print(line)
    sayings = []
    for first_word, second_dict in phrase.items():
        for second_word, count in second_dict.items():
            if count > 10:
                sayings.append([count, " ".join([first_word, second_word])])
    sayings.sort(reverse=True)
    with open(output_filename, 'w') as f:
        for v, k in sayings:
            f.write(k + ' ' + str(v) + '\n')
In [5]:
def get_word_counts(lines):
    """Count word occurrences across all dialogue lines.

    Args:
        lines: iterable of dialogue strings

    Returns:
        defaultdict mapping lowercased word -> occurrence count,
        with punctuation tokens (tagged ',', '.', ':') excluded
    """
    all_words = defaultdict(int)
    for line in lines:
        try:
            tokens = nltk.pos_tag(nltk.word_tokenize(line))
            # TODO: stemming was tried here to collate things like
            # weapon/weapons, but the results were poor.
            for word, tag in tokens:
                if tag not in [',', '.', ':']:
                    word_stem = word.lower()
                    all_words[word_stem] += 1
        except Exception:
            # tokenizer/tagger failures: log the offending line and continue
            # (Python 3 fix: the original used a py2 print statement)
            print(line)
    return all_words
In [6]:
def write_sorted_words(word_dict, out_filename="sorted_word_list.txt"):
    """Write word counts to a file as "word count" lines.

    Ordering matches the original (count, word) reverse sort: descending
    count, with ties broken by word in descending order.

    Args:
        word_dict: mapping of word -> count
        out_filename: path of the report file to write
    """
    # Python 3 fix: dict.iteritems() no longer exists; a single keyed sort
    # replaces the original tuple-swap dance.
    ordered = sorted(word_dict.items(), key=lambda kv: (kv[1], kv[0]),
                     reverse=True)
    with open(out_filename, 'w') as f:
        for word, count in ordered:
            f.write(word + ' ' + str(count) + '\n')
In [7]:
def make_wordcloud(word_filename, mask, font, show_image=True, show_mask=False,
                   out_image="test.png"):
    """Render a word cloud image from a "word count"-per-line file.

    Args:
        word_filename: text file with one "word [word...] count" entry per line
        mask: path to an image file defining the cloud's shape
        font: path to a .ttf font file
        show_image: display the rendered cloud with matplotlib
        show_mask: also display the raw mask image in a second figure
        out_image: path to save the rendered PNG (generalized from the
            original hard-coded "test.png"; default preserved)
    """
    # This function chooses colors randomly from the given palette.
    def tng_color_func(word, font_size, position, orientation,
                       random_state=None, **kwargs):
        # Color palette for the ship's computer displays.
        palette = ["rgb(255, 153, 0)", "rgb(204, 153, 204)", "rgb(153, 153, 204)",
                   "rgb(204, 102, 102)", "rgb(255, 204, 153)", "rgb(153, 153, 255)",
                   "rgb(255, 153, 102)", "rgb(204, 102, 153)"]
        return palette[random.randint(0, len(palette) - 1)]

    # NOTE(review): scipy.misc.imread was removed in SciPy 1.2; if this fails,
    # switch to imageio.imread or matplotlib.pyplot.imread (and verify the
    # returned dtype/value range still suits WordCloud's mask handling).
    tng_mask = imread(mask)
    wc = WordCloud(background_color="black", max_words=250, mask=tng_mask,
                   color_func=tng_color_func, font_path=font, max_font_size=80,
                   scale=4, width=2700, height=4800,
                   prefer_horizontal=0.5, ranks_only=False)
    word_list = []
    with open(word_filename, 'r') as f:
        for line in f:
            tokens = line.strip().split(' ')
            # All tokens but the last form the (possibly multi-word) phrase.
            word = " ".join(tokens[:-1])
            count = int(tokens[-1].strip())
            word_list.append([word, count])
    # generate word cloud
    # NOTE(review): newer wordcloud releases require a dict here
    # (dict(word_list)); the list-of-pairs form only works on old versions.
    wc.generate_from_frequencies(word_list)
    # store to file
    wc.to_file(out_image)
    # show the resulting word cloud
    if show_image:
        plt.imshow(wc)
        plt.axis("off")
    if show_mask:
        plt.figure()
        plt.imshow(tng_mask, cmap=plt.cm.gray)
        plt.axis("off")
    plt.show()
In [ ]:
# Run the full pipeline: parse the downloaded scripts, then write the
# single-word and two-word frequency reports.
# (Download with get_transcripts("script_dir") first if the files are missing.)
all_lines = get_lines("script_dir", get_all_lines)
word_counts = get_word_counts(all_lines)
write_sorted_words(word_counts, out_filename="all_cast_words.txt")
# BUG FIX: the keyword must match the function definition
# (output_filename, not out_filename) or this call raises TypeError.
get_two_word_phrases(all_lines, output_filename="two_word_phrases.txt")
In [14]:
# Render the cloud using the TNG emblem as the mask and the show's title font.
# NOTE(review): reads "sorted_cloud_words.txt", but the pipeline cell above
# writes "all_cast_words.txt" -- presumably a hand-curated word list; confirm
# the file exists before running.
make_wordcloud("sorted_cloud_words.txt", "tng_emblem.png", "TNG_Title.ttf")
In [ ]: