In [63]:
import json
import argparse
import os
import random
import gensim
import nltk
In [3]:
# __file__ is undefined inside a notebook, so hard-code the working directory
#CUR_DIR = os.path.dirname(os.path.realpath(__file__))
CUR_DIR = "."
print(CUR_DIR)
In [4]:
# loading the full GoogleNews vectors uses a ton of memory and takes a long time
print('loading model')
model_path = os.path.join(CUR_DIR, '../model/GoogleNews-vectors-negative300.bin')
model = gensim.models.Word2Vec.load_word2vec_format(model_path, binary=True)
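On gensim 1.0 and later this loader lives on KeyedVectors instead (Word2Vec.load_word2vec_format was removed entirely in gensim 4.0). An equivalent load on a newer gensim, using the optional limit argument to cap the vocabulary and cut memory use:
In [ ]:
# equivalent load on gensim >= 1.0; limit= reads only the first N vectors,
# trading vocabulary coverage for a much smaller memory footprint
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(model_path, binary=True, limit=500000)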
In [5]:
# test out model
print(model.most_similar(positive = ['woman', 'king'], negative=['man']))
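most_similar returns a list of (word, cosine similarity) tuples, best match first, so the top entry is the model's answer to the analogy:
In [ ]:
# print each candidate with its similarity score; the first line is the
# model's answer to man : king :: woman : ?
for word, score in model.most_similar(positive=['woman', 'king'], negative=['man']):
    print(word, round(score, 3))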
In [88]:
# read in the shaw data
shaw_filename = os.path.join(CUR_DIR, '../data/shaw.json')
with open(shaw_filename) as infile:
    shaw_data = json.load(infile)
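The rest of the notebook only relies on each record carrying a 'title' key, so a minimal stand-in for shaw.json (the record below is made up purely for illustration) would look like this:
In [ ]:
# hypothetical stand-in for shaw_data, showing the only field the code uses
shaw_data_example = [{'title': 'Some Film Title'}]
print(shaw_data_example[0]['title'])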
In [89]:
# ensure it is read in
film = shaw_data[0]
print(film['title'])
In [90]:
def extract_most_similar(word, pos, neg):
    """Return the word most similar to `word`, shifted toward `pos` and away from `neg`."""
    most_sim = word
    all_pos = pos + [word.lower()]
    try:
        sim = model.most_similar(positive=all_pos, negative=neg, topn=5)
        # picking randomly from the top five made for more confusing titles,
        # so always take the best match
        #index = random.randint(0, 4)
        index = 0
        most_sim = sim[index][0]
    except KeyError:
        # the word is missing from the model's vocabulary; keep it unchanged
        print("fails on", word)
    # multiword phrases come back underscore-joined, e.g. "New_York"
    most_sim_words = most_sim.split("_")
    most_sim_words = [w.capitalize() for w in most_sim_words]
    return " ".join(most_sim_words)

def create_new_title(film, country='American', minus='Chinese'):
    """Retitle a film by replacing each noun with its `country`-shifted analogue."""
    tokens = nltk.word_tokenize(film['title'])
    tagged_text = nltk.pos_tag(tokens)
    new_text = []
    for word, tag in tagged_text:
        # Penn Treebank noun tags all start with 'N' (NN, NNS, NNP, NNPS)
        if tag[0] == 'N':
            new_text.append(extract_most_similar(word, [country], [minus]))
        else:
            new_text.append(word)
    return " ".join(new_text)
In [97]:
def retitle_all(country='American'):
    """Retitle every film in shaw_data and write the old/new pairs to JSON."""
    new_titles = []
    out_name = country.lower() + '_titles.json'
    for film in shaw_data:
        new_title = create_new_title(film, country)
        new_titles.append({'old': film['title'], 'new': new_title})
    out_filename = os.path.join(CUR_DIR, '../data/titles', out_name)
    with open(out_filename, 'w') as outfile:
        json.dump(new_titles, outfile, sort_keys=False, indent=2)

# spot-check one film against two target countries
print(shaw_data[3]['title'])
n = create_new_title(shaw_data[3], 'German')
print(n)
n = create_new_title(shaw_data[3], 'American')
print(n)
In [92]:
retitle_all('American')
In [93]:
retitle_all('German')
In [94]:
retitle_all('Australian')
In [95]:
retitle_all('English')
In [98]:
retitle_all('Turkish')
Below is just more random exploration of the model.
In [87]:
print(model.most_similar(positive=['China']))
print(model.most_similar(positive=['Chinese']))
print(model.most_similar(positive=['America']))
print(model.most_similar(positive=['American']))
In [86]:
print(model.most_similar(positive=['Germany']))
print(model.most_similar(positive=['German']))
In [85]:
print(model.most_similar(positive=['eagle', 'Chinese'], negative=['American']))
In [83]:
print(model.most_similar(positive=['spoon', 'Chinese'], negative=['American']))
In [84]:
print(model.most_similar(positive=['buddha', 'American'], negative=['Chinese']))
In [25]:
# scratch version of the retitling loop, kept for reference
tokens = nltk.word_tokenize(film['title'])
tagged_text = nltk.pos_tag(tokens)
print(tagged_text)
new_text = []
country = 'America'
for word, tag in tagged_text:
    if tag[0] == 'N':
        print(word)
        new_text.append(extract_most_similar(word, [country], ['China']))
    else:
        new_text.append(word)
print(new_text)