In [63]:
import json
import argparse
import os
import random

import gensim
import nltk

In [ ]:


In [3]:
#CUR_DIR = os.path.dirname(os.path.realpath(__file__))
CUR_DIR = "."
print(CUR_DIR)


.

In [4]:
# uses a ton of memory and takes a long time.
print('loading model')
model = gensim.models.Word2Vec.load_word2vec_format(os.path.join(CUR_DIR, '../model/GoogleNews-vectors-negative300.bin'), binary=True)


loading model

In [5]:
# test out model
print(model.most_similar(positive = ['woman', 'king'], negative=['man']))


[(u'queen', 0.7118192315101624), (u'monarch', 0.6189674139022827), (u'princess', 0.5902431011199951), (u'crown_prince', 0.5499460697174072), (u'prince', 0.5377321839332581), (u'kings', 0.5236844420433044), (u'Queen_Consort', 0.5235946178436279), (u'queens', 0.5181134343147278), (u'sultan', 0.5098593235015869), (u'monarchy', 0.5087412595748901)]

In [88]:
# read in the shaw data
shaw_filename = os.path.join(CUR_DIR, '../data/shaw.json')
shaw_data = json.loads(open(shaw_filename).read())

In [89]:
# ensure it is read in
film = shaw_data[0]
print(film['title'])


Temple of the Red Lotus

In [90]:
def extract_most_similar(word, pos, neg):
    most_sim = word
    all_pos = pos + [word.lower()]
    #sim = model.most_similar(positive = [text.lower(), 'America'], negative=['China'])
    try:
        sim = model.most_similar(positive = all_pos, negative = neg, topn=5)
        # randomly chosen from the top made for more confusing titles
        #index = random.randint(0,4)
        index = 0
        most_sim = sim[index][0]
    except:
        print("fails on", word)
        
    
    most_sim_words = most_sim.split("_")
    most_sim_words = [w.capitalize() for w in most_sim_words]
    
    return " ".join(most_sim_words)
    
def create_new_title(film, country = 'American', minus = 'Chinese'):
    #country = 'America'
    
    text = nltk.word_tokenize(film['title'])
    tagged_text = nltk.pos_tag(text)
    new_text = []
    for text, tag in tagged_text:
        if tag[0] == 'N':
            #print(text)
            new_text.append(extract_most_similar(text, [country], [minus]))
            #print(sim)
        else:
            new_text.append(text)
    return " ".join(new_text)

In [97]:
def retitle_all(country = 'American'):
    new_titles = []
    out_name = country.lower() + '_titles.json'
    for film in shaw_data:
        new_title = create_new_title(film, country)
        new_titles.append({'old': film['title'], 'new': new_title})
    out_filename = os.path.join(CUR_DIR, '../data/titles', out_name)
    with open(out_filename, 'w') as outfile:
        json.dump(new_titles, outfile, sort_keys = False, indent = 2)
        


print(shaw_data[3]['title'])
n = create_new_title(shaw_data[3], 'German')
print(n)
n = create_new_title(shaw_data[3], 'American')
print(n)


The Winged Tiger
The Teutonic Rhino
The Wingless Panther

In [92]:
retitle_all('American')


('fails on', u'Swordswomen')
('fails on', u'Handlock')
('fails on', u'A')
('fails on', u'Kwangtung')
('fails on', u'A')
('fails on', u'Sabre')
('fails on', u'Kwangtung')
('fails on', u'Sabre')
('fails on', u'Sabre')
('fails on', u'Superfighters')
('fails on', u'Judgement')
('fails on', u'Inframan')
('fails on', u'Snowgirl')
('fails on', u'One-Armed')
('fails on', u'Swordmates')
('fails on', u'One-Armed')
('fails on', u'Chang-An')

In [93]:
retitle_all('German')


('fails on', u'Swordswomen')
('fails on', u'Handlock')
('fails on', u'A')
('fails on', u'Kwangtung')
('fails on', u'A')
('fails on', u'Sabre')
('fails on', u'Kwangtung')
('fails on', u'Sabre')
('fails on', u'Sabre')
('fails on', u'Superfighters')
('fails on', u'Judgement')
('fails on', u'Inframan')
('fails on', u'Snowgirl')
('fails on', u'One-Armed')
('fails on', u'Swordmates')
('fails on', u'One-Armed')
('fails on', u'Chang-An')

In [94]:
retitle_all('Australian')


('fails on', u'Swordswomen')
('fails on', u'Handlock')
('fails on', u'A')
('fails on', u'Kwangtung')
('fails on', u'A')
('fails on', u'Sabre')
('fails on', u'Kwangtung')
('fails on', u'Sabre')
('fails on', u'Sabre')
('fails on', u'Superfighters')
('fails on', u'Judgement')
('fails on', u'Inframan')
('fails on', u'Snowgirl')
('fails on', u'One-Armed')
('fails on', u'Swordmates')
('fails on', u'One-Armed')
('fails on', u'Chang-An')

In [95]:
retitle_all('English')


('fails on', u'Swordswomen')
('fails on', u'Handlock')
('fails on', u'A')
('fails on', u'Kwangtung')
('fails on', u'A')
('fails on', u'Sabre')
('fails on', u'Kwangtung')
('fails on', u'Sabre')
('fails on', u'Sabre')
('fails on', u'Superfighters')
('fails on', u'Judgement')
('fails on', u'Inframan')
('fails on', u'Snowgirl')
('fails on', u'One-Armed')
('fails on', u'Swordmates')
('fails on', u'One-Armed')
('fails on', u'Chang-An')

In [98]:
retitle_all('Turkish')


('fails on', u'Swordswomen')
('fails on', u'Handlock')
('fails on', u'A')
('fails on', u'Kwangtung')
('fails on', u'A')
('fails on', u'Sabre')
('fails on', u'Kwangtung')
('fails on', u'Sabre')
('fails on', u'Sabre')
('fails on', u'Superfighters')
('fails on', u'Judgement')
('fails on', u'Inframan')
('fails on', u'Snowgirl')
('fails on', u'One-Armed')
('fails on', u'Swordmates')
('fails on', u'One-Armed')
('fails on', u'Chang-An')

Below is just more random exploration of the model.


In [87]:
print(model.most_similar(positive=['China']))
print(model.most_similar(positive=['Chinese']))

print(model.most_similar(positive=['America']))
print(model.most_similar(positive=['American']))


[(u'Chinese', 0.7678080797195435), (u'Beijing', 0.7648463249206543), (u'Taiwan', 0.7081156969070435), (u'Chinas', 0.6899173855781555), (u'Shanghai', 0.6727433204650879), (u'Guangdong', 0.6721152067184448), (u'Hainan', 0.6360341310501099), (u'China\xe2_\u20ac_\u2122', 0.6301892399787903), (u'Hong_Kong', 0.624097466468811), (u'Shenzhen', 0.6239033937454224)]
[(u'China', 0.7678080797195435), (u'Taiwanese', 0.7435624599456787), (u'Beijing', 0.6758180856704712), (u'Zhang', 0.6612829566001892), (u'Li', 0.6447644829750061), (u'Zhou', 0.6439977288246155), (u'Japanese', 0.6422422528266907), (u'Xu', 0.6390734314918518), (u'Jiang', 0.6309375762939453), (u'Vietnamese', 0.6274450421333313)]
[(u'United_States', 0.6178410053253174), (u'American', 0.6116389632225037), (u'Amercia', 0.5794416666030884), (u'America\xe2_\u20ac_\u2122', 0.5631463527679443), (u'Americaand', 0.5606889724731445), (u'Europe', 0.557948887348175), (u'Unites_States', 0.5455595254898071), (u'nation', 0.5346957445144653), (u'world', 0.5279377698898315), (u'Latin_America', 0.5253671407699585)]
[(u'Amercian', 0.690601110458374), (u'America', 0.6116389632225037), (u'Amercan', 0.561011552810669), (u'U.S.', 0.530948281288147), (u'Amer_ican', 0.5277948975563049), (u'Americans', 0.5179344415664673), (u'African', 0.5149515271186829), (u'Ameican', 0.5099471807479858), (u'British', 0.5004253387451172), (u'Canadian', 0.4892980456352234)]

In [86]:
print(model.most_similar(positive=['Germany']))
print(model.most_similar(positive=['German']))


[(u'Austria', 0.7461062073707581), (u'German', 0.7178748846054077), (u'Germans', 0.6628648042678833), (u'Switzerland', 0.6506867408752441), (u'Hungary', 0.6504981517791748), (u'Germnay', 0.649348258972168), (u'Netherlands', 0.6437495946884155), (u'Cologne', 0.6430779099464417), (u'symbol_RSTI', 0.6389946341514587), (u'Annita_Kirsten', 0.634294867515564)]
[(u'Austrian', 0.742125391960144), (u'Germany', 0.7178749442100525), (u'Germans', 0.6854615807533264), (u'Bavarian', 0.6802921295166016), (u'Hungarian', 0.6666134595870972), (u'Dutch', 0.6491715908050537), (u'Belgian', 0.6436907052993774), (u'Czech', 0.6359938979148865), (u'Polish', 0.6300821304321289), (u'Romanian', 0.6186450123786926)]

In [85]:
print(model.most_similar(positive=['eagle', 'Chinese'], negative=['American']))


[(u'eagles', 0.5042415857315063), (u'zhu', 0.4769320785999298), (u'Liang_Wenchong', 0.46656209230422974), (u'Liang_Wen_chong', 0.4649501442909241), (u'Zhang_Lianwei', 0.45714402198791504), (u'Mount_Taishan', 0.4555661082267761), (u'bird_nest', 0.45367076992988586), (u'Chapchai', 0.45358550548553467), (u'Tseng', 0.45338496565818787), (u'Liang', 0.4519016742706299)]

In [83]:
print(model.most_similar(positive=['spoon', 'Chinese'], negative=['American']))


[(u'chopstick', 0.5625050663948059), (u'chopsticks', 0.5227457284927368), (u'soup_spoon', 0.5137594938278198), (u'crab_roe', 0.5098376870155334), (u'cai', 0.5072115659713745), (u'glutinous_rice_balls', 0.5057233572006226), (u'glutinous_rice_flour', 0.5033949613571167), (u'tiao', 0.49486514925956726), (u'char_siew', 0.48833203315734863), (u'fried_noodles', 0.4866980314254761)]

In [84]:
print(model.most_similar(positive=['buddha', 'American'], negative=['Chinese']))


[(u'God_fearin', 0.4062640070915222), (u'strutting_peacock', 0.4006575345993042), (u'devout_worshiper', 0.3999425768852234), (u'gaunt_bearded', 0.3902565538883209), (u'1Corinthians', 0.3865508437156677), (u'pantheist', 0.3863723874092102), (u'ZZ_Top_beard', 0.3850575089454651), (u'Hindoo', 0.38286125659942627), (u'Tradescantia', 0.38203465938568115), (u'idolater', 0.38102614879608154)]

In [25]:
text = nltk.word_tokenize(film['title'])
tagged_text = nltk.pos_tag(text)
print(tagged_text)

new_text = []
country = 'America'

for text, tag in tagged_text:
    if tag[0] == 'N':
        print(text)
        new_text.append(extract_most_similar([text.lower(), country], ['China']))
        print(sim)
    else:
        new_text.append(text)
print new_text
        
    #break


[(u'Temple', 'NN'), (u'of', 'IN'), (u'the', 'DT'), (u'Red', 'NNP'), (u'Lotus', 'NNP')]
Temple
[(u'black_eyed_susans', 0.3741646707057953), (u'Tradescantia', 0.3738454580307007), (u'trilliums', 0.3695296347141266), (u'Sanitizing_System', 0.36615896224975586), (u'trout_lilies', 0.36045974493026733), (u'ornamental_sweet_potato', 0.3595868647098541), (u'andromeda', 0.35832342505455017), (u'nasturtium', 0.3579365015029907), (u'hollyhock', 0.35730767250061035), (u'Helleborus', 0.3558405041694641)]
Red
[(u'black_eyed_susans', 0.3741646707057953), (u'Tradescantia', 0.3738454580307007), (u'trilliums', 0.3695296347141266), (u'Sanitizing_System', 0.36615896224975586), (u'trout_lilies', 0.36045974493026733), (u'ornamental_sweet_potato', 0.3595868647098541), (u'andromeda', 0.35832342505455017), (u'nasturtium', 0.3579365015029907), (u'hollyhock', 0.35730767250061035), (u'Helleborus', 0.3558405041694641)]
Lotus
[(u'black_eyed_susans', 0.3741646707057953), (u'Tradescantia', 0.3738454580307007), (u'trilliums', 0.3695296347141266), (u'Sanitizing_System', 0.36615896224975586), (u'trout_lilies', 0.36045974493026733), (u'ornamental_sweet_potato', 0.3595868647098541), (u'andromeda', 0.35832342505455017), (u'nasturtium', 0.3579365015029907), (u'hollyhock', 0.35730767250061035), (u'Helleborus', 0.3558405041694641)]
[u'Temples', u'of', u'the', u'Yellow', u'Black Eyed Susans']