In [ ]:
import numpy as np
import math
from random import random

In [ ]:
# Library Gensim (Word2Vec examples)
### LOAD GOOGLE WORD2VEC... takes a few minutes
from gensim import models
# Load google's word2vec model
# languages
english = 'models/GoogleNews-vectors-negative300.bin.gz'
spanish = 'models/SBW-vectors-300-min5.bin.gz'
# load
model = models.Word2Vec.load_word2vec_format(english, binary=True)
model.init_sims(replace=True) # L2-normalize the vectors in place to save memory
print 'model memory size, gb:', model.estimate_memory()['total']/(math.pow(1024,3))

Some helper functions that print and return word-arithmetic results for:

  • an equation, using classic (additive) similarity
  • an equation, using multiplicative (cosmul) similarity
  • a single word, using cosmul similarity

In [ ]:
# simple utility functions for printing
def print_similarities(similarity_list):
    for similarity in similarity_list:
        print similarity[0].ljust(18),similarity[1]
        
def similarities_p(pos=[],neg=[],topn=10,doPrint = True):
    if doPrint: 
        print "pos:",pos,"neg:",neg
    similarities = model.most_similar(pos,neg,topn)
    if doPrint: print_similarities(similarities)
    return similarities
    
def most_similar_cosmul_p(pos=[],neg=[],topn=10, doPrint = True):
    if doPrint: print "pos:",pos,"neg:",neg
    similarities = model.most_similar_cosmul(pos,neg,topn)
    if doPrint: print_similarities(similarities)    
    return similarities

def word_similar_cosmul_p(word,topn=10, doPrint = True):
    if doPrint: print "word:",word
    similarities = model.most_similar_cosmul([word],[],topn)
    if doPrint: print_similarities(similarities)    
    return similarities
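
A quick sanity check for the helpers: the classic analogy king - man + woman ≈ queen (assuming all three words are in the model's vocabulary):

In [ ]:
similarities_p(['king','woman'],['man'],topn=5)
most_similar_cosmul_p(['king','woman'],['man'],topn=5)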

In [ ]:
model['woman']

In [ ]:
most_similar_cosmul_p(['Spain','Berlin'],['Germany'],10)

In [ ]:
# DEMO

# 1 word arithmetic
#model.most_similar(positive=["woman",'girl']) # gunshot?
print "WORD ARITHMETIC"
model.most_similar(positive=["man"]) 
print model.most_similar(positive=["man",'boy']) 
# our helper fcts
similarities_and_p(['Germany'],['Spain'],topn=5)
most_similar_cosmul_p(['Germany'],['Spain'],topn=5)
# 2 similarity
print "\nSIMILARITY"
w1 = 'woman'
w2 = 'man'
print model.similarity(w1,w2)

# 3 non-matching
print "\nNON-MATCHING WORD"
# doesnt_match returns the word farthest from the mean of all the input vectors
print model.doesnt_match("breakfast cereal dinner lunch".split())

# 4 feature vector
vector = model['computer']
#print vector

# 5 similar by vector: nudge one random dimension, then look up nearest neighbours
vector[np.random.randint(0,300)] += random() * 0.5 - 0.05

print model.similar_by_vector(vector)
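
Since init_sims(replace=True) normalized the vectors, cosine similarity reduces to a plain dot product, which gives a quick sanity check:

In [ ]:
# both lines should print (almost) the same number
print np.dot(model['woman'], model['man'])
print model.similarity('woman','man')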

In [ ]:
# 'of' is not in the model
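# so guard raw lookups by checking the vocabulary first:
for w in ['of','woman']:
    print w, w in model.vocab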

In [ ]:
# after init_sims(replace=True) every stored vector is unit length
b = model['o']
#print len(b)
#print b
print np.linalg.norm(b) # ~1.0

In [ ]:
# Library TextBlob
import textblob
# READ A TEXT
from textblob import TextBlob
book = open('../sci-fi-book.txt','r').read()
book_blob = TextBlob(book)

In [ ]:
def tag(word):
    return TextBlob(word).tags[0]
#print tag('win')

In [ ]:
print "TAGS (grammar tree):"
blob_tags = book_blob.tags
print blob_tags

print 'NOUN PHRASES'
blob_noun_phrases = book_blob.noun_phrases
print blob_noun_phrases

In [ ]:
print 'GET VERBS'

def get_tag(tags, query_tag):
    result = []
    for tag in tags:
        if tag[1] == query_tag:
            result.append(tag)
    return result

#print get_tag(blob_tags,'VBN')

#print book_blob.sentences

print "SENTENCE:"
sentence = book_blob.sentences[np.random.randint(0,len(book_blob.sentences))]
print sentence
print 'SENTIMENT'
print(sentence.sentiment)
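# sentiment is a (polarity, subjectivity) pair:
# polarity ranges over [-1.0, 1.0], subjectivity over [0.0, 1.0]
print sentence.sentiment.polarity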

In [ ]:
print "SOME NLTK"
from nltk.corpus import treebank
import nltk

book_nltk_text = nltk.Text(nltk.word_tokenize(book)) # Text expects tokens; a raw string would be split into characters
print book_nltk_text

print '\n',sentence
words = sentence.words
print words
# not working
#treebank.parsed_sents(sentence)
print sentence.pos_tags
nnps = get_tag(sentence.pos_tags,'NNP')

#for tree in parser.parse(words):
if nnps:
    print nnps[0][0]
book_nltk_text.concordance('yes') # concordance prints directly and returns None

In [ ]:
word = 'angry'
print word,':'
print 'similars:'
print_similarities(model.most_similar([word],topn=20))
start_word = textblob.Word(word)
if len(start_word.synsets) != 0:
    print('synsets: ' + str(start_word.synsets))
    print('definitions: ' + str(start_word.definitions))
    synset_index = 0
    synset = start_word.synsets[synset_index]
    print('lemma_names: ' + str(synset.lemmas()))
    print('hypernyms: ' + str(synset.hypernyms()))
    print('hyponyms: ' + str(synset.hyponyms()))
    print('holonyms: ' + str(synset.member_holonyms()))
    print('meronyms: ' + str(synset.part_meronyms()))
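
WordNet can also score how related two synsets are. A minimal sketch using path similarity (noun synsets chosen here, since adjective synsets such as those for 'angry' often have no hypernym path):

In [ ]:
from textblob import Word
dog_synset = Word('dog').synsets[0]
cat_synset = Word('cat').synsets[0]
print('path similarity dog/cat: ' + str(dog_synset.path_similarity(cat_synset)))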

DEMO

Use the standard NLTK chatbots to get an answer, then replace some words with either a similar word or a word carrying more positive sentiment. Display the old and the new text, coloring each word by how it changed. black: unchanged, green: replaced by a similar word, red: replaced by a higher-sentiment word.

TODO: find words that indicate which conceptual topic the sentence belongs to.


In [ ]:
from nltk.chat import eliza, iesha, rude, suntsu
from textblob import TextBlob

#eliza.demo()
sentence = 'how is it that all birds are gay'
eli = eliza.eliza_chatbot
print 'eli',eli.respond(sentence)

print rude.rude_chatbot.respond(sentence)
print iesha.iesha_chatbot.respond(sentence)
print suntsu.suntsu_chatbot.respond(sentence)

In [ ]:
# Take the suntsu response and change all words that have > .85 similarity
from IPython.core.display import display, HTML
#import progressbar
response = suntsu.suntsu_chatbot.respond('how is it that all birds are gay')

suntsu_response_blob = TextBlob(response)
print suntsu_response_blob
suntsu_words = suntsu_response_blob.words
print suntsu_response_blob.sentiment
reconfig = []
reconfig_how = []
#bar = progressbar.ProgressBar(max_value=len(suntsu_words),redirect_stdout=True)

for word in suntsu_words:
    if word in model:
        similar = most_similar_cosmul_p([word],doPrint = False)
        positiver = most_similar_cosmul_p([word,'king'],topn = 1,doPrint=False)[0][0]
        print word,similar[0], positiver
        #if tag(positiver) == tag(word):
        if model.similarity(positiver,word) > 0.7:
            reconfig.append(" ".join(positiver.split("_")))
            reconfig_how.append(1)
        elif similar[0][1] > 0.85:
            reconfig.append(" ".join(similar[0][0].split("_")))
            reconfig_how.append(2)
        else:
            reconfig.append(word)
            reconfig_how.append(0)
    else:
        # keep out-of-vocabulary words unchanged so reconfig stays aligned with suntsu_words
        reconfig.append(word)
        reconfig_how.append(0)
    print '>>>',reconfig[-1],reconfig_how[-1]
retune = TextBlob(" ".join(reconfig))
print retune
print retune.sentiment

retune_html = ''
span_temp = '<span style="color:z;">x </span>'
for i, w in enumerate(reconfig):
    print suntsu_words[i],w,reconfig_how[i]
    if reconfig_how[i] == 0:
        retune_html += span_temp.replace('z','black').replace('x',w)
    elif reconfig_how[i] == 1:
        retune_html += span_temp.replace('z','red').replace('x',w)
    else:
        retune_html += span_temp.replace('z','green').replace('x',w)
    #print retune_html

display(HTML('<h1>'+response+'</h1><div>Sentiment:'+str(suntsu_response_blob.sentiment)+'</div>'))
display(HTML('<h1>'+retune_html+'</h1><div>Sentiment:'+str(retune.sentiment)+'</div>'))

In [ ]:
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print testimonial.sentiment
testimonial.sentiment.polarity

In [ ]:
from nltk import Text, word_tokenize
#print book
book_Text = Text(word_tokenize(book)) # tokenize first; Text over a raw string iterates characters
print book_Text
#book_Text.concordance("l")

In [ ]:
for phrase in blob_noun_phrases:
    print phrase

In [ ]:
print model.most_similar(positive=['england', 'paris'], negative=['france'])
print model.most_similar(positive=['hate'], negative=['love'])
print model.most_similar(positive=['love'], negative=['hate'])

print synset.lemmas()

#extension from love
#tell a story with code.
