In [ ]:
import numpy as np
import math
from random import random
In [ ]:
# Library Gensim (Word2Vec examples)
### LOAD GOOGLE WORD2VEC... takes a few minutes
from gensim import models
# Load google's word2vec model
# languages
# Paths to pretrained word2vec binaries (English: GoogleNews, Spanish: SBW).
english = 'models/GoogleNews-vectors-negative300.bin.gz'
spanish = 'models/SBW-vectors-300-min5.bin.gz'
# load
# NOTE(review): Word2Vec.load_word2vec_format is the old (pre-1.0) gensim API;
# newer releases moved it to models.KeyedVectors -- confirm installed version.
model = models.Word2Vec.load_word2vec_format(english, binary=True)
model.init_sims(replace=True) # save memory
# Report the approximate in-memory size of the loaded vectors, in gigabytes.
print 'model memory size, gb:', model.estimate_memory()['total']/(math.pow(1024,3))
Some helper functions that print and return word-arithmetic results:
In [ ]:
# simple utility functions for printing
def print_similarities(similarity_list):
for similarity in similarity_list:
print similarity[0].ljust(18),similarity[1]
def similarities_p(pos=[],neg=[],topn=10,doPrint = True):
if doPrint:
print "pos:",pos,"neg:",neg
similarities = model.most_similar(pos,neg,topn)
if doPrint: print_similarities(similarities)
return similarities
def most_similar_cosmul_p(pos=[],neg=[],topn=10, doPrint = True):
if doPrint: print "pos:",pos,"neg:",neg
similarities = model.most_similar_cosmul(pos,neg,topn)
if doPrint: print_similarities(similarities)
return similarities
def word_similar_cosmul_p(word,topn=10, doPrint = True):
if doPrint: print "word:",word
similarities = model.most_similar_cosmul([word],[],topn)
if doPrint: print_similarities(similarities)
return similarities
In [ ]:
# Look up the 300-d embedding vector for one word (the notebook displays it).
model['woman']
In [ ]:
# Analogy via multiplicative combination: Spain + Berlin - Germany (capital analogy).
most_similar_cosmul_p(['Spain','Berlin'],['Germany'],10)
In [ ]:
# DEMO
# 1 word arithmetic
#model.most_similar(positive=["woman",'girl']) # gunshot?
print "WORD ARITHMETIC"
model.most_similar(positive=["man"])
print model.most_similar(positive=["man",'boy'])
# our helper fcts
similarities_and_p(['Germany'],['Spain'],topn=5)
most_similar_cosmul_p(['Germany'],['Spain'],topn=5)
# 2 similarity
print "\nSIMILARITY"
w1 = 'woman'
w2 = 'man'
print model.similarity(w1,w2)
# 4 non-matching
print "\nNON-MATCHING WORD"
print model.doesnt_match("breakfast cereal dinner lunch".split())
# 5 feature vector
vector = model['computer']
#print vector
# 6 similar by vector
vector[np.random.randint(0,300)] += random() * 0.5 - 0.05
print model.similar_by_vector(vector)
In [ ]:
# 'of' is not in the model
In [ ]:
#text1.concordance("monstrous")
In [ ]:
from math import sqrt
# NOTE(review): sqrt is imported but unused below -- np.linalg.norm does the work.
# init_sims(replace=True) above L2-normalized the vectors, so this prints ~1.0.
b = model['o']
#print len(b)
#print b
print np.linalg.norm(b)
In [ ]:
# Library TextBlob
import textblob
# READ A TEXT
from textblob import TextBlob
# FIX: use a context manager so the file handle is closed promptly; the
# original `open(...).read()` left closing to the garbage collector.
with open('../sci-fi-book.txt','r') as book_file:
    book = book_file.read()
book_blob = TextBlob(book)
In [ ]:
def tag(word):
    """Return the first (word, POS-tag) pair TextBlob produces for `word`."""
    blob = TextBlob(word)
    return blob.tags[0]
#print tag('win')
In [ ]:
# POS-tag the whole book and extract its noun phrases (both can be slow on a
# full-length text).
print "TAGS (grammar tree):"
blob_tags = book_blob.tags
print blob_tags
print 'NOUN PHRASES'
blob_noun_phrases = book_blob.noun_phrases
print blob_noun_phrases
In [ ]:
print 'GET VERBS'
def get_tag(tags, query_tag):
    """Return all (word, tag) pairs from `tags` whose tag equals `query_tag`."""
    return [pair for pair in tags if pair[1] == query_tag]
#print get_tag(blob_tags,'VBN')
#print book_blob.sentences
# Pick a random sentence from the book and show its sentiment
# (polarity, subjectivity).
print "SENTENCE:"
sentence = book_blob.sentences[np.random.randint(0,len(book_blob.sentences))]
print sentence
print 'SENTIMENT'
print(sentence.sentiment)
In [ ]:
print "SOME NLTK"
from nltk.corpus import treebank
import nltk
book_nltk_text = nltk.Text(book)
print book
print '\n',sentence
words = sentence.words
print words
# not working
#treebank.parsed_sents(sentence)
print sentence.pos_tags
nnps = get_tag(sentence.pos_tags,'NNP')
#for tree in parser.parse(words):
print nnps[0][0]
print book_nltk_text.concordance('yes')
In [ ]:
word = 'angry'
print word,':'
print 'similars:'
p_similarities(model.most_similar([word],topn=20))
start_word = textblob.Word(word)
if len(start_word.synsets) != 0:
print('synsets: ' + str(start_word.synsets))
print('definitions: ' + str(start_word.definitions))
synset_index = 0
synset = start_word.synsets[synset_index]
print('lemma_names',synset.lemmas())
print('hypernyms:',synset.hypernyms())
print('hyponyms:',synset.hyponyms())
print('holonyms: ' + str(synset.member_holonyms()))
print('meronyms: ' + str(synset.part_meronyms()))
In [ ]:
# Compare the canned NLTK chatbots' responses to the same prompt.
from nltk.chat import eliza, iesha, rude,suntsu
from textblob import TextBlob
#eliza.demo()
sentence = 'how is it that all birds are gay'
eli = eliza.eliza_chatbot
print 'eli',eli.respond(sentence)
print rude.rude_chatbot.respond(sentence)
print iesha.iesha_chatbot.respond(sentence)
print suntsu.suntsu_chatbot.respond(sentence)
In [ ]:
# Take the suntsu response and change all words that have > .85 similarity
from IPython.core.display import display, HTML
#import progressbar
# Get the canned suntsu chatbot reply and wrap it in a TextBlob.
response = suntsu.suntsu_chatbot.respond('how is it that all birds are gay')
suntsu_response_blob = TextBlob(response)
print suntsu_response_blob
suntsu_words = suntsu_response_blob.words
print suntsu_response_blob.sentiment
# reconfig holds the replacement word per original word; reconfig_how records
# how it was chosen (0 = kept as-is, 1 = 'king'-shifted, 2 = cosmul neighbour).
reconfig = []
reconfig_how = []
i = 0
#bar = progressbar.ProgressBar(max_value=len(suntsu_words),redirect_stdout=True)
for word in suntsu_response_blob.words:
    if word in model:
        # NOTE(review): `word` is passed as a bare string; the helper's `pos`
        # parameter is normally a list -- gensim wraps single strings; confirm.
        similar = most_similar_cosmul_p(word,doPrint = False)
        suntsu_response_blob.words[i] = similar[0][0]
        # Shift the word towards 'king': take the top analogy hit only.
        positiver = most_similar_cosmul_p([word,'king'],topn = 1,doPrint=False)[0][0]
        print word,similar[0], positiver
        #if tag(positiver) == tag(word):
        if model.similarity(positiver,word) > 0.7:
            # Underscores in multi-word vocab entries become spaces.
            reconfig.append(" ".join(positiver.split("_")))
            reconfig_how.append(1)
        elif similar[0][1] > 0.85:
            reconfig.append(" ".join(similar[0][0].split("_")))
            reconfig_how.append(2)
        else:
            reconfig.append(word)
            reconfig_how.append(0)
        print '>>>',reconfig[-1],reconfig_how[-1]
    # NOTE(review): indentation was lost in this export; reconstructed so `i`
    # advances once per word. Words NOT in the model get no reconfig entry,
    # which would desynchronize the HTML loop below -- verify intent.
    i += 1
retune = TextBlob(" ".join(reconfig))
print retune
print retune.sentiment
# Rebuild the sentence as colour-coded HTML: black = unchanged,
# red = 'king'-shifted, green = high-similarity neighbour.
i = 0
retune_html = ''
span_temp = '<span style="color:z;">x </span>'
for w in reconfig:
    print suntsu_response_blob.words[i],w,reconfig_how[i]
    if reconfig_how[i] == 0:
        retune_html += span_temp.replace('z','black').replace('x',w)
    elif reconfig_how[i] == 1:
        retune_html += span_temp.replace('z','red').replace('x',w)
    else:
        retune_html += span_temp.replace('z','green').replace('x',w)
    #print retune_html
    i += 1
display(HTML('<h1>'+response+'</h1><div>Sentiment:'+str(suntsu_response_blob.sentiment)+'</div>'))
display(HTML('<h1>'+retune_html+'</h1><div>Sentiment:'+str(retune.sentiment)+'</div>'))
In [ ]:
#print len(wiki.words)
#print wiki.words
#print wiki.tags
#noun_phrases = wiki.noun_phrases
#print noun_phrases
# Quick TextBlob sentiment demo on a fixed sentence.
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print testimonial.sentiment
# Bare expression: the notebook displays the polarity float.
testimonial.sentiment.polarity
In [ ]:
from nltk import Text
#print book
# NOTE(review): `book` is a raw string here, so Text() treats every CHARACTER
# as a token (hence the commented concordance("l") with a single letter). If
# word-level analysis is wanted, tokenize first: Text(nltk.word_tokenize(book)).
book_Text = Text(book)
print book_Text
#book_Text.concordance("l")
In [ ]:
In [ ]:
for np in noun_phrases:
print np
In [ ]:
# More word arithmetic; only the last expression's value is shown by the
# notebook -- the first two results are computed and discarded.
# NOTE(review): 'england'/'france' are lowercase; GoogleNews vectors are
# case-sensitive, so these lookups may raise KeyError -- confirm.
model.most_similar(positive=['england', 'paris'], negative=['france'])
model.most_similar(positive=['hate',], negative=['love'])
model.most_similar(positive=['love',], negative=['hate'])
# Re-display the lemmas of the synset chosen in the 'angry' cell above.
synset.lemmas()
#extension from love
#tell a story with code.
In [ ]: