In [ ]:
import gensim
from IPython.display import clear_output
import json
import re
import random
import time
import pickle
from IPython.core.display import display, HTML
import locale, os, socket
from numpy import int64
locale.setlocale(locale.LC_ALL, 'de_DE.utf-8')
if not os.path.exists('logs'):
os.makedirs('logs')
host = socket.gethostname()
if host == 'lyrik':
model_file = '/home/ramin/projects/ECO/src/python/modelbuilder/parsed_v3_valid.doc2vec'
print "U ARE ON LYRIK"
    # ... move data and model into some convenient folder, so that model/parsed_v3_valid is there and
# NAIL_DATAFIELD_txt/parsed_v3/parsed_v3_valid.txt is there
else:
# local
model_file = '../../models/parsed_v3_valid.doc2vec'
if not os.path.isfile(model_file):
print "MODEL FILE IS NOT THERE. GO AND FIND IT"
In [ ]:
# 2 Build sentence list (each sentence needs at least 1 tag)
import socket, os
import gensim
host = socket.gethostname()
print(host)
if host == 'lyrik':
    model_file = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.doc2vec'
    filename = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.txt'
else:
    # local (model_file from the first cell is kept)
    filename = '../../data/parsed_v3/parsed_v3_valid.txt' # parsed_v3_all.txt
if not os.path.isfile(filename):
print "TEXTFILE FILE IS NOT THERE"
sentences = []
from random import shuffle
for uid, line in enumerate(open(filename)):
if uid % 1000 == 0:
print(str(uid))
csv_split = line.split(';')
    # words must be a token list (not a raw string); strip the newline from the tag
    ls = gensim.models.doc2vec.LabeledSentence(words=csv_split[0].split(), tags=['SENT_%s' % uid, csv_split[1].strip()])
sentences.append(ls)
print len(sentences),'sentences'
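In [ ]:
# Sketch (not part of the original run): sanity-check the expected input format.
# Assumes every line of the text file looks like "<tokenised sentence>;<tag>";
# sample_line and EXAMPLE_TAG are made up for illustration.
sample_line = 'the cat sat on the mat ;EXAMPLE_TAG\n'
parts = sample_line.split(';')
example = gensim.models.doc2vec.LabeledSentence(
    words=parts[0].split(),              # list of tokens, not a raw string
    tags=['SENT_0', parts[1].strip()])   # positional tag plus the source tag
print example.words, example.tags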
In [ ]:
# 3 TRAIN OR LOAD the doc2vec model and save it
# ALTERNATIVE: load the existing model by setting train_model = False
# tutorial https://rare-technologies.com/doc2vec-tutorial/
# proposes shuffling or learning-rate adjustment; we do both
# in total 20 epochs
# training took ca. 6:30 hours
# FOR SAFETY REASONS, TRAIN ONLY WHEN THE FLAG IS SET
train_model = True
if train_model:
model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025) # use fixed learning rate
print('building vocab')
model.build_vocab(sentences)
base_alpha = model.alpha
base_min_alpha = model.min_alpha
for mepoch in range(2):
model.alpha = base_alpha
model.min_alpha = base_min_alpha
for epoch in range(10):
print('epoch',mepoch * 10 + epoch)
model.train(sentences)
model.alpha -= 0.002 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
shuffle(sentences)
# saving the model
model.save(model_file)
print 'model trained and saved'
else:
model = gensim.models.Doc2Vec.load(model_file)
print 'model loaded.',len(model.docvecs), 'vectors'
if len(sentences) != len(model.docvecs):
print 'something is fishy, unequal length: ',len(sentences),'sentences and',len(model.docvecs), 'vectors'
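In [ ]:
# Quick sanity check (illustrative, not from the original run): each trained
# sentence can be looked up by its 'SENT_<n>' tag; the vector length is whatever
# Doc2Vec defaulted to above, since the constructor call does not set `size`.
vec = model.docvecs['SENT_0']
print len(vec)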
In [ ]:
# 4 Tiny helper functions
def print_word_list(wl):
    # print wl  # debug
    s = ' '.join(wl)
    # pull standalone punctuation back onto the preceding word
    pattern = re.compile(r'\s\W\s')
    shift = 0
    for ma in pattern.finditer(s):
        s = s[:ma.start(0) - shift] + ma.group(0)[1:] + s[ma.end(0) - shift:]
        shift += 1
    if len(s) > 1 and s[-2] == ' ':
        s = s[:-2] + s[-1:]
    return s
def get_print(sentence_or_similar):
    if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
        word_list = sentence_or_similar.words
    elif type(sentence_or_similar) is int64 or type(sentence_or_similar) is int: # just an index
        word_list = sentences[sentence_or_similar].words
    else: # a ('SENT_<n>', similarity) tuple from most_similar()
        word_list = sentences[int(sentence_or_similar[0][5:])].words
    return print_word_list(word_list)
def get_index_tag(sentence):
return sentence.tags[0]
def get_index(sentence_or_similar):
if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
return int64(get_index_tag(sentence_or_similar)[5:])
else:
return int64(sentence_or_similar[0][5:])
def equal_word_lists(index1, index2):
wl1 = sentences[index1].words
wl2 = sentences[index2].words
if len(wl1) != len(wl2):
return False
else:
for i in range(len(wl1)):
if wl1[i] != wl2[i]:
return False
return True
def get_lab_sent_by_similar(similar):
print get_index(similar)
return sentences[get_index(similar)]
def get_similarity_by_index(index1, index2):
return model.docvecs.similarity(index1,index2)
# HTML Helper
def pack_into_elem(tag, clazz, content):
return '<' + tag + ' class="' + clazz + '"> ' + content+ ' </' + tag +'>'
pre = '''<style>
.act {font-weight: bold}
.i {color: grey}
.sim {color: orange}
.n {color: blue}
.p {color: red}
.r {color: green}
</style>'''
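In [ ]:
# Illustrative check of the HTML helpers (not from the original run): wrap a string
# in one of the styled spans defined above and render it.
display(HTML(pre + pack_into_elem('p', 'sim', 'hello similarity')))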
In [ ]:
get_print(4)
In [ ]:
# 5 Test: print a test sentence and get the most similar ones.
do_tests = False
if do_tests:
test_sentence_index = 2639533
print get_print(test_sentence_index)
sims = model.docvecs.most_similar('SENT_'+str(test_sentence_index),topn = 30)
    print 'similar sentences:', len(sims)
print '\nSIMILAR SENTENCES\n'
for sim in sims:
print get_print(sim),sim
# 6 Test: iterate over similar sentences
# needs the sentences loaded (cell 2)
if do_tests:
index = 1983
# len(sentences)
# print sentences[index]
sentence = get_print(index)
print sentence
selected_indices = [index]
    for _ in range(10):
sims = model.docvecs.most_similar('SENT_'+str(index))
while True:
selected = random.choice(sims)
check_index = int(selected[0][5:])
if check_index not in selected_indices:
break
index = check_index
selected_indices.append(index)
print get_print(selected)
In [ ]:
# 7 Story tree node class
PARENT = -1
QUIT = -2
NEXT = -3
num_similars = 20
num_random = 20
class LabSentTreeNode:
def __init__(self, labeledSentence, parent = None):
self.sentence = labeledSentence
self.sentence_index = get_index(self.sentence)
self.similars = self.get_similars()
self.randoms = self.get_randoms()
self.children = {} # index: SentenceTreeNode
self.selected_child = '' # None
self.parent = parent
def get_similars(self):
return model.docvecs.most_similar(get_index_tag(self.sentence),topn = num_similars)
def get_randoms(self):
randoms = []
for index in range(num_random):
            rnd_sen = sentences[random.randint(0, len(sentences) - 1)]
randoms.append(rnd_sen)
return randoms
def print_options(self):
for index, sentence in enumerate(self.similars):
            add = '(*)' if str(index) in self.children else ''  # mark options already expanded
print index, add, get_print(sentence), "%.3f" % sentence[1]
if self.parent:
print 'p: ', get_print(self.parent.sentence)
if self.sentence_index < len(sentences) - 2:
print 'n: ', get_print(sentences[self.sentence_index + 1])
for index,sentence in enumerate(self.randoms):
print 'r'+str(index) +": ", get_print(sentence)
def get_options_html(self):
html = ''
for index, sentence in enumerate(self.similars):
content = pack_into_elem('span','',get_print(sentence))
index = pack_into_elem('span','',str(index)+': ')
sentence_index = get_index(sentence)
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similar = pack_into_elem('span','sim',("%.3f" % sentence[1]) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', '', index + content + similar)
if self.parent:
content = pack_into_elem('span','',get_print(self.parent.sentence))
index = pack_into_elem('span','','P: ')
sentence_index = self.parent.sentence_index
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, self.parent.sentence_index)
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'p', index + content + similar)
if self.sentence_index < len(sentences) - 2:
content = pack_into_elem('span','n',get_print(sentences[self.sentence_index + 1]))
index = pack_into_elem('span','','N: ')
sentence_index = self.sentence_index + 1
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, self.sentence_index + 1)
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'n', index + content + similar)
for index,sentence in enumerate(self.randoms):
content = pack_into_elem('span','',get_print(sentence))
index = pack_into_elem('span','','R'+str(index)+': ')
sentence_index = get_index(sentence)
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, get_index(sentence))
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'r', index + content + similar)
html += pack_into_elem('div', '', 'Q: Quit 💣')
return html
def get_sentence_html(self):
return pack_into_elem('p', 'act', get_print(self.sentence))
def select_child(self):
u_input = raw_input('Next child: ')
if u_input == 'p':
selected_index = PARENT
elif u_input == 'q':
return None
elif u_input == 'n':
selected_index = NEXT
elif u_input.startswith('r'):
selected_index = 100 + int(u_input[1:])
else:
try:
selected_index = int(u_input)
except ValueError:
return self
if selected_index >= 0 and selected_index < len(self.similars):
lab_sent = get_lab_sent_by_similar(self.similars[selected_index])
child = LabSentTreeNode(lab_sent, self)
self.children[u_input] = child
self.selected_child = u_input
return child
elif selected_index >= 100 and selected_index < len(self.randoms) + 100:
#print 'random sen'
child = LabSentTreeNode(self.randoms[selected_index - 100], self)
self.children[u_input] = child
self.selected_child = u_input
return child
elif selected_index == PARENT and self.parent:
return self.parent
elif selected_index == NEXT:
child = LabSentTreeNode(sentences[self.sentence_index + 1], self)
self.children[u_input] = child
self.selected_child = u_input
return child
        # out-of-range or unrecognised input: stay on the current node
return self
def toJSON(self):
children_toJSON = {}
for child_index in self.children:
children_toJSON[child_index] = self.children[child_index].toJSON()
return {'sentence':get_print(self.sentence),
                'index': int(get_index(self.sentence)),  # cast numpy int64 so json.dumps works
'children':children_toJSON,
'selected_child':self.selected_child
}
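In [ ]:
# Minimal sketch of using the tree node in isolation (the interactive loop lives in
# the story-creator cell further down). Sentence 0 is an arbitrary starting point;
# added_sentences is the global that get_options_html() expects.
added_sentences = set()
node = LabSentTreeNode(sentences[0])
print get_print(node.sentence)
print len(node.similars), 'similar options,', len(node.randoms), 'random options'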
In [ ]:
# 8 Story creator/log helper functions
def get_story(root_node):
act_sentence = root_node
story = []
while act_sentence:
story.append(get_print(act_sentence.sentence))
if act_sentence.selected_child != '':
act_sentence = act_sentence.children[act_sentence.selected_child]
else:
break
return story
def log_json(root_node):
with open('logs/log-'+str(number_logs)+'.json','w') as output:
output.write(json.dumps(root_node.toJSON(),indent=2))
def log_story(root_node):
story = get_story(root_node)
with open('logs/story-'+str(number_logs)+'.txt','w') as output:
for l in story:
output.write(l+'\n')
def print_story(root_node):
story = get_story(root_node)
for l in story:
print(l)
def dump_story(root_node):
with open('logs/story-'+str(number_logs)+'.dump','w') as dump_file:
pickle.dump(root_node,dump_file)
# note: the loop below could be collapsed into a single list comprehension
def concordance(search_word,sen_range = 0):
sentence_indices = []
for index,sen in enumerate(sentences):
wl = sen[0]
for word in wl:
if word == search_word:
sentence_indices.append(index)
# print 'o',get_print(index)
                for r in range(1, sen_range + 1):
                    # also include neighbouring sentences for context (staying in bounds)
                    if index - r >= 0:
                        sentence_indices.append(index - r)
                    if index + r < len(sentences):
                        sentence_indices.append(index + r)
return sentence_indices
def concordance_result(sentence_indices, sen_range=0):
    if len(sentence_indices) < 10:
        for sentence_index in sentence_indices:
            print sentence_index, get_print(sentence_index)
    else:
        print len(sentence_indices), 'sentences. 5 random ones:'
        for i in range(5):
            sentence_index = random.choice(sentence_indices)
            print sentence_index, get_print(sentence_index)
def ask_word_input():
indices = []
while len(indices) == 0:
first_input = raw_input('First input: ')
indices = concordance(first_input)
print len(indices), 'Sentences with',first_input
sentence = sentences[random.choice(indices)]
return sentence
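In [ ]:
# Usage sketch (not from the original run): look up a word in the corpus and preview
# the matching sentences. 'rain' is just an arbitrary example search term.
hit_indices = concordance('rain')
concordance_result(hit_indices)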
In [ ]:
# test... doesn't work
# print model.most_similar([sentence1,sentence2],[sentence3])
In [ ]:
# An alternative method of creating stories:
# start with a start term and an end term;
# up to `inbetweens` interpolated sentences will be added in between.
import numpy as np
from gensim import matutils
inbetweens = 20
sentence1 = ask_word_input()
print get_print(sentence1)
print '-----'
sentence2 = ask_word_input()
print get_print(sentence2)
vec1 = model.docvecs[get_index_tag(sentence1)]
vec2 = model.docvecs[get_index_tag(sentence2)]
ps = ([np.linspace(vec1[index],vec2[index],inbetweens) for index,v in enumerate(vec1)])
ar = np.ndarray(shape=(inbetweens, len(vec1)), dtype=float)  # one row per interpolation step
for ind,v in enumerate(ps):
for i in range(inbetweens):
ar[i][ind] = v[i]
story = []
topn = 3
model.docvecs.init_sims()
for index, v in enumerate(ar):
print (inbetweens - index - 1),
dists = np.dot(model.docvecs.doctag_syn0norm, v)
best = matutils.argsort(dists, topn, reverse=True)
ind = 0
while best[ind] in story:
ind +=1
if ind == topn:
break
if ind < topn:
# Hard sentence equality comparison
contains = [sen for sen in story if equal_word_lists(best[ind],sen)]
if not contains:
story.append(best[ind])
print '*',
clear_output()
for ind in story:
print get_print(ind)
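In [ ]:
# Equivalent formulation (a sketch, not the notebook's original code): the
# per-dimension loop above can be replaced by plain array arithmetic that builds
# the same (inbetweens x vector_size) interpolation matrix in one step.
steps = np.arange(inbetweens, dtype=float) / (inbetweens - 1)
ar_alt = np.outer(1.0 - steps, vec1) + np.outer(steps, vec2)
print ar_alt.shape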
In [ ]:
#### 9 Story creator
load_from_log = True
added_sentences = set()
number_logs = max(0,len(filter(lambda file_ : file_.endswith('dump'),
os.listdir("logs"))) - 1)
if load_from_log:
with open('logs/story-'+str(number_logs)+'.dump','r') as in_file:
root_node = pickle.load(in_file)
actual_node = root_node
while actual_node.selected_child != '':
added_sentences.add(actual_node.sentence_index)
actual_node = actual_node.children[actual_node.selected_child]
else:
sentence = ask_word_input()
root_node = LabSentTreeNode(sentence)
actual_node = root_node
print get_print(sentence)
while actual_node:
clear_output()
log_json(root_node)
log_story(root_node)
dump_story(root_node)
added_sentences.add(actual_node.sentence_index)
display(HTML(pre + actual_node.get_sentence_html() + actual_node.get_options_html()))
time.sleep(0.4)
actual_node = actual_node.select_child()
clear_output()
print '📖 ⭐ 📖'
print_story(root_node)
print '👋🏽'
In [ ]:
print model.docvecs.doctag_syn0norm
In [ ]:
# TRYING TO TRAIN MORE
# RUNS, BUT DOESN'T EXTEND THE SAVED MODEL FILE
# print len(model.docvecs)
# line = 'Therefore, if you tell me the truth, they are not going to reject what you say.'
# ls = gensim.models.doc2vec.LabeledSentence(words=line.split(), tags=['SENT_%s' % len(sentences)])
# sentences.append(ls)
# model.train([ls])
# print len(model.docvecs)
# print sentences[len(sentences)-1]
# get_similarity_by_index(len(sentences)-1,400)
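In [ ]:
# Possible alternative to extending the model (assumption: the installed gensim
# version already provides Doc2Vec.infer_vector, which newer releases do). It
# derives a vector for an unseen sentence without retraining, which can then be
# compared against the stored sentence vectors.
# new_words = 'Therefore , if you tell me the truth'.split()
# new_vec = model.infer_vector(new_words)
# print model.docvecs.most_similar([new_vec], topn=5)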
In [ ]:
# 1 get the voices out of the browser. js > python :)
display(HTML('''
<script>
voices = window.speechSynthesis.getVoices();
eng_voices = Array()
for(var i=0; i < voices.length; i++){
if(voices[i].lang == "en-US") {
eng_voices.push(voices[i].name);
}
}
console.log(eng_voices);
IPython.notebook.kernel.execute('voices=' + JSON.stringify(eng_voices));
</script>'''))
print voices
In [ ]:
pitch = 1.0
rate = 1.0
voice = 'Junior'
text = get_story(root_node)
var = 'var text= "'+' '.join(text) + '";'
html = '''
<script>
var synth = window.speechSynthesis;
var voices = synth.getVoices();
'''+ var +'''
var utterThis = new SpeechSynthesisUtterance(text);
var selectedOption = "'''+voice+'''"
for(i = 0; i < voices.length ; i++) {
if(voices[i].name === selectedOption) {
utterThis.voice = voices[i];
}
}
utterThis.pitch = '''+str(pitch)+''';
utterThis.rate = '''+str(rate)+''';
synth.speak(utterThis);
</script>
'''
display(HTML(html))
In [ ]:
search_word = 'motherfucker'
sentence_indices = []
le = len(sentences)
for index,sen in enumerate(sentences):
wl = sen[0]
for word in wl:
if word == search_word:
sentence_indices.append(index)
In [ ]:
print len(sentence_indices)
print(sentences[sentence_indices[2]])
print get_print(sentences[sentence_indices[2]])
In [ ]:
# word-analogy accuracy evaluation; needs the questions-words.txt test set
# (from the original word2vec distribution) in the working directory
model.accuracy('questions-words.txt')