In [ ]:
import gensim
from IPython.display import clear_output
import json
import re
import random
import time
import pickle
from IPython.core.display import display, HTML
import locale, os, socket
from numpy import int64
locale.setlocale(locale.LC_ALL, 'de_DE.utf-8')

if not os.path.exists('logs'):
    os.makedirs('logs')

host = socket.gethostname() 
if host == 'lyrik':
    model_file = '/home/ramin/projects/ECO/src/python/modelbuilder/parsed_v3_valid.doc2vec'
    print "U ARE ON LYRIK"
    # ... move data and model into some convenient folder, so that model/parsed_v3_valid is there and
    # NAIL_DATAFIELD_txt/parsed_v3/parsed_v3_valid.txt is there
else:
    # local
    model_file = '../../models/parsed_v3_valid.doc2vec'    

if not os.path.isfile(model_file):
    print "MODEL FILE IS NOT THERE. GO AND FIND IT"

In [ ]:
# 2 Build sentence list (each sentence needs at least 1 tag)
import socket, os
import gensim

host = socket.gethostname() 
print(host)
model_file = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.doc2vec'

if host == 'lyrik':
    filename = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.txt'
else:
    # local
    filename = '../../data/parsed_v3/parsed_v3_valid.txt' # parsed_v3_all.txt

if not os.path.isfile(filename):
    print "TEXTFILE FILE IS NOT THERE"    

sentences = []
from random import shuffle


for uid, line in enumerate(open(filename)):
    if uid % 1000 == 0:
        print(str(uid))
    csv_split = line.split(';')
    # words must be a token list, not a raw string; strip the trailing newline off the tag
    ls = gensim.models.doc2vec.LabeledSentence(words=csv_split[0].split(), tags=['SENT_%s' % uid, csv_split[1].strip()])
    sentences.append(ls)
print len(sentences),'sentences'
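
The parser above assumes each input line looks like "tokenized sentence;TAG". A quick sanity check on the first parsed sentence:

In [ ]:
# sanity check: a LabeledSentence is essentially (words, tags)
print sentences[0].words[:10]
print sentences[0].tags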

In [ ]:
# 3 TRAINING OR LOADING the doc2vec model and save it
# ALTERNATIVE: LOAD THE MODEL IN THE NEXT CELL

# tutorial https://rare-technologies.com/doc2vec-tutorial/
# proposes shuffling or learning rate adjustment; we do both,
# 20 epochs in total
# took ca. 6:30 hours

# FOR SAFETY REASON, BUILD ONLY WHEN FLAG IS SET

train_model = True

if train_model:
    model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
    print('building vocab') 
    model.build_vocab(sentences)

    base_alpha = model.alpha
    base_min_alpha = model.min_alpha

    for mepoch in range(2):
        model.alpha = base_alpha 
        model.min_alpha = base_min_alpha
        for epoch in range(10):
            print 'epoch', mepoch * 10 + epoch
            model.train(sentences)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
        shuffle(sentences)

    # saving the model    
    model.save(model_file)
    print 'model trained and saved'
else:
    model = gensim.models.Doc2Vec.load(model_file)
    print 'model loaded.',len(model.docvecs), 'vectors'
    if len(sentences) != len(model.docvecs):
        print 'something is fishy, unequal length: ',len(sentences),'sentences and',len(model.docvecs), 'vectors'
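
Note: model.train(sentences) above matches the pre-1.0 gensim API (as does LabeledSentence, later renamed TaggedDocument). In gensim >= 1.0, train() requires explicit counts; a minimal sketch of the same loop for newer versions:

In [ ]:
# sketch for gensim >= 1.0 (not run here): train() needs total_examples and epochs
# model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025)
# model.build_vocab(sentences)
# for epoch in range(20):
#     model.train(sentences, total_examples=model.corpus_count, epochs=1)
#     model.alpha -= 0.002           # decrease the learning rate
#     model.min_alpha = model.alpha  # fix the learning rate, no decay
#     shuffle(sentences)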

In [ ]:
# 4 Tiny helper functions

def print_word_list(wl):
    # join tokens and pull punctuation back onto the preceding word
    s = ' '.join(wl)
    pattern = re.compile(r'\s\W\s')
    shift = 0
    for ma in pattern.finditer(s):
        s = s[:ma.start(0)-shift] + ma.group(0)[1:] + s[ma.end(0)-shift:]
        shift += 1
    # drop the space before trailing punctuation
    if s[-2] == ' ':
        s = s[:-2] + s[-1:]
    return s

def get_print(sentence_or_similar):
    if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
        word_list = sentence_or_similar.words
    elif type(sentence_or_similar) is int64 or type(sentence_or_similar) is int: # just an index
        word_list = sentences[sentence_or_similar].words
    else: # a ('SENT_<index>', similarity) tuple as returned by most_similar
        word_list = sentences[int(sentence_or_similar[0][5:])].words
    return print_word_list(word_list)
    
def get_index_tag(sentence):
    return sentence.tags[0]

def get_index(sentence_or_similar):
    if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
        return int64(get_index_tag(sentence_or_similar)[5:])
    else:
        return int64(sentence_or_similar[0][5:])
    
def equal_word_lists(index1, index2):
    wl1 = sentences[index1].words
    wl2 = sentences[index2].words
    if len(wl1) != len(wl2):
        return False
    else:
        for i in range(len(wl1)):
            if wl1[i] != wl2[i]:
                return False
    return True

def get_lab_sent_by_similar(similar):
    print get_index(similar)
    return sentences[get_index(similar)]

def get_similarity_by_index(index1, index2):
    return model.docvecs.similarity(index1,index2)

# HTML Helper
def pack_into_elem(tag, clazz, content):
    return '<' + tag + ' class="' + clazz + '"> ' + content+ ' </' + tag +'>'

pre = '''<style>
          .act {font-weight: bold}
          .i {color: grey}
          .sim {color: orange}
          .n {color: blue}
          .p {color: red}
          .r {color: green}          
     </style>'''
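
A quick check of the detokenizer above (hypothetical token list):

In [ ]:
# punctuation tokens get re-attached to the word on their left
print print_word_list(['Hello', ',', 'world', '!'])  # -> Hello, world!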

In [ ]:
get_print(4)

In [ ]:
# 5 Test: printing a test sentence and getting the most similar ones.

do_tests = False

if do_tests:
    test_sentence_index = 2639533
    print get_print(test_sentence_index)
    sims = model.docvecs.most_similar('SENT_'+str(test_sentence_index),topn = 30)
    print len(sims), 'similar sentences'
    print '\nSIMILAR SENTENCES\n'
    for sim in sims:
        print get_print(sim),sim
        
# 6 Test: iterate over similar sentences
# needs the sentences loaded (cell 2)
if do_tests:
    index = 1983
    # len(sentences)
    # print sentences[index]
    sentence = get_print(index)
    print sentence
    selected_indices = [index]

    for _ in range(10):
        sims = model.docvecs.most_similar('SENT_'+str(index))
        while True:
            selected = random.choice(sims)
            check_index = int(selected[0][5:])
            if check_index not in selected_indices:
                break
        index = check_index
        selected_indices.append(index)
        print get_print(selected)

In [ ]:
# 7 Story Treenode class

PARENT = -1
QUIT = -2
NEXT = -3

num_similars = 20
num_random = 20

class LabSentTreeNode:
    
    def __init__(self, labeledSentence, parent = None):
        self.sentence = labeledSentence
        self.sentence_index = get_index(self.sentence)
        self.similars = self.get_similars()  
        self.randoms = self.get_randoms()
        self.children = {} # index: SentenceTreeNode
        self.selected_child = ''  # key into self.children; '' means no child selected yet
        self.parent = parent
             
    def get_similars(self):
        return model.docvecs.most_similar(get_index_tag(self.sentence),topn = num_similars)
    
    def get_randoms(self):
        randoms = []
        for index in range(num_random):
            rnd_sen = sentences[random.randrange(len(sentences))]  # randint(0, n) would overshoot by one
            randoms.append(rnd_sen)     
        return randoms
                
    def print_options(self):
        for index, sentence in enumerate(self.similars):
            add = '(*)' if str(index) in self.children else ''  # children are keyed by the typed option
            print index, add, get_print(sentence), "%.3f" % sentence[1]
        if self.parent:
            print 'p: ', get_print(self.parent.sentence)
        if self.sentence_index < len(sentences) - 2:
            print 'n: ', get_print(sentences[self.sentence_index + 1])           
        for index,sentence in enumerate(self.randoms):
            print 'r'+str(index) +": ",  get_print(sentence)
            
    def get_options_html(self):
        html = ''
        for index, sentence in enumerate(self.similars):
            content = pack_into_elem('span','',get_print(sentence))
            index = pack_into_elem('span','',str(index)+': ')
            sentence_index = get_index(sentence)
            index += '❗️' if sentence_index in added_sentences else ''
            index_distance =  locale.format('%d', abs(self.sentence_index - sentence_index), 1)
            similar = pack_into_elem('span','sim',("%.3f" % sentence[1]) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
            html += pack_into_elem('div', '', index + content + similar)
        if self.parent:
            content = pack_into_elem('span','',get_print(self.parent.sentence))
            index = pack_into_elem('span','','P: ')
            sentence_index = self.parent.sentence_index
            index_distance =  locale.format('%d', abs(self.sentence_index - sentence_index), 1)
            similarity = get_similarity_by_index(self.sentence_index, self.parent.sentence_index)
            similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
            html += pack_into_elem('div', 'p', index + content + similar)
        if self.sentence_index < len(sentences) - 2:
            content = pack_into_elem('span','n',get_print(sentences[self.sentence_index + 1]))
            index = pack_into_elem('span','','N: ')
            sentence_index = self.sentence_index + 1
            index += '❗️' if sentence_index in added_sentences else ''
            index_distance =  locale.format('%d', abs(self.sentence_index - sentence_index), 1)
            similarity = get_similarity_by_index(self.sentence_index, self.sentence_index + 1)
            similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
            html += pack_into_elem('div', 'n', index + content + similar)
        for index,sentence in enumerate(self.randoms):
            content = pack_into_elem('span','',get_print(sentence))
            index = pack_into_elem('span','','R'+str(index)+': ')
            sentence_index = get_index(sentence)
            index += '❗️' if sentence_index in added_sentences else ''
            index_distance =  locale.format('%d', abs(self.sentence_index - sentence_index), 1)
            similarity = get_similarity_by_index(self.sentence_index, get_index(sentence))
            similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
            html += pack_into_elem('div', 'r', index + content + similar)
        html += pack_into_elem('div', '', 'Q: Quit 💣')
        return html
    
    def get_sentence_html(self):
        return pack_into_elem('p', 'act', get_print(self.sentence))
    
    def select_child(self):
        u_input = raw_input('Next child: ')
        if u_input == 'p':
            selected_index = PARENT
        elif u_input == 'q':
            return None
        elif u_input == 'n':
            selected_index = NEXT
        elif u_input.startswith('r'):
            selected_index = 100 + int(u_input[1:])
        else:
            try:
                selected_index = int(u_input)
            except ValueError:
                return self
        if 0 <= selected_index < len(self.similars):
            lab_sent = get_lab_sent_by_similar(self.similars[selected_index])
            child =  LabSentTreeNode(lab_sent, self)
            self.children[u_input] = child
            self.selected_child = u_input
            return child
        elif 100 <= selected_index < 100 + len(self.randoms):
            #print 'random sen'
            child =  LabSentTreeNode(self.randoms[selected_index - 100], self)
            self.children[u_input] = child
            self.selected_child = u_input   
            return child
        elif selected_index == PARENT and self.parent:
            return self.parent
        elif selected_index == NEXT:
            child =  LabSentTreeNode(sentences[self.sentence_index + 1], self)
            self.children[u_input] = child
            self.selected_child = u_input   
            return child
        # unrecognized input: stay on the current node
        return self
        
    def toJSON(self):
        children_toJSON = {}
        for child_index in self.children:
            children_toJSON[child_index] = self.children[child_index].toJSON()
            
        return {'sentence':get_print(self.sentence),
                'index':int(get_index(self.sentence)),  # cast numpy int64 so json.dumps can serialize it
               'children':children_toJSON,
                'selected_child':self.selected_child
               }
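
A minimal interactive round with the tree node, assuming the model and sentences from the cells above are loaded (index 1983 is just an arbitrary example):

In [ ]:
# sketch: start a tree at an arbitrary sentence and take one interactive step
added_sentences = set()  # get_options_html() expects this global to exist
node = LabSentTreeNode(sentences[1983])
display(HTML(pre + node.get_sentence_html() + node.get_options_html()))
node = node.select_child()  # returns the chosen child, the parent, or None on 'q'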

In [ ]:
# 8 Story creator/log helper functions

def get_story(root_node):
    act_sentence = root_node
    story = []
    while act_sentence:
        story.append(get_print(act_sentence.sentence))
        if act_sentence.selected_child  != '':
            act_sentence = act_sentence.children[act_sentence.selected_child]
        else:
            break
    return story

def log_json(root_node):
    with open('logs/log-'+str(number_logs)+'.json','w') as output:
        output.write(json.dumps(root_node.toJSON(),indent=2))
    
def log_story(root_node):
    story = get_story(root_node)
    with open('logs/story-'+str(number_logs)+'.txt','w') as output:
        for l in story:
            output.write(l+'\n')   
    
def print_story(root_node):
    story = get_story(root_node)
    for l in story:
        print(l)   

def dump_story(root_node):
    with open('logs/story-'+str(number_logs)+'.dump','w') as dump_file:
        pickle.dump(root_node,dump_file)
        
# TODO: could be collapsed into a single list comprehension
def concordance(search_word, sen_range = 0):
    sentence_indices = []
    for index, sen in enumerate(sentences):
        wl = sen.words
        for word in wl:
            if word == search_word:
                sentence_indices.append(index)
                # also keep sen_range sentences of context, guarding the list bounds
                for r in range(1, sen_range + 1):
                    if index - r >= 0:
                        sentence_indices.append(index - r)
                    if index + r < len(sentences):
                        sentence_indices.append(index + r)
    return sentence_indices

def concordance_result(indices):
    # indices are sentence indices as returned by concordance()
    if len(indices) < 10:
        for index in indices:
            print index, get_print(index)
    else:
        print len(indices), 'sentences. 5 random ones:'
        for i in range(5):
            index = random.choice(indices)
            print index, get_print(index)

def ask_word_input():
    indices = []
    while len(indices) == 0:
        first_input = raw_input('First input: ')
        indices = concordance(first_input)
    print len(indices), 'Sentences with',first_input
    sentence = sentences[random.choice(indices)]
    return sentence
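
Example usage of the concordance helpers ('night' is just an arbitrary search word):

In [ ]:
# find every sentence containing a word, plus one sentence of context on each side
indices = concordance('night', sen_range = 1)
concordance_result(indices)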

In [ ]:
# test... doesn't work
# print model.most_similar([sentence1,sentence2],[sentence3])

In [ ]:
# An alternative method of creating stories:
# start with a start term and an end term;
# up to 'inbetweens' sentences in between will be added.

import numpy as np
from gensim import matutils

inbetweens = 20

sentence1 = ask_word_input()
print get_print(sentence1)
print '-----'
sentence2 = ask_word_input()
print get_print(sentence2)

vec1 = model.docvecs[get_index_tag(sentence1)]
vec2 = model.docvecs[get_index_tag(sentence2)]


# interpolate linearly between the two sentence vectors, one dimension at a time
ps = [np.linspace(vec1[index], vec2[index], inbetweens) for index, v in enumerate(vec1)]
ar = np.ndarray(shape=(inbetweens, len(vec1)), dtype=float)
for ind, v in enumerate(ps):
    for i in range(inbetweens):
        ar[i][ind] = v[i]

story = []
topn = 3
model.docvecs.init_sims()
for index, v in enumerate(ar):
    print (inbetweens - index - 1),
    dists = np.dot(model.docvecs.doctag_syn0norm, v)
    best = matutils.argsort(dists, topn, reverse=True)
    ind = 0
    while best[ind] in story:
        ind +=1
        if ind == topn:
            break

    if ind < topn:
        # Hard sentence equality comparison
        contains = [sen for sen in story if equal_word_lists(best[ind],sen)]
        if not contains:
            story.append(best[ind])
            print '*',

clear_output()        
for ind in story:
    print get_print(ind)
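
The loop above does the nearest-neighbour search by hand against doctag_syn0norm. In gensim versions where docvecs.most_similar accepts raw vectors in positive, the same lookup could be written as (a sketch, assuming that API is available):

In [ ]:
# sketch: let gensim find the nearest stored doc vectors for an interpolated vector
# sims = model.docvecs.most_similar(positive=[ar[0]], topn=3)
# for tag, similarity in sims:
#     print tag, similarity, get_print((tag, similarity))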

In [ ]:
# 9 Story creator

load_from_log = True

added_sentences = set()  

number_logs = max(0,len(filter(lambda file_ : file_.endswith('dump'),
                          os.listdir("logs"))) - 1)

if load_from_log:
    with open('logs/story-'+str(number_logs)+'.dump','r') as in_file:   
        root_node = pickle.load(in_file)
        actual_node = root_node
        while actual_node.selected_child != '':
            added_sentences.add(actual_node.sentence_index)
            actual_node = actual_node.children[actual_node.selected_child]            
else:  
    sentence = ask_word_input()
    root_node = LabSentTreeNode(sentence)
    actual_node = root_node
  
    print get_print(sentence)
while actual_node:
    clear_output()
    log_json(root_node)
    log_story(root_node)
    dump_story(root_node)
    added_sentences.add(actual_node.sentence_index)
    display(HTML(pre + actual_node.get_sentence_html() + actual_node.get_options_html()))
    time.sleep(0.4)
    actual_node = actual_node.select_child()
clear_output()
print '📖 ⭐ 📖'
print_story(root_node)
print '👋🏽'

In [ ]:
print model.docvecs.doctag_syn0norm

In [ ]:
# TRYING TO TRAIN MORE
# RUNS BUT DOESN'T EXTEND THE MODEL FILE
# print len(model.docvecs)
# line = 'Therefore, if you tell me the truth, they are not going to reject what you say.'
# ls = gensim.models.doc2vec.LabeledSentence(words=line.split(), tags=['SENT_%s' % len(sentences)])
# sentences.append(ls)
# model.train([ls])
# print len(model.docvecs)
# print sentences[len(sentences)-1]
# get_similarity_by_index(len(sentences)-1,400)

The next part is for speech synthesis in the browser: get your story spoken.

  1. get the English voices out of the browser
  2. let it speak, tweak some parameters...

In [ ]:
# 1 get the voices out of the browser. js > python :)
display(HTML('''
<script>
// note: getVoices() can be empty until the browser's voiceschanged event fires
var voices = window.speechSynthesis.getVoices();
var eng_voices = [];
for(var i = 0; i < voices.length; i++){
    if(voices[i].lang == "en-US") {
        eng_voices.push(voices[i].name);
    }
}
console.log(eng_voices);
// JSON.stringify yields a valid Python list literal; the raw array would not
IPython.notebook.kernel.execute('voices=' + JSON.stringify(eng_voices));
</script>'''))
print voices

In [ ]:
pitch = 1.0
rate = 1.0
voice = 'Junior'

text = get_story(root_node)
var = 'var text= "'+' '.join(text) + '";'

html = '''
<script>
var synth = window.speechSynthesis;
var voices = synth.getVoices();

'''+ var +'''
  var utterThis = new SpeechSynthesisUtterance(text);
  var selectedOption = "'''+voice+'''"
  for(i = 0; i < voices.length ; i++) {
    if(voices[i].name === selectedOption) {
      utterThis.voice = voices[i];
    }
  }
  utterThis.pitch = '''+str(pitch)+''';
  utterThis.rate = '''+str(rate)+''';
  synth.speak(utterThis);

</script>
'''
display(HTML(html))

Search for all sentences with a specific word


In [ ]:
search_word = 'motherfucker'
# same lookup as concordance(search_word) above, spelled out
sentence_indices = []

for index, sen in enumerate(sentences):
    wl = sen.words
    for word in wl:
        if word == search_word:
            sentence_indices.append(index)

In [ ]:
print len(sentence_indices) 
print(sentences[sentence_indices[2]])
print get_print(sentences[sentence_indices[2]])

Model accuracy function outputs bullsh...


In [ ]:
# not sure what crap output that is...
model.accuracy('questions-words.txt')
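
accuracy() runs the word2vec analogy test set against the model's word vectors, not the document vectors, so weak results on a story corpus are expected. A sketch for summarizing its output, assuming the pre-1.0 gensim return format (a list of dicts with 'section', 'correct' and 'incorrect' keys):

In [ ]:
# sketch: per-section summary of the analogy test results
# sections = model.accuracy('questions-words.txt')
# for section in sections:
#     total = len(section['correct']) + len(section['incorrect'])
#     if total:
#         print section['section'], '%d/%d correct' % (len(section['correct']), total)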