In [ ]:
import gensim
from IPython.display import clear_output
import json
import re
import random
import time
import pickle
from IPython.core.display import display, HTML
import locale, os, socket
from numpy import int64
locale.setlocale(locale.LC_ALL, 'de_DE.utf-8')
if not os.path.exists('logs'):
os.makedirs('logs')
host = socket.gethostname()
if host == 'lyrik':
model_file = '/home/ramin/projects/ECO/src/python/modelbuilder/parsed_v3_valid.doc2vec'
print "U ARE ON LYRIK"
    # ... move data and model into some convenient folder, so that model/parsed_v3_valid is there and
# NAIL_DATAFIELD_txt/parsed_v3/parsed_v3_valid.txt is there
else:
# local
model_file = '../../models/parsed_v3_valid.doc2vec'
if not os.path.isfile(model_file):
print "MODEL FILE IS NOT THERE. GO AND FIND IT"
In [ ]:
# 2 Build sentence list (each sentence needs at least 1 tag)
import socket, os
import gensim
host = socket.gethostname()
print(host)
if host == 'lyrik':
    model_file = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.doc2vec'
    filename = '/home/marcel/drive/data/eco/NAIL_DATAFIELD_txt/parsed_v4/parsed_v4_valid.txt'
else:
    # local (model_file from the first cell is kept)
    filename = '../../data/parsed_v3/parsed_v3_valid.txt' # parsed_v3_all.txt
if not os.path.isfile(filename):
print "TEXTFILE FILE IS NOT THERE"
sentences = []
from random import shuffle
for uid, line in enumerate(open(filename)):
if uid % 1000 == 0:
print(str(uid))
csv_split = line.split(';')
    # words must be a token list (not a raw string); strip the newline from the tag
    ls = gensim.models.doc2vec.LabeledSentence(words=csv_split[0].split(), tags=['SENT_%s' % uid, csv_split[1].strip()])
sentences.append(ls)
print len(sentences),'sentences'
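In [ ]:
# Sketch (not part of the original run): sanity-check the expected input format.
# Assumes every line of the text file looks like "<tokenised sentence>;<tag>";
# sample_line and EXAMPLE_TAG are made up for illustration.
sample_line = 'the cat sat on the mat ;EXAMPLE_TAG\n'
parts = sample_line.split(';')
example = gensim.models.doc2vec.LabeledSentence(
    words=parts[0].split(),              # list of tokens, not a raw string
    tags=['SENT_0', parts[1].strip()])   # positional tag plus the source tag
print example.words, example.tags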
In [ ]:
# 3 TRAIN OR LOAD the doc2vec model and save it
# ALTERNATIVE: load the existing model by setting train_model = False
# tutorial https://rare-technologies.com/doc2vec-tutorial/
# proposes shuffling or learning-rate adjustment; we do both
# in total 20 epochs
# training took ca. 6:30 hours
# FOR SAFETY REASONS, TRAIN ONLY WHEN THE FLAG IS SET
train_model = True
if train_model:
model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025) # use fixed learning rate
print('building vocab')
model.build_vocab(sentences)
base_alpha = model.alpha
base_min_alpha = model.min_alpha
for mepoch in range(2):
model.alpha = base_alpha
model.min_alpha = base_min_alpha
for epoch in range(10):
print('epoch',mepoch * 10 + epoch)
model.train(sentences)
model.alpha -= 0.002 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
shuffle(sentences)
# saving the model
model.save(model_file)
print 'model trained and saved'
else:
model = gensim.models.Doc2Vec.load(model_file)
print 'model loaded.',len(model.docvecs), 'vectors'
if len(sentences) != len(model.docvecs):
print 'something is fishy, unequal length: ',len(sentences),'sentences and',len(model.docvecs), 'vectors'
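In [ ]:
# Quick sanity check (illustrative, not from the original run): each trained
# sentence can be looked up by its 'SENT_<n>' tag; the vector length is whatever
# Doc2Vec defaulted to above, since the constructor call does not set `size`.
vec = model.docvecs['SENT_0']
print len(vec)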
In [ ]:
# 4 Tiny helper functions
def print_word_list(wl):
    # print wl  # debug
    s = ' '.join(wl)
    # pull standalone punctuation back onto the preceding word
    pattern = re.compile(r'\s\W\s')
    shift = 0
    for ma in pattern.finditer(s):
        s = s[:ma.start(0) - shift] + ma.group(0)[1:] + s[ma.end(0) - shift:]
        shift += 1
    if len(s) > 1 and s[-2] == ' ':
        s = s[:-2] + s[-1:]
    return s
def get_print(sentence_or_similar):
    if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
        word_list = sentence_or_similar.words
    elif type(sentence_or_similar) is int64 or type(sentence_or_similar) is int: # just an index
        word_list = sentences[sentence_or_similar].words
    else: # a ('SENT_<n>', similarity) tuple from most_similar()
        word_list = sentences[int(sentence_or_similar[0][5:])].words
    return print_word_list(word_list)
def get_index_tag(sentence):
return sentence.tags[0]
def get_index(sentence_or_similar):
if type(sentence_or_similar) is gensim.models.doc2vec.LabeledSentence:
return int64(get_index_tag(sentence_or_similar)[5:])
else:
return int64(sentence_or_similar[0][5:])
def equal_word_lists(index1, index2):
wl1 = sentences[index1].words
wl2 = sentences[index2].words
if len(wl1) != len(wl2):
return False
else:
for i in range(len(wl1)):
if wl1[i] != wl2[i]:
return False
return True
def get_lab_sent_by_similar(similar):
print get_index(similar)
return sentences[get_index(similar)]
def get_similarity_by_index(index1, index2):
return model.docvecs.similarity(index1,index2)
# HTML Helper
def pack_into_elem(tag, clazz, content):
return '<' + tag + ' class="' + clazz + '"> ' + content+ ' </' + tag +'>'
pre = '''<style>
.act {font-weight: bold}
.i {color: grey}
.sim {color: orange}
.n {color: blue}
.p {color: red}
.r {color: green}
</style>'''
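In [ ]:
# Illustrative check of the HTML helpers (not from the original run): wrap a string
# in one of the styled spans defined above and render it.
display(HTML(pre + pack_into_elem('p', 'sim', 'hello similarity')))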
In [ ]:
get_print(4)
In [ ]:
# 5 Test: print a test sentence and get the most similar ones.
do_tests = False
if do_tests:
test_sentence_index = 2639533
print get_print(test_sentence_index)
sims = model.docvecs.most_similar('SENT_'+str(test_sentence_index),topn = 30)
    print 'similar sentences:', len(sims)
print '\nSIMILAR SENTENCES\n'
for sim in sims:
print get_print(sim),sim
# 6 Test: iterate over similar sentences
# needs the sentences loaded (cell 2)
if do_tests:
index = 1983
# len(sentences)
# print sentences[index]
sentence = get_print(index)
print sentence
selected_indices = [index]
    for _ in range(10):
sims = model.docvecs.most_similar('SENT_'+str(index))
while True:
selected = random.choice(sims)
check_index = int(selected[0][5:])
if check_index not in selected_indices:
break
index = check_index
selected_indices.append(index)
print get_print(selected)
In [ ]:
# 7 Story tree node class
PARENT = -1
QUIT = -2
NEXT = -3
num_similars = 20
num_random = 20
class LabSentTreeNode:
def __init__(self, labeledSentence, parent = None):
self.sentence = labeledSentence
self.sentence_index = get_index(self.sentence)
self.similars = self.get_similars()
self.randoms = self.get_randoms()
self.children = {} # index: SentenceTreeNode
self.selected_child = '' # None
self.parent = parent
def get_similars(self):
return model.docvecs.most_similar(get_index_tag(self.sentence),topn = num_similars)
def get_randoms(self):
randoms = []
for index in range(num_random):
            rnd_sen = sentences[random.randint(0, len(sentences) - 1)]
randoms.append(rnd_sen)
return randoms
def print_options(self):
for index, sentence in enumerate(self.similars):
            add = '(*)' if str(index) in self.children else ''  # mark options already expanded
print index, add, get_print(sentence), "%.3f" % sentence[1]
if self.parent:
print 'p: ', get_print(self.parent.sentence)
if self.sentence_index < len(sentences) - 2:
print 'n: ', get_print(sentences[self.sentence_index + 1])
for index,sentence in enumerate(self.randoms):
print 'r'+str(index) +": ", get_print(sentence)
def get_options_html(self):
html = ''
for index, sentence in enumerate(self.similars):
content = pack_into_elem('span','',get_print(sentence))
index = pack_into_elem('span','',str(index)+': ')
sentence_index = get_index(sentence)
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similar = pack_into_elem('span','sim',("%.3f" % sentence[1]) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', '', index + content + similar)
if self.parent:
content = pack_into_elem('span','',get_print(self.parent.sentence))
index = pack_into_elem('span','','P: ')
sentence_index = self.parent.sentence_index
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, self.parent.sentence_index)
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'p', index + content + similar)
if self.sentence_index < len(sentences) - 2:
content = pack_into_elem('span','n',get_print(sentences[self.sentence_index + 1]))
index = pack_into_elem('span','','N: ')
sentence_index = self.sentence_index + 1
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, self.sentence_index + 1)
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'n', index + content + similar)
for index,sentence in enumerate(self.randoms):
content = pack_into_elem('span','',get_print(sentence))
index = pack_into_elem('span','','R'+str(index)+': ')
sentence_index = get_index(sentence)
index += '❗️' if sentence_index in added_sentences else ''
index_distance = locale.format('%d', abs(self.sentence_index - sentence_index), 1)
similarity = get_similarity_by_index(self.sentence_index, get_index(sentence))
similar = pack_into_elem('span','sim',("%.3f" % similarity) + ' / ' + str(index_distance) + ' / ' + str(self.sentence_index))
html += pack_into_elem('div', 'r', index + content + similar)
html += pack_into_elem('div', '', 'Q: Quit 💣')
return html
def get_sentence_html(self):
return pack_into_elem('p', 'act', get_print(self.sentence))
def select_child(self):
u_input = raw_input('Next child: ')
if u_input == 'p':
selected_index = PARENT
elif u_input == 'q':
return None
elif u_input == 'n':
selected_index = NEXT
elif u_input.startswith('r'):
selected_index = 100 + int(u_input[1:])
else:
try:
selected_index = int(u_input)
except ValueError:
return self
if selected_index >= 0 and selected_index < len(self.similars):
lab_sent = get_lab_sent_by_similar(self.similars[selected_index])
child = LabSentTreeNode(lab_sent, self)
self.children[u_input] = child
self.selected_child = u_input
return child
elif selected_index >= 100 and selected_index < len(self.randoms) + 100:
#print 'random sen'
child = LabSentTreeNode(self.randoms[selected_index - 100], self)
self.children[u_input] = child
self.selected_child = u_input
return child
elif selected_index == PARENT and self.parent:
return self.parent
elif selected_index == NEXT:
child = LabSentTreeNode(sentences[self.sentence_index + 1], self)
self.children[u_input] = child
self.selected_child = u_input
return child
        # out-of-range or unrecognised input: stay on the current node
return self
def toJSON(self):
children_toJSON = {}
for child_index in self.children:
children_toJSON[child_index] = self.children[child_index].toJSON()
return {'sentence':get_print(self.sentence),
                'index': int(get_index(self.sentence)),  # cast numpy int64 so json.dumps works
'children':children_toJSON,
'selected_child':self.selected_child
}
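In [ ]:
# Minimal sketch of using the tree node in isolation (the interactive loop lives in
# the story-creator cell further down). Sentence 0 is an arbitrary starting point;
# added_sentences is the global that get_options_html() expects.
added_sentences = set()
node = LabSentTreeNode(sentences[0])
print get_print(node.sentence)
print len(node.similars), 'similar options,', len(node.randoms), 'random options'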
In [ ]:
# 8 Story creator/log helper functions
def get_story(root_node):
act_sentence = root_node
story = []
while act_sentence:
story.append(get_print(act_sentence.sentence))
if act_sentence.selected_child != '':
act_sentence = act_sentence.children[act_sentence.selected_child]
else:
break
return story
def log_json(root_node):
with open('logs/log-'+str(number_logs)+'.json','w') as output:
output.write(json.dumps(root_node.toJSON(),indent=2))
def log_story(root_node):
story = get_story(root_node)
with open('logs/story-'+str(number_logs)+'.txt','w') as output:
for l in story:
output.write(l+'\n')
def print_story(root_node):
story = get_story(root_node)
for l in story:
print(l)
def dump_story(root_node):
with open('logs/story-'+str(number_logs)+'.dump','w') as dump_file:
pickle.dump(root_node,dump_file)
# note: the loop below could be collapsed into a single list comprehension
def concordance(search_word,sen_range = 0):
sentence_indices = []
for index,sen in enumerate(sentences):
wl = sen[0]
for word in wl:
if word == search_word:
sentence_indices.append(index)
# print 'o',get_print(index)
                for r in range(1, sen_range + 1):
                    # also include neighbouring sentences for context (staying in bounds)
                    if index - r >= 0:
                        sentence_indices.append(index - r)
                    if index + r < len(sentences):
                        sentence_indices.append(index + r)
return sentence_indices
def concordance_result(sentence_indices, sen_range=0):
    if len(sentence_indices) < 10:
        for sentence_index in sentence_indices:
            print sentence_index, get_print(sentence_index)
    else:
        print len(sentence_indices), 'sentences. 5 random ones:'
        for i in range(5):
            sentence_index = random.choice(sentence_indices)
            print sentence_index, get_print(sentence_index)
def ask_word_input():
indices = []
while len(indices) == 0:
first_input = raw_input('First input: ')
indices = concordance(first_input)
print len(indices), 'Sentences with',first_input
sentence = sentences[random.choice(indices)]
return sentence
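In [ ]:
# Usage sketch (not from the original run): look up a word in the corpus and preview
# the matching sentences. 'rain' is just an arbitrary example search term.
hit_indices = concordance('rain')
concordance_result(hit_indices)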
In [ ]:
# test... doesn't work
# print model.most_similar([sentence1,sentence2],[sentence3])
In [ ]:
# An alternative method of creating stories:
# start with a start term and an end term;
# up to `inbetweens` interpolated sentences will be added in between.
import numpy as np
from gensim import matutils
inbetweens = 20
sentence1 = ask_word_input()
print get_print(sentence1)
print '-----'
sentence2 = ask_word_input()
print get_print(sentence2)
vec1 = model.docvecs[get_index_tag(sentence1)]
vec2 = model.docvecs[get_index_tag(sentence2)]
ps = ([np.linspace(vec1[index],vec2[index],inbetweens) for index,v in enumerate(vec1)])
ar = np.ndarray(shape=(inbetweens, len(vec1)), dtype=float)  # one row per interpolation step
for ind,v in enumerate(ps):
for i in range(inbetweens):
ar[i][ind] = v[i]
story = []
topn = 3
model.docvecs.init_sims()
for index, v in enumerate(ar):
print (inbetweens - index - 1),
dists = np.dot(model.docvecs.doctag_syn0norm, v)
best = matutils.argsort(dists, topn, reverse=True)
ind = 0
while best[ind] in story:
ind +=1
if ind == topn:
break
if ind < topn:
# Hard sentence equality comparison
contains = [sen for sen in story if equal_word_lists(best[ind],sen)]
if not contains:
story.append(best[ind])
print '*',
clear_output()
for ind in story:
print get_print(ind)
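In [ ]:
# Equivalent formulation (a sketch, not the notebook's original code): the
# per-dimension loop above can be replaced by plain array arithmetic that builds
# the same (inbetweens x vector_size) interpolation matrix in one step.
steps = np.arange(inbetweens, dtype=float) / (inbetweens - 1)
ar_alt = np.outer(1.0 - steps, vec1) + np.outer(steps, vec2)
print ar_alt.shape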
In [ ]:
#### 9 Story creator
load_from_log = True
added_sentences = set()
number_logs = max(0,len(filter(lambda file_ : file_.endswith('dump'),
os.listdir("logs"))) - 1)
if load_from_log:
with open('logs/story-'+str(number_logs)+'.dump','r') as in_file:
root_node = pickle.load(in_file)
actual_node = root_node
while actual_node.selected_child != '':
added_sentences.add(actual_node.sentence_index)
actual_node = actual_node.children[actual_node.selected_child]
else:
sentence = ask_word_input()
root_node = LabSentTreeNode(sentence)
actual_node = root_node
print get_print(sentence)
while actual_node:
clear_output()
log_json(root_node)
log_story(root_node)
dump_story(root_node)
added_sentences.add(actual_node.sentence_index)
display(HTML(pre + actual_node.get_sentence_html() + actual_node.get_options_html()))
time.sleep(0.4)
actual_node = actual_node.select_child()
clear_output()
print '📖 ⭐ 📖'
print_story(root_node)
print '👋🏽'
In [ ]:
print model.docvecs.doctag_syn0norm
In [ ]:
# TRYING TO TRAIN MORE
# RUNS, BUT DOESN'T EXTEND THE SAVED MODEL FILE
# print len(model.docvecs)
# line = 'Therefore, if you tell me the truth, they are not going to reject what you say.'
# ls = gensim.models.doc2vec.LabeledSentence(words=line.split(), tags=['SENT_%s' % len(sentences)])
# sentences.append(ls)
# model.train([ls])
# print len(model.docvecs)
# print sentences[len(sentences)-1]
# get_similarity_by_index(len(sentences)-1,400)
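In [ ]:
# Possible alternative to extending the model (assumption: the installed gensim
# version already provides Doc2Vec.infer_vector, which newer releases do). It
# derives a vector for an unseen sentence without retraining, which can then be
# compared against the stored sentence vectors.
# new_words = 'Therefore , if you tell me the truth'.split()
# new_vec = model.infer_vector(new_words)
# print model.docvecs.most_similar([new_vec], topn=5)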
In [ ]:
# 1 get the voices out of the browser. js > python :)
display(HTML('''
<script>
voices = window.speechSynthesis.getVoices();
eng_voices = Array()
for(var i=0; i < voices.length; i++){
if(voices[i].lang == "en-US") {
eng_voices.push(voices[i].name);
}
}
console.log(eng_voices);
IPython.notebook.kernel.execute('voices=' + JSON.stringify(eng_voices));
</script>'''))
print voices
In [ ]:
pitch = 1.0
rate = 1.0
voice = 'Junior'
text = get_story(root_node)
var = 'var text= "'+' '.join(text) + '";'
html = '''
<script>
var synth = window.speechSynthesis;
var voices = synth.getVoices();
'''+ var +'''
var utterThis = new SpeechSynthesisUtterance(text);
var selectedOption = "'''+voice+'''"
for(i = 0; i < voices.length ; i++) {
if(voices[i].name === selectedOption) {
utterThis.voice = voices[i];
}
}
utterThis.pitch = '''+str(pitch)+''';
utterThis.rate = '''+str(rate)+''';
synth.speak(utterThis);
</script>
'''
display(HTML(html))
In [ ]:
search_word = 'motherfucker'
sentence_indices = []
le = len(sentences)
for index,sen in enumerate(sentences):
wl = sen[0]
for word in wl:
if word == search_word:
sentence_indices.append(index)
In [ ]:
print len(sentence_indices)
print(sentences[sentence_indices[2]])
print get_print(sentences[sentence_indices[2]])
In [ ]:
# word-analogy accuracy evaluation; needs the questions-words.txt test set
# (from the original word2vec distribution) in the working directory
model.accuracy('questions-words.txt')