In [ ]:
import codecs
import re


def create_vocabulary(data):
    # Count symbol occurrences, then assign ids 1..N in sorted symbol
    # order; id 0 is left free for the padding symbol used later.
    vocab = {}
    for line in data:
        for item in line:
            if item in vocab:
                vocab[item] += 1
            else:
                vocab[item] = 1
    vocab_list = sorted(vocab)
    vocab = {symbol: idx + 1 for idx, symbol in enumerate(vocab_list)}
    return vocab


def symbols_to_ids(symbols, vocab):
    # Unknown symbols fall back to 0 (the padding id) rather than None,
    # which would otherwise break padding and one-hot encoding downstream.
    ids = [vocab.get(s, 0) for s in symbols]
    return ids


def split_to_grapheme_phoneme(inp_dictionary):
    # Each line is 'word PH1_PH2_...' (underscores or spaces between
    # phonemes); the word becomes a character list, the rest phoneme symbols.
    graphemes, phonemes = [], []
    for line in inp_dictionary:
        split_line = re.split('[ _]', line.strip())
        if len(split_line) > 1:
            graphemes.append(list(split_line[0]))
            phonemes.append(split_line[1:])
    return graphemes, phonemes


def collect_pronunciations(dic_lines):
    # Group all pronunciation variants of each word under one key.
    dic = {}
    for line in dic_lines:
        lst = line.strip().split()
        if len(lst) > 1:
            dic.setdefault(lst[0], []).append(" ".join(lst[1:]))
    return dic


def split_dictionary(train_path, valid_path=None, test_path=None):
    with codecs.open(train_path, "r", "utf-8") as f:
        source_dic = f.readlines()
    train_dic, valid_dic, test_dic = [], [], []
    if valid_path:
        with codecs.open(valid_path, "r", "utf-8") as f:
            valid_dic = f.readlines()
    if test_path:
        with codecs.open(test_path, "r", "utf-8") as f:
            test_dic = f.readlines()

    dic = collect_pronunciations(source_dic)

    # Flatten back to one 'word pronunciation' line per variant.
    for word, pronunciations in dic.items():
        for pronunciation in pronunciations:
            train_dic.append(word + ' ' + pronunciation)
    return train_dic, valid_dic, test_dic


def prepare_g2p_data(train_path, valid_path, test_path):
    train_dic, valid_dic, test_dic = split_dictionary(train_path, valid_path, test_path)
    train_gr, train_ph = split_to_grapheme_phoneme(train_dic)
    valid_gr, valid_ph = split_to_grapheme_phoneme(valid_dic)

    # Vocabularies are built from the training data only.
    ph_vocab = create_vocabulary(train_ph)
    gr_vocab = create_vocabulary(train_gr)

    train_ph_ids = [symbols_to_ids(line, ph_vocab) for line in train_ph]
    train_gr_ids = [symbols_to_ids(line, gr_vocab) for line in train_gr]
    valid_ph_ids = [symbols_to_ids(line, ph_vocab) for line in valid_ph]
    valid_gr_ids = [symbols_to_ids(line, gr_vocab) for line in valid_gr]

    return train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab, test_dic
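
A quick sanity check of the helpers above on a toy dictionary (the two entries below are made up for illustration): ids start at 1, leaving 0 free for padding.

In [ ]:
toy_lines = ['cat K_AE_T', 'at AE_T']          # hypothetical entries
toy_gr, toy_ph = split_to_grapheme_phoneme(toy_lines)
toy_vocab = create_vocabulary(toy_ph)
print(toy_vocab)                               # {'AE': 1, 'K': 2, 'T': 3}
print(symbols_to_ids(toy_ph[0], toy_vocab))    # [2, 1, 3]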

In [ ]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.layers import Dense, Activation, Bidirectional
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard, EarlyStopping
from keras.utils.np_utils import to_categorical
import tensorflow as tf
import numpy as np
import random

# Fix the seeds for reproducibility (Keras 2.x on the TF 1.x backend).
np.random.seed(100)
random.seed(100)
tf.set_random_seed(100)

In [ ]:
train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab, test_lines = \
    prepare_g2p_data('train1.txt', None, 'test.csv')
# Reverse mapping id -> phoneme, used to decode predictions back to symbols.
ph_rev_vocab = dict((idx, sym) for (sym, idx) in ph_vocab.items())

In [ ]:
test = []
for line in test_lines:
    # Skip the CSV header row; column 1 holds the word to transcribe.
    if not line.startswith('Id'):
        test.append(symbols_to_ids(line.strip().split(',')[1], gr_vocab))

In [ ]:
# Pad grapheme ids to the longest training word; phoneme targets are padded
# (or truncated) to the same length, so the model predicts one phoneme
# class per input timestep.
padded_tr_gr_ids = sequence.pad_sequences(train_gr_ids, padding='post', truncating='post')
num_timesteps = padded_tr_gr_ids.shape[1]
padded_tr_ph_ids = sequence.pad_sequences(train_ph_ids, maxlen=num_timesteps, padding='post', truncating='post')

inp_voc_size = len(gr_vocab)
outp_voc_size = len(ph_vocab)
test_padded = sequence.pad_sequences(test, maxlen=num_timesteps, padding='post', truncating='post')
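
To see how the 'post' padding/truncation behaves, here is a minimal example with made-up id sequences:

In [ ]:
print(sequence.pad_sequences([[2, 1, 3], [1, 3]], maxlen=4, padding='post', truncating='post'))
# [[2 1 3 0]
#  [1 3 0 0]]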

In [ ]:
# One-hot encode the id matrices; this Keras version's to_categorical
# flattens its input, hence the reshape back to (samples, timesteps, vocab+1).
tr_gr = to_categorical(padded_tr_gr_ids, num_classes=inp_voc_size + 1).reshape(padded_tr_gr_ids.shape[0], padded_tr_gr_ids.shape[1], inp_voc_size + 1)
tr_ph = to_categorical(padded_tr_ph_ids, num_classes=outp_voc_size + 1).reshape(padded_tr_ph_ids.shape[0], padded_tr_ph_ids.shape[1], outp_voc_size + 1)
ts_gr = to_categorical(test_padded, num_classes=inp_voc_size + 1).reshape(test_padded.shape[0], test_padded.shape[1], inp_voc_size + 1)
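
A quick shape check: inputs and targets should both be (samples, timesteps, vocab_size + 1), with index 0 reserved for padding.

In [ ]:
print(tr_gr.shape, tr_ph.shape, ts_gr.shape)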

In [ ]:
# Zero out channel 0 (the padding id) in the inputs so padded timesteps
# become all-zero vectors instead of one-hot 'padding' vectors.
tr_gr[:, :, 0] = 0
ts_gr[:, :, 0] = 0
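
This can be verified directly: rows now sum to 1.0 at real timesteps and to 0.0 at padded ones.

In [ ]:
print(tr_gr.sum(axis=-1).min(), tr_gr.sum(axis=-1).max())  # 0.0 1.0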

In [ ]:
def tr(a):
    # Build a filesystem-friendly TensorBoard log-dir name from a params
    # dict: each key/value is truncated to 10 characters, nested dicts are
    # serialized recursively, and the pairs are sorted and comma-joined.
    return ','.join(sorted('='.join(map(lambda x: str(x)[:10] if type(x) != dict else tr(x)[:10],
                                        item)) for item in a.items()))
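
On a small hypothetical params dict the resulting log-dir name looks like this:

In [ ]:
print(tr({'unit': 'LSTM', 'dropout_1': 0.05123456789, 'opt': {'type': 'adam'}}))
# dropout_1=0.05123456,opt=type=adam,unit=LSTM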

In [ ]:
def funct(params, i=None, save=False):
    print('Params testing:\n', params)
    model = Sequential()

    if params['unit'] == 'LSTM':
        unit = LSTM
    else:
        unit = GRU

    # Bidirectional recurrent encoder over the one-hot grapheme frames.
    model.add(Bidirectional(unit(params['num_1'], recurrent_dropout=params['dropout_1'],
                                 return_sequences=True, implementation=2),
                            merge_mode=params['merge_mode'],
                            input_shape=(num_timesteps, inp_voc_size + 1)))

    for _ in range(params['enc_layers']):
        model.add(unit(params['num_2'], recurrent_dropout=params['dropout_2'],
                       return_sequences=True, implementation=2))

    # Per-timestep softmax over the phoneme vocabulary (+1 for padding).
    model.add(Dense(outp_voc_size + 1))
    model.add(Activation('softmax'))

    if params['opt']['type'] == 'adadelta':
        opt = keras.optimizers.Adadelta()
    elif params['opt']['type'] == 'adam':
        opt = keras.optimizers.Adam()
    else:
        opt = keras.optimizers.RMSprop(lr=params['opt']['learning_rate'])
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    model.fit(tr_gr, tr_ph, batch_size=params['batch_size'], epochs=100, verbose=2,
              callbacks=[EarlyStopping(min_delta=0.0001, monitor='loss', patience=5),
                         TensorBoard(log_dir='./logs/' + tr(params))])

    # Evaluate on the training set so loss/score are defined on both paths.
    loss, score = model.evaluate(tr_gr, tr_ph, verbose=2)
    if save:
        pred = model.predict_classes(ts_gr, verbose=2)
        # Drop padding ids (0), then decode phoneme ids back to symbols.
        res = ["_".join([ph_rev_vocab[elem] for elem in sym[sym != 0]]) for sym in pred]
        with open('hopt_pred' + str(i) + '.csv', 'w+') as outp:
            outp.write("Id,Transcription\n")
            for idx, word in enumerate(res):
                outp.write(str(idx + 1) + ',' + word + '\n')
    return {'loss': loss, 'status': STATUS_OK, 'score': score}

In [ ]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval

space = {
    'enc_layers': hp.randint('enc_layers', 5),
    # Note: 'dec_layers' is sampled but not consumed by funct above.
    'dec_layers': hp.randint('dec_layers', 5),
    'merge_mode': hp.choice('merge_mode', ['concat', 'ave', 'sum']),
    'unit': hp.choice('unit', ['LSTM', 'GRU']),
    'num_1': hp.choice('num_1', [128, 256, 512, 1024]),
    'dropout_1': hp.uniform('dropout_1', 0.01, 0.1),
    'num_2': hp.choice('num_2', [128, 256, 512, 1024]),
    'dropout_2': hp.uniform('dropout_2', 0.01, 0.1),
    'batch_size': hp.choice('batch_size', [64, 128, 256, 512, 1024]),
    'opt': hp.choice('opt', [
        {'type': 'adadelta'},
        {'type': 'adam'},
        {'type': 'rmsprop', 'learning_rate': hp.uniform('learning_rate', 0.0001, 0.001)}
    ])
}
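
Before launching the search, it can help to draw one random configuration from the space to check that it evaluates cleanly:

In [ ]:
from hyperopt.pyll.stochastic import sample
print(sample(space))  # one random point from the search space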

In [ ]:
trials = Trials()
best = fmin(funct, space, algo=tpe.suggest, trials=trials, max_evals=50)
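
fmin returns the best point with hp.choice parameters encoded as indices; space_eval maps them back to the actual values:

In [ ]:
print(space_eval(space, best))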

In [ ]:
# Sort the completed trials by loss, ascending.
trials1 = list(trials)
trials1.sort(key=lambda x: x['result']['loss'])

In [ ]:
# Decode the ten best trials back into concrete parameter dicts
# (the loop variable is named 'trial' to avoid shadowing the tr() helper).
best_params = list(map(lambda x: space_eval(space, x),
                       [{key: value[0] for key, value in trial['misc']['vals'].items() if len(value) > 0}
                        for trial in trials1[:10]]))

In [ ]:
# Retrain the best configurations and write out their test predictions.
for i, params in enumerate(best_params):
    funct(params, i=i, save=True)