In [ ]:
import codecs, re, os

def create_vocabulary(data):
    """Count symbol frequencies, then assign ids 1..N in sorted order
    (id 0 is left free for padding)."""
    vocab = {}
    for line in data:
        for item in line:
            if item in vocab:
                vocab[item] += 1
            else:
                vocab[item] = 1
    vocab_list = sorted(vocab)
    vocab = dict([(x, y + 1) for (y, x) in enumerate(vocab_list)])
    return vocab

def symbols_to_ids(symbols, vocab):
    # Symbols unseen at vocabulary-creation time map to None.
    ids = [vocab.get(s) for s in symbols]
    return ids

def split_to_grapheme_phoneme(inp_dictionary):
    """Split each 'word PH1 PH2 ...' line into a grapheme list and a phoneme list."""
    graphemes, phonemes = [], []
    for line in inp_dictionary:
        split_line = re.split('[ _]', line.strip())
        if len(split_line) > 1:
            graphemes.append(list(split_line[0]))
            phonemes.append(split_line[1:])
    return graphemes, phonemes

def collect_pronunciations(dic_lines):
    """Group all pronunciations of the same word under one key."""
    dic = {}
    for line in dic_lines:
        lst = line.strip().split()
        if len(lst) > 1:
            if lst[0] not in dic:
                dic[lst[0]] = [" ".join(lst[1:])]
            else:
                dic[lst[0]].append(" ".join(lst[1:]))
    return dic

def split_dictionary(train_path, valid_path=None, test_path=None):
    source_dic = codecs.open(train_path, "r", "utf-8").readlines()
    train_dic, valid_dic, test_dic = [], [], []
    if valid_path:
        valid_dic = codecs.open(valid_path, "r", "utf-8").readlines()
    if test_path:
        test_dic = codecs.open(test_path, "r", "utf-8").readlines()
    dic = collect_pronunciations(source_dic)
    for word in dic:
        for pronunciation in dic[word]:
            train_dic.append(word + ' ' + pronunciation)
    return train_dic, valid_dic, test_dic

def prepare_g2p_data(train_path, valid_path, test_path):
    train_dic, valid_dic, test_dic = split_dictionary(train_path, valid_path, test_path)
    train_gr, train_ph = split_to_grapheme_phoneme(train_dic)
    valid_gr, valid_ph = split_to_grapheme_phoneme(valid_dic)
    # Vocabularies are built from the training set only.
    ph_vocab = create_vocabulary(train_ph)
    gr_vocab = create_vocabulary(train_gr)
    train_ph_ids = [symbols_to_ids(line, ph_vocab) for line in train_ph]
    train_gr_ids = [symbols_to_ids(line, gr_vocab) for line in train_gr]
    valid_ph_ids = [symbols_to_ids(line, ph_vocab) for line in valid_ph]
    valid_gr_ids = [symbols_to_ids(line, gr_vocab) for line in valid_gr]
    return train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab, test_dic
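As a quick sanity check, these helpers behave as follows on a toy dictionary (the words and phoneme strings below are invented for illustration; ids start at 1 because 0 is reserved for padding):
In [ ]:
toy_lines = ['cat K AE T', 'cat K AH T', 'dog D AO G']
print(collect_pronunciations(toy_lines))       # {'cat': ['K AE T', 'K AH T'], 'dog': ['D AO G']}
toy_gr, toy_ph = split_to_grapheme_phoneme(toy_lines)
toy_vocab = create_vocabulary(toy_gr)
print(toy_vocab)                               # {'a': 1, 'c': 2, 'd': 3, 'g': 4, 'o': 5, 't': 6}
print(symbols_to_ids(list('cat'), toy_vocab))  # [2, 1, 6]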
In [ ]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.layers import Dense, Activation, Masking, Embedding, Bidirectional, BatchNormalization
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard, EarlyStopping
from keras.utils.np_utils import to_categorical
import tensorflow as tf
import numpy as np
import random

# Fix all seeds for reproducibility (tf.set_random_seed is the TF 1.x API;
# TF 2.x renamed it to tf.random.set_seed).
np.random.seed(100)
random.seed(100)
tf.set_random_seed(100)
In [ ]:
train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab, test_lines = \
    prepare_g2p_data('train1.txt', None, 'test.csv')
# Reverse mapping: phoneme id -> phoneme symbol, used to decode predictions.
ph_rev_vocab = dict((x, y) for (y, x) in ph_vocab.items())
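A quick round-trip check (this assumes the `train1.txt` loaded above is non-empty): the first training word's phoneme ids should decode back to symbols through `ph_rev_vocab`:
In [ ]:
sample_ids = train_ph_ids[0]
print(sample_ids, '->', [ph_rev_vocab[i] for i in sample_ids])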
In [ ]:
# Parse the test CSV ("Id,word" lines), skipping the header row.
test = []
for line in test_lines:
    if not line.startswith('Id'):
        test.append(symbols_to_ids(line.strip().split(',')[1], gr_vocab))
In [ ]:
padded_tr_gr_ids = sequence.pad_sequences(train_gr_ids, padding='post', truncating='post')
num_timesteps = padded_tr_gr_ids.shape[1]
# Pad targets to the same length so the model maps timestep i -> phoneme i.
padded_tr_ph_ids = sequence.pad_sequences(train_ph_ids, maxlen=num_timesteps, padding='post', truncating='post')
inp_voc_size = len(gr_vocab)
outp_voc_size = len(ph_vocab)
test_padded = sequence.pad_sequences(test, maxlen=num_timesteps, padding='post', truncating='post')
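A shape check: inputs and targets are padded to the same `num_timesteps`, since the network below is trained as a per-timestep classifier:
In [ ]:
print('train graphemes:', padded_tr_gr_ids.shape)  # (num_words, num_timesteps)
print('train phonemes: ', padded_tr_ph_ids.shape)  # same length as the graphemes
print('test graphemes: ', test_padded.shape)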
In [ ]:
# One-hot encode; older Keras versions of to_categorical flatten the batch,
# hence the explicit reshape back to (samples, timesteps, classes).
tr_gr = to_categorical(padded_tr_gr_ids, num_classes=inp_voc_size + 1).reshape(
    padded_tr_gr_ids.shape[0], padded_tr_gr_ids.shape[1], inp_voc_size + 1)
tr_ph = to_categorical(padded_tr_ph_ids, num_classes=outp_voc_size + 1).reshape(
    padded_tr_ph_ids.shape[0], padded_tr_ph_ids.shape[1], outp_voc_size + 1)
ts_gr = to_categorical(test_padded, num_classes=inp_voc_size + 1).reshape(
    test_padded.shape[0], test_padded.shape[1], inp_voc_size + 1)
In [ ]:
# Id 0 is padding; zero out its one-hot channel so padded timesteps
# become all-zero vectors in both the train and test inputs.
tr_gr[:, :, 0] = 0
ts_gr[:, :, 0] = 0
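With channel 0 zeroed, every padded timestep becomes an all-zero vector rather than a one-hot "padding symbol", which is easy to verify:
In [ ]:
assert not tr_gr[:, :, 0].any() and not ts_gr[:, :, 0].any()
print('all-zero (padded) timesteps in train:', int((~tr_gr.any(axis=-1)).sum()))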
In [ ]:
def tr(a):
    """Flatten a (possibly nested) params dict into a sorted, comma-separated
    string; used below as a TensorBoard log-dir name."""
    return ','.join(sorted('='.join(str(x)[:10] if not isinstance(x, dict) else tr(x)[:10]
                                    for x in item)
                           for item in a.items()))
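For example, `tr` turns a nested params dict into a short, sorted run name (the values here are invented):
In [ ]:
example_params = {'unit': 'LSTM', 'num_1': 256, 'dropout_1': 0.05,
                  'opt': {'type': 'adam'}}
print(tr(example_params))  # dropout_1=0.05,num_1=256,opt=type=adam,unit=LSTM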
In [ ]:
def funct(params, i=None, save=False):
    print('Params testing:\n', params, end='\n')
    model = Sequential()
    if params['unit'] == 'LSTM':
        unit = LSTM
    else:
        unit = GRU
    # Bidirectional encoder over the one-hot grapheme sequence.
    model.add(Bidirectional(unit(params['num_1'], recurrent_dropout=params['dropout_1'],
                                 return_sequences=True, implementation=2),
                            merge_mode=params['merge_mode'],
                            input_shape=(num_timesteps, inp_voc_size + 1)))
    # Stacked recurrent layers on top (note: params['dec_layers'] is sampled
    # by hyperopt below but not used by this architecture).
    for _ in range(params['enc_layers']):
        model.add(unit(params['num_2'], recurrent_dropout=params['dropout_2'],
                       return_sequences=True, implementation=2))
    # Per-timestep softmax over the phoneme vocabulary (+1 for padding).
    model.add(Dense(outp_voc_size + 1))
    model.add(Activation('softmax'))
    if params['opt']['type'] == 'adadelta':
        opt = keras.optimizers.Adadelta()
    elif params['opt']['type'] == 'adam':
        opt = keras.optimizers.Adam()
    else:
        opt = keras.optimizers.RMSprop(lr=params['opt']['learning_rate'])
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    model.fit(tr_gr, tr_ph, batch_size=params['batch_size'], epochs=100, verbose=2,
              callbacks=[EarlyStopping(min_delta=0.0001, monitor='loss', patience=5),
                         TensorBoard(log_dir='./logs/' + tr(params))])
    if save:
        pred = model.predict_classes(ts_gr, verbose=2)
        # Drop padding ids (0) and map the rest back to phoneme symbols.
        res = ["_".join([ph_rev_vocab[elem] for elem in sym[sym != 0]]) for sym in pred]
        with open('hopt_pred' + str(i) + '.csv', 'w+') as outp:
            outp.write("Id,Transcription\n")
            for idx, word in enumerate(res):
                outp.write(str(idx + 1) + ',' + word + '\n')
    else:
        loss, score = model.evaluate(tr_gr, tr_ph, verbose=2)
        # STATUS_OK is provided by the hyperopt import in the next cell.
        return {'loss': loss, 'status': STATUS_OK, 'score': score}
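`funct` can also be called by hand with a fully specified params dict of the same shape hyperopt will sample below (all values here are invented):
In [ ]:
manual_params = {'unit': 'GRU', 'merge_mode': 'concat',
                 'num_1': 256, 'dropout_1': 0.05,
                 'num_2': 256, 'dropout_2': 0.05,
                 'enc_layers': 1, 'dec_layers': 1,
                 'batch_size': 256,
                 'opt': {'type': 'rmsprop', 'learning_rate': 0.0005}}
# funct(manual_params)  # uncomment to train a single model with these settings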
In [ ]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval

space = {
    'enc_layers': hp.randint('enc_layers', 5),
    'dec_layers': hp.randint('dec_layers', 5),
    'merge_mode': hp.choice('merge_mode', ['concat', 'ave', 'sum']),
    'unit': hp.choice('unit', ['LSTM', 'GRU']),
    'num_1': hp.choice('num_1', [128, 256, 512, 1024]),
    'dropout_1': hp.uniform('dropout_1', 0.01, 0.1),
    'num_2': hp.choice('num_2', [128, 256, 512, 1024]),
    'dropout_2': hp.uniform('dropout_2', 0.01, 0.1),
    'batch_size': hp.choice('batch_size', [64, 128, 256, 512, 1024]),
    'opt': hp.choice('opt', [
        {'type': 'adadelta'},
        {'type': 'adam'},
        {'type': 'rmsprop', 'learning_rate': hp.uniform('learning_rate', 0.0001, 0.001)}
    ])
}
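To see what a single configuration drawn from this space looks like, hyperopt's stochastic sampler can be used (output varies from run to run):
In [ ]:
from hyperopt.pyll.stochastic import sample
print(sample(space))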
In [ ]:
trials = Trials()
best = fmin(funct, space, algo=tpe.suggest, trials=trials, max_evals=50)
In [ ]:
# Trials is iterable over per-trial dicts; sort by the training loss funct returned.
trials1 = list(trials)
trials1.sort(key=lambda x: x['result']['loss'])
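The head of the sorted list is the best trial found; its result dict carries the keys `funct` returned:
In [ ]:
print('best training loss:', trials1[0]['result']['loss'])
print('best accuracy:     ', trials1[0]['result']['score'])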
In [ ]:
# Recover the concrete settings of the 10 best trials; the loop variable is
# named `t` so it does not shadow the tr() helper defined above.
best_params = list(map(lambda x: space_eval(space, x),
                       [{key: value[0] for key, value in t['misc']['vals'].items() if len(value) > 0}
                        for t in trials1[:10]]))
In [ ]:
# Retrain the top configurations and write a prediction file for each.
for i, params in enumerate(best_params):
    funct(params, i=i, save=True)