In [1]:
%matplotlib inline
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
import inspire
In [2]:
# Get the evaluation setting
setting = inspire.get_evaluation_setting()
# Get the dataset
dataset_filename = setting.download_dataset()
dataset = inspire.load_dataset(dataset_filename)
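Before going further, it helps to peek at what load_dataset returned. The sketch below assumes it is a dictionary with a 'tokens' mapping, as the iteration later in this notebook relies on; it is purely exploratory.
In [ ]:
# Quick exploratory look at the dataset structure (assumes the
# dict layout used later in this notebook: a 'tokens' mapping)
print('Number of tokens:', len(dataset['tokens']))
example_id, example_token = next(iter(dataset['tokens'].items()))
print(example_id, '->', sorted(example_token.keys()))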
In [3]:
# Get the dataset audio
dataset_audio_filename = inspire.download_dataset_audio()

In [4]:
# Pick an example token and fetch its clean speech and noise tracks
token_id = '36504'
sample_rate, signal_audio, noise_audio = inspire.get_token_audio(token_id, dataset_audio_filename, dataset)
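Since the clean speech and the noise come as separate tracks, we can sanity-check the broadband SNR of this token with plain NumPy. This is a rough sketch; segment-wise power ratios are computed later in the notebook.
In [ ]:
# Broadband SNR of this token, from the separate speech and noise tracks
signal_power = np.mean(np.square(signal_audio))
noise_power = np.mean(np.square(noise_audio))
print('Broadband SNR: {:.1f} dB'.format(10 * np.log10(signal_power / noise_power)))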
In [5]:
# Plot the noise and clean speech waveforms for this token
plt.plot(noise_audio, label='noise')
plt.plot(signal_audio, label='speech')
plt.legend()
_ = plt.title('Audio of token {}'.format(token_id))
In [6]:
# The stimulus presented to listeners is the sum of the speech and noise tracks
mix_audio = signal_audio + noise_audio
In [7]:
plt.plot(mix_audio, label='mix')
plt.legend()
_ = plt.title('Audio of token {}'.format(token_id))
In [8]:
Audio(data=signal_audio, rate=sample_rate)
Out[8]:
In [9]:
Audio(data=noise_audio, rate=sample_rate)
Out[9]:
In [10]:
Audio(data=mix_audio, rate=sample_rate)
Out[10]:
In [11]:
# Get the lexicon
lexicon_filename = setting.download_lexicon()
lexicon = inspire.load_lexicon(lexicon_filename)
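The loop below assumes that each lexicon entry maps a word to a list of pronunciations, each a sequence of phonemes. A quick check on our example token:
In [ ]:
# Look up the pronunciations of the example token's word
word = dataset['tokens'][token_id]['speech']
print(word, '->', lexicon[word])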
In [12]:
submission = inspire.Submission(email='dummy@email.com',
                                description='''SNR-based Trivial Alignment:
where task: At the phoneme positions we predict a 30% chance of confusion scaled by (1 - SMR), where SMR is the segment's signal-to-mixture power ratio. We predict a 1% chance of confusion at the inter-phoneme positions and a 50% chance at the positions before and after the utterance.
what task: Not provided.
full task: Not provided.
''',
                                evaluation_setting=setting)
In [13]:
# Iterate over all the stimuli in our dataset
for token_id, token in dataset['tokens'].items():
    # Skip tokens for which no listener responses were collected
    if 'responses' not in token:
        continue

    # The lexicon may contain multiple pronunciations; we arbitrarily select the first
    word = token['speech']
    pronunciation = lexicon[word][0]

    # Confusions may occur at the phonemes themselves or at the positions
    # around them (before, between, and after), giving
    # 2 * len(pronunciation) + 1 possible indices
    index_count = len(pronunciation)*2 + 1
    confusion_probabilities = np.zeros(index_count)

    # Split the audio into index_count segments and compute the
    # signal-to-mixture power ratio (SMR) of each segment
    sample_rate, signal_audio, noise_audio = inspire.get_token_audio(token_id, dataset_audio_filename, dataset)
    signal_power = np.array([x.sum() for x in np.array_split(np.square(signal_audio), index_count)])
    mix_power = np.array([x.sum() for x in np.array_split(np.square(signal_audio) + np.square(noise_audio), index_count)])
    smr = signal_power / mix_power

    # Even indices are the positions around phonemes...
    confusion_probabilities[0] = 0.5        # position before the utterance
    confusion_probabilities[-1] = 0.5       # position after the utterance
    confusion_probabilities[2:-2:2] = 0.01  # inter-phoneme positions
    # ...and odd indices are the phonemes themselves: the lower the
    # segment's SMR, the higher the predicted confusion probability
    confusion_probabilities[1::2] = (1.0 - smr[1::2])*0.3

    submission.where_task(token_id, confusion_probabilities)
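To make the indexing above concrete, here is a toy illustration of the interleaved position layout, using a made-up three-phoneme pronunciation (purely illustrative, not part of the dataset):
In [ ]:
# Toy illustration of the confusion-position indexing:
# even indices are positions around phonemes, odd indices the phonemes
phonemes = ['m', 'a', 'n']
labels = ['|']            # index 0: position before the utterance
for ph in phonemes:
    labels.append(ph)     # odd index: the phoneme itself
    labels.append('|')    # even index: position after this phoneme
print(list(enumerate(labels)))  # 2 * 3 + 1 = 7 positions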
In [14]:
# Save the submission to disk and inspect our predictions for one token
submission.save('submission_snr_trivialalignment.json')
inspire.pprint(submission['tokens']['36504'])
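If you want to double-check what was written to disk, the file can be read back directly. This sketch assumes the .json extension reflects a plain JSON file with a dictionary at the top level.
In [ ]:
# Read the saved submission back (assuming it is plain JSON)
import json
with open('submission_snr_trivialalignment.json') as f:
    saved = json.load(f)
print(sorted(saved.keys()))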
In [ ]:
# Submit the predictions for remote evaluation (replace the credentials with your own)
job = submission.evaluate(password='dummypassword')
job.wait()
In [17]:
# Fetch the evaluation result and print the token-averaged score for the 'where' task
result = job.result()
inspire.pprint(result['where']['token_averaged'])
In [18]:
# Inspect the full dataset entry for our example token
inspire.pprint(dataset['tokens']['36504'])
In [19]:
# Edit scripts describe how one pronunciation can be transformed into another
edit_scripts = inspire.get_edit_scripts(lexicon['mandan'][0], lexicon['manos'][0])
for edit_script in edit_scripts:
    inspire.print_edit_script(edit_script)
    print('---')
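For intuition about what an edit script contains, here is a minimal Levenshtein-style alignment sketch between two symbol sequences. This is not the inspire implementation (which may, for example, enumerate all optimal alignments, as the plural get_edit_scripts suggests); it recovers just one optimal path, using plain characters as stand-in phonemes.
In [ ]:
def one_edit_script(src, dst):
    """Return one optimal list of (operation, src_symbol, dst_symbol) steps."""
    n, m = len(src), len(dst)
    # cost[i][j] = edit distance between src[:i] and dst[:j]
    cost = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        cost[i][0] = i
    for j in range(m + 1):
        cost[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            sub = cost[i - 1][j - 1] + (src[i - 1] != dst[j - 1])
            cost[i][j] = min(sub, cost[i - 1][j] + 1, cost[i][j - 1] + 1)
    # Trace back one optimal path from the bottom-right corner
    script, i, j = [], n, m
    while i > 0 or j > 0:
        if i > 0 and j > 0 and cost[i][j] == cost[i - 1][j - 1] + (src[i - 1] != dst[j - 1]):
            op = 'match' if src[i - 1] == dst[j - 1] else 'substitute'
            script.append((op, src[i - 1], dst[j - 1]))
            i, j = i - 1, j - 1
        elif i > 0 and cost[i][j] == cost[i - 1][j] + 1:
            script.append(('delete', src[i - 1], None))
            i -= 1
        else:
            script.append(('insert', None, dst[j - 1]))
            j -= 1
    return script[::-1]

for step in one_edit_script(list('mandan'), list('manos')):
    print(step)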