Copyright 2019 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Google Cloud API を REST インタフェースから利用するために、 API Key を利用します。 Google Cloud Console から API Key をコピーしましょう。
In [0]:
# Prompt for the Google Cloud API key without echoing it to the screen.
from getpass import getpass

APIKEY = getpass()
In [0]:
#@title このセルを実行して record_audio を定義
# Install required libraries and packages
!pip install -qq pydub
!apt-get -qq update
!apt-get -qq install -y ffmpeg
# Define record_audio
import base64
import google.colab
import pydub
from io import BytesIO
def record_audio(file_id, framerate=16000, file_format='flac'):
    """Record audio from the browser in Colab and save it locally.

    Args:
        file_id: Base name (without extension) for the saved files.
        framerate: Target sample rate in Hz for the exported file.
        file_format: Format/extension for the exported file (e.g. 'flac').

    Returns:
        The name of the exported audio file: '<file_id>.<file_format>'.
    """
    # Record a webm clip from the browser.
    # NOTE(review): google.colab._message is an undocumented internal API.
    audio = google.colab._message.blocking_request(
        'user_media',
        {
            'audio': True,
            'video': False,
            'duration': -1
        },
        timeout_sec=600)
    # The response is a data URL; drop the 'data:...;base64,' header and
    # decode the base64 payload into an in-memory file.
    mfile = BytesIO(base64.b64decode(audio[audio.index(',')+1:]))
    # Store the webm file locally.
    with open('{0}.webm'.format(file_id), 'wb') as f:
        mfile.seek(0)
        f.write(mfile.read())
    # Re-open the stored webm file and export it at the requested rate.
    output_file = '{0}.{1}'.format(file_id, file_format)
    segment = pydub.AudioSegment.from_file(
        '{0}.webm'.format(file_id), codec='opus')
    # Bug fix: honor the `framerate` parameter instead of hard-coding 16000.
    segment.set_frame_rate(framerate).export(output_file, format=file_format)
    return output_file
record_audio を実行して音声を録音します。
試しに 「昨日は東京タワーに行きました。楽しかったです。」と録音してみてください。
In [0]:
# Bug fix: keep the filename returned by record_audio instead of discarding
# it and re-typing the literal — the two must stay in sync.
# record_audio('ja-sample') returns 'ja-sample.flac' (default format).
audio_filename = record_audio('ja-sample')
録音した音声を確認します。
In [0]:
import IPython.display

# Play back the recorded clip at the 16 kHz rate it was exported with.
IPython.display.Audio(audio_filename, rate=16000)
In [0]:
import base64
from googleapiclient.discovery import build

# Build a client for the Cloud Speech-to-Text API.
speech_service = build('speech', 'v1', developerKey=APIKEY)
# Base64-encode the recorded audio. Consistency fix: use audio_filename
# (defined when the clip was recorded) instead of repeating the literal
# 'ja-sample.flac'.
with open(audio_filename, 'rb') as audio:
    audio_content = audio.read()
audio_content_b64 = base64.b64encode(audio_content)
_audio = {
    # Convert bytes to str so the payload is JSON-serializable.
    'content': audio_content_b64.decode(),
}
_recognition_config = {
    'encoding': 'FLAC',        # audio codec of the uploaded clip
    'sampleRateHertz': 16000,  # sample rate the clip was exported at
    'languageCode': 'ja-JP',   # spoken language
    'model': 'default',        # recognition model
}
_request_body = {
    'audio': _audio,
    'config': _recognition_config,
}
# Send the recognition request and print the top transcript.
response = speech_service.speech().recognize(body=_request_body).execute()
source_text = response['results'][0]['alternatives'][0]['transcript']
print(source_text)
In [0]:
# Inspect the full Speech-to-Text response.
from pprint import pprint

pprint(response)
レスポンスから抽出した日本語のテキストを、Cloud Translation API で英語に翻訳します。
In [0]:
from googleapiclient.discovery import build

# Build a Cloud Translation (v2) client with the same API key.
translate_service = build('translate', 'v2', developerKey=APIKEY)
# Translate the transcribed Japanese text into English using the NMT model.
request = translate_service.translations().list(
    q=source_text,
    source='ja',
    target='en',
    model='nmt',
    format='text')
response = request.execute()
# The API returns a list of translations; keep the first (only) one.
target_text = response['translations'][0]['translatedText']
In [0]:
# Dump the raw Translation API response for inspection.
import pprint
pprint.pprint(response)
Cloud Natural Language API に入力する情報を定義します。
In [0]:
from googleapiclient.discovery import build

# Build a Cloud Natural Language client with the same API key.
language_service = build('language', 'v1', developerKey=APIKEY)
# The translated English text to analyze.
_document = dict(content=target_text, type='PLAIN_TEXT', language='en')
# UTF8 encoding so entity mention offsets are byte-accurate.
_body = dict(document=_document, encodingType='UTF8')
# Run entity analysis on the translated sentence.
response = language_service.documents().analyzeEntities(body=_body).execute()
In [0]:
# Dump the raw Natural Language API response for inspection.
import pprint
pprint.pprint(response)
レスポンスの中から固有名詞を抽出します。
In [0]:
# Collect the surface text of every mention the entity analysis
# classified as a proper noun.
proper_nouns = [
    mention['text']['content']
    for entity in response['entities']
    for mention in entity['mentions']
    if mention['type'] == 'PROPER'
]
print(proper_nouns)
In [0]:
from googleapiclient.discovery import build

# Build a Cloud Text-to-Speech client with the same API key.
texttospeech_service = build('texttospeech', 'v1beta1', developerKey=APIKEY)
# Plain-text input: the English translation.
_input = dict(text=target_text)
# An English WaveNet voice.
_voice = dict(languageCode='en-US', name='en-US-Wavenet-D')
# Return the synthesized audio as MP3.
_audio_config = dict(audioEncoding='MP3')
_body = dict(input=_input, voice=_voice, audioConfig=_audio_config)
# Synthesize speech for the translated sentence.
response = texttospeech_service.text().synthesize(body=_body).execute()
In [0]:
import base64
from IPython.display import Audio

# The API returns the MP3 bytes base64-encoded; decode before playback.
audio_bytes = base64.b64decode(response['audioContent'])
Audio(audio_bytes)
In [0]:
# Wrap the translated sentence in SSML and mark up each proper noun so it
# is spoken slowly and loudly.
en_ssml = """<speak>{0}</speak>""".format(target_text)
markup = '<prosody rate="slow" volume="loud">{0}</prosody>'
for noun in proper_nouns:
    en_ssml = en_ssml.replace(noun, markup.format(noun))
print(en_ssml)
In [0]:
# Bug fix: source_language, voice_gender, voice_name and audio_encoding were
# never defined anywhere in this notebook, so this cell raised NameError.
# Reuse the same English voice settings as the plain-text synthesis above
# (the SSML content is the English translation, so an en-US voice applies).
voice_language = 'en-US'
voice_name = 'en-US-Wavenet-D'
audio_encoding = 'MP3'
# Synthesize the SSML version, with proper nouns slowed and emphasized.
response = texttospeech_service.text().synthesize(
    body={
        'input': {
            'ssml': en_ssml,
        },
        'voice': {
            'languageCode': voice_language,
            'name': voice_name,
        },
        'audioConfig': {
            'audioEncoding': audio_encoding,
        },
    }
).execute()
In [0]:
# Play back the SSML-based synthesis result for comparison.
import base64
from IPython.display import Audio
Audio(base64.b64decode(response['audioContent']))