Copyright 2017 Google LLC.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Updated by Martin Andrews (mdda @ GitHub) to include cleaner file upload/download code. Repo is MIT licensed.
Onsets and Frames is an automatic piano music transcription model. This notebook demonstrates running the model on user-supplied recordings. For more details on the model's architecture, see our arXiv paper.
This Colab notebook is self-contained and should run natively on Google Cloud. The code and checkpoints can also be downloaded and run locally, which is recommended if you want to train your own model. Details on how to do this can be found in the GitHub repo.
In [ ]:
#@title Setup Environment
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
print('Copying checkpoint from GCS...')
!rm -rf /content/onsets-frames
!mkdir /content/onsets-frames
!gsutil -q -m cp -R gs://magentadata/models/onsets_frames_transcription/* /content/onsets-frames/
!unzip -o /content/onsets-frames/checkpoint.zip -d /content/onsets-frames
CHECKPOINT_DIR = '/content/onsets-frames/train'
print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev ffmpeg
!pip install pyfluidsynth pretty_midi
if glob.glob('/content/onsets-frames/magenta*.whl'):
  !pip install -q /content/onsets-frames/magenta*.whl
else:
  !pip install -q magenta
# Hack to allow python to pick up the newly-installed fluidsynth lib.
import ctypes.util
orig_find_library = ctypes.util.find_library
def proxy_find_library(lib):
  if lib == 'fluidsynth':
    return 'libfluidsynth.so.1'
  else:
    return orig_find_library(lib)
ctypes.util.find_library = proxy_find_library
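If the proxy is in place, pyfluidsynth should now import cleanly. This optional sanity check is not part of the original notebook:
In [ ]:
# Optional: confirm the find_library proxy and the pyfluidsynth import work.
import fluidsynth  # fails if libfluidsynth still cannot be located
print(ctypes.util.find_library('fluidsynth'))  # expect: libfluidsynth.so.1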
In [ ]:
import tensorflow as tf
import librosa
import numpy as np
from google.colab import files
from magenta.common import tf_utils
from magenta.music import audio_io
import magenta.music as mm
from magenta.models.onsets_frames_transcription import model
from magenta.models.onsets_frames_transcription import constants
from magenta.models.onsets_frames_transcription import data
from magenta.models.onsets_frames_transcription import infer_util
from magenta.music import midi_io
from magenta.protobuf import music_pb2
## Define model and load checkpoint
## Only needs to be run once.
acoustic_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
print('acoustic_checkpoint=' + acoustic_checkpoint)
hparams = tf_utils.merge_hparams(
    constants.DEFAULT_HPARAMS, model.get_default_hparams())
with tf.Graph().as_default():
  examples = tf.placeholder(tf.string, [None])
  num_dims = constants.MIDI_PITCHES

  batch, iterator = data.provide_batch(
      batch_size=1,
      examples=examples,
      hparams=hparams,
      is_training=False,
      truncated_length=0)

  model.get_model(batch, hparams, is_training=False)

  session = tf.Session()
  saver = tf.train.Saver()
  saver.restore(session, acoustic_checkpoint)

  onset_probs_flat = tf.get_default_graph().get_tensor_by_name(
      'onsets/onset_probs_flat:0')
  frame_probs_flat = tf.get_default_graph().get_tensor_by_name(
      'frame_probs_flat:0')
  velocity_values_flat = tf.get_default_graph().get_tensor_by_name(
      'velocity/velocity_values_flat:0')
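The three tensors fetched above are the model's per-frame outputs over the 88 piano keys (constants.MIDI_PITCHES): onset probabilities, frame (note-active) probabilities, and velocity estimates. An optional shape check, not in the original notebook:
In [ ]:
# Each output has static shape [num_frames, 88]; num_frames is unknown
# until audio is actually fed through the pipeline.
print(onset_probs_flat.shape)
print(frame_probs_flat.shape)
print(velocity_values_flat.shape)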
One key part of the transcription is having fairly clean audio files as input. To handle this conveniently, we create two folders for audio: ./orig and ./audio. The ./orig folder is where new files arrive (either uploaded from your machine, or extracted from YouTube videos).
Then, using the ! cp ./orig/XYZ ./audio/ cell below, you can pick the individual audio file you want to experiment with, without having to repeat the upload steps (and you can easily switch back and forth too).
The original Magenta code has been modified to read the files from the ./audio folder.
Run the following cells to clear the working folder, create the folders if necessary, and upload audio files.
In [ ]:
! rm ./audio/*
In [ ]:
import os
if not os.path.exists('./audio'):
  os.mkdir('./audio')
if not os.path.exists('./orig'):
  os.mkdir('./orig')
In [ ]:
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  # The uploaded content is raw bytes, so write in binary mode.
  with open(os.path.join('./orig', fn), 'wb') as f:
    f.write(uploaded[fn])
In [ ]:
# !apt-get -qq install youtube-dl
# !apt-get remove youtube-dl
! pip install youtube-dl # More recent version
Note: the best audio tends to come from studio recordings rather than concerts. This is probably because the Magenta training set was recorded 'dry' (i.e. without reverb). So there should also be an opportunity to build a better model by adding (synthetic) reverb and background noise to the training set.
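As a rough illustration of that idea, here is a minimal sketch of such an augmentation, assuming a simple exponential-decay impulse response plus white noise; the rt60 and noise_db values are arbitrary, and this is not part of the Magenta training pipeline:
In [ ]:
# Illustrative only: add synthetic reverb + noise to a waveform.
# rt60 (decay time) and noise_db are assumed values, not tuned.
def augment_with_reverb(samples, sample_rate, rt60=0.6, noise_db=-40.0):
  t = np.arange(int(sample_rate * rt60)) / float(sample_rate)
  ir = np.random.randn(len(t)) * np.exp(-6.91 * t / rt60)  # ~60 dB decay over rt60
  ir[0] = 1.0  # keep the direct sound dominant
  wet = np.convolve(samples, ir)[:len(samples)]
  noise = np.random.randn(len(samples)) * 10.0 ** (noise_db / 20.0)
  out = wet + noise
  return out / np.max(np.abs(out))  # renormalize to avoid clipping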
In [ ]:
# https://stackoverflow.com/questions/49804874/dowload-the-best-quality-audio-file-with-youtube-dl
# Scarlatti :
yt="https://www.youtube.com/watch?v=yIAk61xEZ80"
# Scarlatti Guitar (not piano) :
#yt="https://www.youtube.com/watch?v=Q-1O6A8P5mM"
# Satie Gnossienne (a bit quiet) :
#yt="https://www.youtube.com/watch?v=IUAF3abGY2M"
# Kissin Chopin Etude Op. 10 No. 4 :
#yt="https://www.youtube.com/watch?v=0TZy4Va97xQ"
# Kissin Winter Wind (perhaps concert recordings aren't the best...) :
#yt="https://www.youtube.com/watch?v=Zsks5L2QPO0"
! youtube-dl --extract-audio --audio-format wav {yt}
! mv *.wav ./orig/
In [ ]:
! ls -l ./orig
! rm ./audio/*
# ! cp ./orig/401* ./audio/ # Peterson Someone To Watch Over Me
# ! cp ./orig/23* ./audio/ # Chopin Winter Wind
! cp "./orig/Scarlatti Sonate K.455, Yuja Wang-yIAk61xEZ80.wav" ./audio/
! ls -l ./audio
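If you would rather select the working file in Python than edit the ! cp line each time, a small helper like this (hypothetical, not in the original notebook) does the same copy:
In [ ]:
import shutil

def use_recording(substring):
  # Copy the first file in ./orig whose name contains `substring` into ./audio.
  for fname in os.listdir('./orig'):
    if substring in fname:
      shutil.copy(os.path.join('./orig', fname), './audio/')
      print('Selected:', fname)
      return
  raise IOError('No file in ./orig matches: ' + substring)

# Example: use_recording('Scarlatti')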
In [ ]:
to_process = []
for fname in os.listdir('./audio'):
  fn = os.path.join('./audio', fname)
  raw_audio, _sample_rate = librosa.core.load(fn, sr=hparams.sample_rate)
  print(raw_audio.shape)
  # These two lines restrict the audio to the first 30 seconds.
  # Remove/modify them if you need a different segment/the whole sample.
  raw_audio = raw_audio[int(hparams.sample_rate * 0.):
                        int(hparams.sample_rate * 30.)]
  print(raw_audio.shape)
  wav_data = audio_io.samples_to_wav_data(
      librosa.util.normalize(raw_audio),
      hparams.sample_rate)
  example = tf.train.Example(features=tf.train.Features(feature={
      'id':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[fn.encode('utf-8')])),
      'sequence':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.NoteSequence().SerializeToString()])),
      'audio':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[wav_data])),
      'velocity_range':
          tf.train.Feature(bytes_list=tf.train.BytesList(
              value=[music_pb2.VelocityRange().SerializeToString()])),
  }))
  to_process.append(example.SerializeToString())
  print('Processing complete for', fn)
In [ ]:
# Create an iterator over the files
session.run(iterator.initializer, {examples: to_process})
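# The dataset built by data.provide_batch is fed through the `examples`
# placeholder, so re-running this cell with a new `to_process` list
# re-initializes the iterator on the new audio.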
In [ ]:
# The fetched tensors are probabilities (already sigmoid-ed), so a fixed
# 0.5 threshold turns them into boolean piano-roll predictions.
filenames, frame_probs, onset_probs, velocity_values = session.run([
    batch.filenames,
    frame_probs_flat,
    onset_probs_flat,
    velocity_values_flat
])

print('Inference complete for', filenames[0])

frame_predictions = frame_probs > 0.5
onset_predictions = onset_probs > 0.5

sequence_prediction = infer_util.pianoroll_to_note_sequence(
    frame_predictions,
    frames_per_second=data.hparams_frames_per_second(hparams),
    min_duration_ms=0,
    onset_predictions=onset_predictions,
    velocity_values=velocity_values)
mm.plot_sequence(sequence_prediction)
In [ ]:
mm.play_sequence(sequence_prediction, mm.midi_synth.fluidsynth)
In [ ]:
frame_predictions.shape  # (8099, 88) booleans
#onset_predictions.shape  # (8099, 88) booleans
#velocity_values.shape  # (8099, 88), values range: -0.51426625 ... 1.3687868
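The 0.5 cutoffs used earlier are just the notebook's defaults. Since the probabilities are already in memory, you can experiment with other thresholds without re-running inference; the values below are arbitrary examples, not recommendations:
In [ ]:
# Hypothetical re-thresholding: more permissive frames, stricter onsets.
frame_predictions = frame_probs > 0.3
onset_predictions = onset_probs > 0.7
sequence_prediction = infer_util.pianoroll_to_note_sequence(
    frame_predictions,
    frames_per_second=data.hparams_frames_per_second(hparams),
    min_duration_ms=0,
    onset_predictions=onset_predictions,
    velocity_values=velocity_values)
mm.plot_sequence(sequence_prediction)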
Optionally run the following cell to download a MIDI version of the inferred transcription.
In [ ]:
# filenames[0] comes back from session.run as bytes; decode before appending.
midi_filename = (filenames[0].decode('utf-8') + '.mid').replace(' ', '_')
midi_io.sequence_proto_to_midi_file(sequence_prediction, midi_filename)
files.download(midi_filename)