This practical uses the audio fingerprinting code from https://github.com/dpwe/audfprint .
You should download it as a directory audfprint/
into the same directory as this notebook file.
This fingerprint, based on the description in Avery Wang's paper, arranges local peaks in the spectrogram into pairs, then builds an index mapping hashes based on the frequencies and time spacing of the pairs to every track that contains them. A good match is found when multiple common hashes have the same relative timing in query and reference.
In [1]:
%pylab inline
from __future__ import print_function
import cPickle as pickle
import os
import time
import IPython
import numpy as np
import librosa
#os.chdir('/Users/dpwe/Downloads/e4896/elene4896/prac12')
from audfprint import audfprint_analyze
from audfprint import audfprint_match
from audfprint import hash_table
from audfprint import audio_read
In [2]:
def read_list_file(filename):
"""Read a text file with one item per line."""
items = []
with open(filename, 'r') as f:
for line in f:
items.append(line.strip())
return items
def my_imshow(data, **kwargs):
"""Wrapper for imshow that sets common defaults."""
plt.imshow(data, interpolation='nearest', aspect='auto',
origin='bottom', cmap='gray_r', **kwargs)
In [3]:
# Read in a local query file.
# You can download these files at http://www.ee.columbia.edu/~dpwe/e4896/code/prac12/01-Taxman.mp3
# and http://www.ee.columbia.edu/~dpwe/e4896/code/prac12/Come_Together.mp3
filename = '01-Taxman.mp3'
#filename = 'Come_Together.mp3'
y, sr = librosa.load(filename, sr=None)
print("sr=", sr, "y.shape=", y.shape)
In [4]:
# The Analyzer object manages the creation of landmark hashes for the query.
analyzer = audfprint_analyze.Analyzer()
analyzer.target_sr = sr
analyzer.density = 10 # Target number of hashes per second.
peaks = analyzer.find_peaks(y, sr)
print("len(peaks)=", len(peaks))
print(peaks[:5])
frame_hop = 256 # Analyzer always uses 512 point FFTs with 256 point advance.
frame_rate = sr / float(frame_hop)
print("frame_rate=", frame_rate, "Hz")
In [5]:
# Plot the peaks that were found over the equivalent spectrogram.
sgram = np.abs(librosa.stft(y, n_fft=512, hop_length=256))
plt.figure(figsize=(10,5))
my_imshow(20*np.log10(np.maximum(1e-3, sgram)))
peaks = np.array(peaks)
plot(peaks[:, 0], peaks[:, 1], '.r')
xlim([1000, 1500])
ylim([0, 256])
xlabel("time / frames")
ylabel("freq / bins")
Out[5]:
In [6]:
# Convert the individual peaks into landmark pairs by finding
# nearby instances.
landmarks = np.array(analyzer.peaks2landmarks(peaks))
print("len(landmarks)=", len(landmarks))
# Each row of landmarks contains the start time bin, the start and end frequency bins, and the time separation.
print(" t0 f1 f2 dt")
print(landmarks[:5])
In [7]:
# To look up in the database, each (f1, f2, dt) triple is converted into a 20-bit binary number.
hashes = audfprint_analyze.landmarks2hashes(landmarks)
print("len(hashes)=", len(hashes))
print("time hash")
print(hashes[:5])
In [8]:
# We can convert the hashes back into the (f1, f2, dt) triples and plot them on the spectrogram.
def show_hashes(hashes, d=None, sr=None, color='b'):
"""Plot the hashes on a spectrogram."""
if d is not None:
sgram = np.abs(librosa.stft(d, n_fft=512, hop_length=256))
my_imshow(20*np.log10(np.maximum(1e-3, sgram)))
landmarks = np.array(audfprint_analyze.hashes2landmarks(hashes))
t0 = landmarks[:, 0]
f1 = landmarks[:, 1]
f2 = landmarks[:, 2]
dt = landmarks[:, 3]
plot([t0, t0 + dt], [f1, f2], '-' + color)
plot([t0, t0 + dt], [f1, f2], '.' + color)
plt.figure(figsize=(10,5))
show_hashes(hashes, y, sr)
xlim([1000, 1500])
ylim([0, 256])
Out[8]:
audfprint
will take a list of audio files and build a big database from all their hashes.
I precomputed one from the 32 kbps MP3 versions of the 8752 tracks that comprise
uspop2002. For
reference, the exact command is below. I ran it on parallel on 8 cores and it took
about 20 minutes to ingest all the files.
porkpie:/scratch/dpwe > time python ~/projects/audfprint/python/audfprint.py new -d uspop-n10-b20.fpdb -W ~drspeech/data/music/mp3s-32k -V .mp3 -r 16000 -l ~drspeech/data/music/mp3s-mp3s.txt -H 8 -n 10 -b 20
hash_table 0 has 1094 files 2616250 hashes
hash_table 1 has 1094 files 2654923 hashes
hash_table 2 has 1094 files 2636462 hashes
hash_table 3 has 1094 files 2710421 hashes
hash_table 4 has 1094 files 2620202 hashes
hash_table 5 has 1094 files 2692489 hashes
hash_table 6 has 1094 files 2647007 hashes
hash_table 7 has 1094 files 2643872 hashes
Saved fprints for 8752 files ( 21151809 hashes) to uspop-n10-b20.fpdb
Dropped hashes= 6571527 (31.07%)
16653.208u 1197.230s 37:32.89 792.3% 0+0k 0+109080io 0pf+0w
You can download the database as uspop-n10-b20.fpdb (55 MB).
In [9]:
# Read in the precomputed fingerprint database.
# Hack to allow loading of pickled class when audfprint used as module.
sys.modules['hash_table'] = hash_table
# To keep it small, the database was built with density 10 hashes/sec (-n 10)
# and a maximum of 20 entries per hash (-b 20).
database_file = 'uspop-n10-b20.fpdb'
database = hash_table.HashTable(filename=database_file)
In [10]:
# This database has full paths for the names. Define a function to return a shorter name.
def track_name(database, index):
return database.names[index][49:-4]
In [11]:
# Main contents of hash table is database.table, which holds track/time values
# for occurrence of each hash. Show the content for one bin from the original
# query track. The bottom 14 bits of each entry hold the time offset, and the
# top 18 bits (i.e. value/16384) hold the track index.
hash_number = 100
print('\n'.join([track_name(database, i) for i in database.table[hashes[hash_number][1]] / 16384 if i > 0]))
print(database.table[hashes[hash_number][1]] % 16384)
In [12]:
# The Matcher object manages matching a query track against a database.
matcher = audfprint_match.Matcher()
results = matcher.match_hashes(database, hashes)
# Rows in results are [id, filt_count, time_skew, raw_count, orig_rank, min_time, max_time].
print(" id #match t_skew #raw")
print(results[:, :4])
print("name for id", results[0, 0], "is", track_name(database, results[0, 0]))
In [13]:
# We can actually break down what match_hashes does to get a little more insight.
# First, return all the hits from the database entries for the hashes present in the query.
hits = database.get_hits(hashes)
# hits has one row per hash hit with columns [id time_skew_ref_query hash time_in_query]
print("hits.shape=", hits.shape)
print(" id t_skew hash t_query")
print(hits[:5])
result_rank = 0
match_id = results[result_rank, 0]
match_t_skew = results[result_rank, 2]
match_hits = np.nonzero(hits[:, 0] == match_id)[0]
print("#hits for result #", result_rank, "(id", match_id, ")=", len(match_hits))
plot(hits[match_hits, 3], hits[match_hits, 1], '.')
#xlim([8000, 10000])
#ylim([-100, 0])
xlabel('time_in_query')
ylabel('time_skew')
matching_hashes = matcher._unique_match_hashes(match_id, hits, match_t_skew)
# Matching hashes are returned as (time_in_query, hash) rows.
print("matching_hashes.shape=", matching_hashes.shape)
print(" t_query hash")
print(matching_hashes[:5])
In [14]:
# Plot all the query hashes, then overplot the matching ones in red.
plt.figure(figsize=(10,5))
show_hashes(hashes, y, sr)
show_hashes(matching_hashes, color='r')
xlim([0, 2000])
ylim([0, 256])
Out[14]:
In [15]:
# Define a function to take a waveform and return the matches.
def match_hashes(query_hashes, database, matcher, verbose=False):
"""Given a query already converted to hashes, do match."""
match_results = matcher.match_hashes(database, query_hashes)
for index, result in enumerate(match_results):
track_id, match_count, time_offset, raw_count = result[:4]
if verbose:
print("Match", index, ":", track_name(database, track_id),
"aligned", match_count, "of", raw_count, "raw hash matches",
"at time offset", time_offset)
return match_results
def match_waveform(y, sr, database, analyzer, matcher, verbose=False):
"""Convert waveform to hashes, return match result."""
peaks = analyzer.find_peaks(y, sr)
landmarks = np.array(analyzer.peaks2landmarks(peaks))
query_hashes = audfprint_analyze.landmarks2hashes(landmarks)
return (match_hashes(query_hashes, database, matcher, verbose),
query_hashes)
results, query_hashes = match_waveform(y, sr, database, analyzer, matcher, verbose=True)
In [16]:
# Utilty functions to plot the scatter of landmark alignments,
# and the hashes over the spectrogram.
def filter_hits_for_match(hits, match_id, match_t_skew=None):
"""Filter a list of returns from database.get_hits for a particular ID (and perhaps time skew)."""
# hits has one row per hash hit with columns [id time_skew_ref_query hash time_in_query].
num_hits = hits.shape[0]
if match_t_skew is None:
# Return all hits for this ID.
t_skew_ok = np.ones(num_hits)
else:
# Filter for hits with consistent timing.
# How close per-landmark time skew has to be to count as matching overall time skew.
t_thresh = 1
t_skew_ok = np.abs(hits[:, 1] - match_t_skew) <= t_thresh
match_hits = np.nonzero(np.logical_and(hits[:, 0] == match_id, t_skew_ok))[0]
# Return full (ID, t_skew, hash, time) rows.
return hits[match_hits]
def display_match_alignment(database, query_hashes, match_id):
"""Plot a scatter of matching query and reference hashes for a certain ID."""
match_hits = filter_hits_for_match(database.get_hits(query_hashes), match_id)
print("#match_hits=", len(match_hits))
plot(match_hits[:, 3], match_hits[:, 1], '.')
xlabel('time_in_query')
ylabel('time_skew')
return match_hits
def display_match_hashes(y, sr, database, query_hashes, match_id, match_t_skew):
"""Show all the hashes for query, and the match subset."""
matching_hashes = filter_hits_for_match(database.get_hits(query_hashes),
match_id, match_t_skew)[:, [3, 2]]
# Plot all the query hashes, then overplot the matching ones in red.
show_hashes(query_hashes, y, sr)
show_hashes(matching_hashes, color='r')
return matching_hashes
# So we could regenerate the images from above with simply:
# _ = display_match_alignment(database, query_hashes, results[0, 0])
# _ = display_match_hashes(y, sr, database, query_hashes, results[0, 0], results[0, 2])
In [17]:
# Helper functions to generate and test a whole set of queries.
def excerpt_waveform(y, sr, start_time_seconds, duration_seconds):
"""Excerpt the indicated time range from a waveform."""
result = []
y_duration_seconds = y.shape[0] / float(sr)
if start_time_seconds > y_duration_seconds:
start_time_seconds = y_duration_seconds
if start_time_seconds < -duration_seconds:
start_time_seconds = -duration_seconds
if start_time_seconds < 0:
result = [np.zeros(int(round(sr * -start_time_seconds)))]
duration_seconds += start_time_seconds
start_time_seconds = 0
end_time_seconds = start_time_seconds + duration_seconds
result.append(y[int(round(sr * start_time_seconds)) :
int(round(sr * np.minimum(end_time_seconds,
y_duration_seconds)))])
if y_duration_seconds < end_time_seconds:
result.append(np.zeros(int(round(sr * (end_time_seconds -
y_duration_seconds)))))
return np.concatenate(result)
def generate_random_excerpts(filenames, duration=30.0, wavreadfunction=librosa.load):
"""Extract random excerpts from a list of sound files."""
excerpts = []
start_times = np.zeros(len(filenames))
for index, filename in enumerate(filenames):
y, sr = wavreadfunction(filename)
file_duration = y.shape[0] / float(sr)
start_point = np.random.rand(1) * (file_duration - duration)
excerpt = excerpt_waveform(y, sr, start_point, duration)
start_times[index] = start_point
excerpts.append(excerpt)
return np.vstack(excerpts), sr, start_times
def eval_fprint(query_waveforms, sr, truth, database, analyzer, matcher, duration=None, snr=None):
"""Run a set of queries through fingerprinting, count number correct."""
num_queries = len(query_waveforms)
all_results = np.zeros((num_queries, 4), int)
for index, query in enumerate(query_waveforms):
processed_query = query.copy()
if snr is not None:
processed_query += np.sqrt(np.mean(query ** 2)) * (10 ** (-snr/20.0)) * np.random.randn(query.shape[0])
if duration is not None:
processed_query = processed_query[:int(sr * duration)]
result, _= match_waveform(processed_query, sr, database, analyzer, matcher)
if len(result) > 0:
all_results[index] = result[0][:4]
print("Correct rate=", "{:.3f}".format(np.mean(all_results[:, 0] == truth)))
print("Avg matching hashes=", "{:.1f}".format(np.mean(all_results[:, 1].astype(float))))
return all_results
In [18]:
# We can get the reference item audio via the lab's web server.
# and read them into the workspace
import urllib2
import tempfile
URL_BASE = 'http://labrosa.ee.columbia.edu:8013/uspop'
def read_mp3_from_url(url, sr=None):
"""Read a soundfile from a URL."""
response = urllib2.urlopen(url)
mp3data = response.read()
response.close()
handle = tempfile.NamedTemporaryFile(delete=False)
handle.write(mp3data)
handle.close()
y, sr = librosa.load(handle.name, sr=sr)
os.remove(handle.name)
return y, sr
# Pull one uspop track from the web server by its ID.
def read_track_from_server(id_):
return read_mp3_from_url(URL_BASE + '/' + id_ + '.mp3', sr=None)
track_id = track_name(database, 356)
print(track_id)
y, sr = read_track_from_server(track_id)
print(sr, y.shape)
IPython.display.Audio(data=y, rate=sr)
Out[18]:
In [19]:
# A list of all the IDs, taken from the reference database.
ids = [track_name(database, i) for i in xrange(len(database.names))]
# A short list of indices to use as example queries.
query_indices = np.arange(0, 1000, 100)
# Create random excerpts to use as queries.
queries, sr, start_times = generate_random_excerpts([ids[i] for i in query_indices],
duration=30.0, wavreadfunction=read_track_from_server)
# Run all the excerpts against the matcher, and see how many give the right response.
results = eval_fprint(queries, sr, query_indices, database, analyzer, matcher, duration=30.0, snr=None)
print("top hit id #match t_skew #raw")
print(results)
In [20]:
# You can retrieve the hashes associated with a particular track in the database...
retrieved_hashes = database.retrieve(database.names[356])
print(len(retrieved_hashes))
# .. then you can match them against the database to find duplicates.
results = match_hashes(retrieved_hashes, database, matcher, verbose=True)
print(results)
In [ ]: