In [54]:
import gzip
import tarfile
import numpy as np
import pandas as pd
import h5py as h5
import os
import glob
from sklearn import preprocessing
import math
import time
from scipy.spatial.distance import cosine
from itertools import combinations
In [55]:
# 1 million summary data. Takes long!
data_path = 'msd_summary_file.h5'
# subset(10000 songs) summary data
data_path = 'MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5'
features = ['duration', 'end_of_fade_in','key', 'loudness', 'mode', 'start_of_fade_out','tempo','time_signature']
debug_print = True
In [56]:
# Comment out, when debugging.
np.random.seed(42)
In [57]:
'''
Reads the data and the feature names and returns the track ids and the feature matrix.
The track_id field from the data is mandatory, therefore it will always be included
Args:
feature_names(list of strings): list containing the feature names that will be included in the feature matrix.
data(pandas.io.pytables.HDFStore table): table containing the data.
Returns:
(numpy.ndarray, numpy.ndarray): matrix(#samples x 1) of track_ids, feature matrix(#samples x #features).
'''
def get_feature_matrix(feature_names,data):
if 'track_id' in feature_names:
songs = np.asarray(data['/analysis/songs'][feature_names])
else:
songs = np.asarray(data['/analysis/songs'][['track_id'] + feature_names])
#indices = np.random.choice(range(np.size(songs,0)), sample_size)
return np.array(songs[:,0]),np.array(songs[:,1:],dtype=np.float64)
'''
Returns a vector with normal distributed {0,1} values.
Args:
n(int) : size of the vector.
Returns:
list(int): list of length n of normal distributed {0,1} values.
'''
def get_random_vector(n):
#rv = np.random.normal(0, 1, n)
#rv[rv < 0] = -1
#rv[rv > 0] = 1
#rv[rv == 0] = np.random.randint(0,2)*2-1
rv = 2*np.random.randint(0,2,n)-1
return rv
'''
Returns the cosine of the angle of two given vectors
Args:
a(numpy.ndarray): vector of real values.
b(numpy.ndarray): vector of real values.
Returns:
double: Returns the cosine of the angle of a and b.
'''
def cosine_angle(a,b):
return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))
'''
Returns the cosine distance between two vectors.
Args:
a(numpy.ndarray): vector of real values.
b(numpy.ndarray): vector of real values.
Returns:
double: Returns the cosine distance of a and b.
'''
def cosine_distance(a,b):
return 1.0-cosine_angle(a,b)
'''
Returns a matrix(#samples x #bands) with hash-values for the different song based on the respective bands.
Args:
Sketch(numpy.ndarray): Sketch matrix(#samples x #rows*#bands) with values in domain {0,1}.
r(int): number of rows
b(int): number of bands
Returns:
(numpy.ndarray, numpy.ndarray): Returns a matrix(#samples x #bands) with hash-values for the different song based on the respective bands.
'''
def get_hash_bands(Sketch, r,b):
twos = np.array([1<<i for i in range(r)])
Hashes = np.zeros((np.size(Sketch,0),b))
for i in range(b):
Hashes[:,i] = np.dot(Sketch[:,(r*i):(r*(i+1))], twos)
return Hashes.astype(np.uint64)
In [58]:
'''
Operations:
- Get data
- Build feature matrix
- 0-1 normalize it
track_ids: matrix(#samples x 1)
feature_matrix: matrix(#samples x #features)
'''
songs = pd.HDFStore(data_path)
track_ids, feature_matrix = get_feature_matrix(features, songs)
feature_matrix = preprocessing.scale(feature_matrix)
if debug_print:
print("Shape track_ids:",track_ids.shape)
print("Shape feature matrix:",feature_matrix.shape)
In [59]:
# data and algorithm parameters
'''
D = number of features
N = number of samples
b = number of bands
r = number of rows
eps = angle threshold(degrees)
'''
D = np.size(feature_matrix,1)
N = np.size(feature_matrix, 0)
b = 3
r = 64
eps = 2
In [60]:
'''
Operations:
- Generate matrix of random vectors with values in {-1,1}.
RV: matrix(#bands*#rows x n_features)
'''
RV = np.array([get_random_vector(D) for i in range(b*r)])
if debug_print:
print("Shape RV:",np.shape(RV))
print("Random vectors matrix RV:\n",RV)
In [61]:
'''
Operations:
- Generate sketch matrix, by performing
Clip sketch matrix to 0-1 range for hashing.
Dimensionality: n_samples x n_bands*n_rows
'''
Sketch = np.dot(feature_matrix, RV.T)
Sketch[Sketch < 0] = -1
Sketch[Sketch > 0] = 1
Sketch[Sketch == 0] = np.random.randint(0,2)*2-1
if debug_print:
print("Shape Sketch:",Sketch.shape)
print("Sketch:\n",Sketch)
In [62]:
# clip values of Sketch matrix in domain {0,1} to easily hash them.
Sketch[Sketch < 0] = 0
if debug_print:
print("Shape Binary Sketch:",Sketch.shape)
print("Binary Sketch:\n",Sketch)
In [63]:
hb = get_hash_bands(Sketch,r, b)
if debug_print:
print("Shape hb:",hb.shape)
print("hb:\n",hb)
In [64]:
'''
candidates(dict): Dictionary with key=(song_id,song_id), value=cosine_distance(song_id,song_id)
duplicates(list): List of tuples (songid, songid)
buckets(dict) : Dictionary with key=band_id, value=dict with key=hash_key, value = list of song_id
'''
candidates = {}
duplicates = []
buckets = { i : {} for i in range(b) }
start = time.time()
for i in range(b):
for j in range(N):
hash_key = hb[j,i]
if hash_key not in buckets[i]:
buckets[i][hash_key] = []
buckets[i][hash_key].append(j)
for candidates_list in buckets[i].values():
if len(candidates_list) > 1:
for _i in range(len(candidates_list)):
for _j in range(_i+1,len(candidates_list)):
songA = candidates_list[_i]
songB = candidates_list[_j]
if (songA,songB) not in candidates:
candidates[(songA,songB)] = cosine_distance(feature_matrix[songA,:],feature_matrix[songB,:])
cos_eps_dist = 1-math.cos(math.radians(eps))
for key in candidates.keys():
if candidates[key] < cos_eps_dist:
songA = key[0]
songB = key[1]
duplicates.append((songA,songB))
print("LSH Duration:", time.time() - start,"sec")
print("Nr. candidates:", len(candidates.keys()))
print("Nr. duplicates:",len(duplicates))
In [ ]: