In [15]:
import gzip
import tarfile
import numpy as np
import pandas as pd
import h5py as h5
import os
import os.path as osp
import glob
from sklearn import preprocessing
import math
import time
from scipy.spatial.distance import cosine
from itertools import combinations

Reading the data

  • The data has to be an HDF5 (.h5) file.
  • data_path should contain the path to this .h5 file.

In [44]:
data_path = os.path.join('MillionSongSubset', 'AdditionalFiles', 'subset_msd_summary_file.h5')
features = ['duration', 'end_of_fade_in','key', 'loudness', 'mode', 'start_of_fade_out', 'tempo', 'time_signature']
verbose = False
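
As a quick sanity check (an optional sketch, not part of the original notebook), the keys of the store can be listed to confirm that the summary table used below, '/analysis/songs', is present:

with pd.HDFStore(data_path, mode='r') as store:
    # expect '/analysis/songs' among the listed keys
    print(store.keys())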

Helper functions


In [35]:
def get_feature_matrix(feature, data):
    '''
        Reads the data and the feature names and returns the track ids and the feature matrix.
        The track_id field from the data is mandatory, therefore it is always included as the first column.
        Args:
            feature(list of str): list containing the feature names that will be included
            in the feature matrix.
            data(pandas.io.pytables.HDFStore): store containing the data.
        Returns:
            (numpy.ndarray, numpy.ndarray): (N,) array of track_ids, feature matrix (N, D).
    '''
    if 'track_id' in feature:
        songs = np.asarray(data['analysis/songs'][feature])
    else:
        songs = np.asarray(data['analysis/songs'][['track_id'] + feature])

    return np.array(songs[:, 0]), np.array(songs[:, 1:], dtype=np.float64)

def get_random_vector(n):
    '''
        Returns a vector whose entries are drawn uniformly at random from {-1, 1}.
        Args:
            n (int) : size of the vector.
        Returns:
            ndarray : vector of length n with values in {-1, 1}.
    '''
    return 2*np.random.randint(0, 2, n) - 1

def cosine_angle(a, b):
    '''
        Returns the cosine of the angle between two given vectors.
        Args:
            a(numpy.ndarray): vector of real values.
            b(numpy.ndarray): vector of real values.
        Returns:
            double: the cosine of the angle between a and b.
    '''
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_distance(a, b):
    '''
        Returns the cosine distance between two vectors.
        Args:
            a(numpy.ndarray): vector of real values.
            b(numpy.ndarray): vector of real values.
        Returns:
            double: the cosine distance of a and b.
    '''
    return 1.0 - cosine_angle(a, b)

def get_hash_bands(sketch, r, b):
    '''
        Computes a hash matrix (#samples, #bands): for each song and each band,
        the r bits of that band are interpreted as an integer hash value.
        Args:
            sketch(numpy.ndarray): sketch matrix (#samples, #rows*#bands) with values in domain {0,1}.
            r(int): number of rows per band.
            b(int): number of bands.
        Returns:
            numpy.ndarray: (#samples, #bands) matrix of hash values, one per song and band.
    '''
    # powers of two used to read each band's r bits as an integer (least significant bit first)
    twos = np.array([1 << i for i in range(r)])
    hashes = np.zeros((np.size(sketch, 0), b))
    for i in range(b):
        hashes[:, i] = np.dot(sketch[:, (r*i):(r*(i+1))], twos)

    return hashes.astype(np.uint64)
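
For intuition, a tiny toy call (hypothetical values, not part of the notebook): with r = 3 rows and b = 2 bands, each band's bit pattern is read as a base-2 integer, least significant bit first.

toy = np.array([[1, 0, 1,  0, 1, 1]])   # one song, 2 bands of 3 bits each
print(get_hash_bands(toy, 3, 2))        # -> [[5 6]]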

In [45]:
'''
Operations:
- Get data
- Build feature matrix
- Standardize it (zero mean, unit variance per feature)

track_ids: vector(#samples)
feature_matrix: matrix(#samples x #features)
'''
songs = pd.HDFStore(data_path)
track_ids, feature_matrix = get_feature_matrix(features, songs)
feature_matrix = preprocessing.scale(feature_matrix)

if verbose:
    print("Shape track_ids:",track_ids.shape)
    print("Shape feature matrix:",feature_matrix.shape)

LSH Cosine Similarity


In [38]:
# data and algorithm parameters
'''
D = number of features
N = number of samples
b = number of bands
r = number of rows
eps = angle threshold (degrees)
'''
D = np.size(feature_matrix,1)
N = np.size(feature_matrix, 0)
b = 3
r = 64
eps = 2
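
To see how b and r shape the detection probability (an illustrative sketch, not part of the original notebook): with sign random projections, two vectors at angle theta agree on any single bit with probability p = 1 - theta/pi, so they collide in at least one of the b bands, and hence become a candidate pair, with probability 1 - (1 - p^r)^b.

def candidate_probability(angle_deg, r=r, b=b):
    # probability that a pair at the given angle hashes identically in at least one band
    p = 1.0 - math.radians(angle_deg) / math.pi
    return 1.0 - (1.0 - p**r)**b

for angle_deg in [1, 2, 5, 10, 30]:
    print(angle_deg, "deg ->", round(candidate_probability(angle_deg), 4))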

In [46]:
'''
Operations:
- Generate matrix of random vectors with values in {-1,1}.

RV: matrix(#bands*#rows x n_features)
'''
RV = np.array([get_random_vector(D) for i in range(b*r)])

if verbose:
    print("Shape RV:",np.shape(RV))
    print("Random vectors matrix RV:\n",RV)

In [47]:
'''
Operations:
- Generate the sketch matrix by projecting the feature matrix onto the random vectors
  and keeping only the sign of each projection (values in {-1, 1}).
- Break ties (exact zeros) randomly.

Sketch: matrix(n_samples x n_bands*n_rows)
'''
Sketch = np.dot(feature_matrix, RV.T)
Sketch[Sketch < 0] = -1
Sketch[Sketch > 0] = 1
# break ties independently per entry
Sketch[Sketch == 0] = 2*np.random.randint(0, 2, size=np.count_nonzero(Sketch == 0)) - 1


if verbose:
    print("Shape Sketch:",Sketch.shape)
    print("Sketch:\n",Sketch)

In [48]:
# map the Sketch values from {-1, 1} to {0, 1} so they can be hashed as bits.
Sketch[Sketch < 0] = 0


if verbose:
    print("Shape Binary Sketch:",Sketch.shape)
    print("Binary Sketch:\n",Sketch)

In [49]:
hb = get_hash_bands(Sketch, r, b)

if verbose:
    print("Shape hb:",hb.shape)
    print("hb:\n",hb)

Algorithm Description

  • for each band b we store a hash_table in the dictionary buckets[b].
  • for each band b:
    • for each song s:
      • add song s to the bucket to which it hashes in the dictionary buckets[b].
    • for each key in buckets[b]:
      • if it contains more than one element, it contains candidates for duplication.
        • for all elements in this list, generate all unordered pairs.
        • for each such pair, if it is not contained in the candidates dictionary, add the pair with the corresponding cosine distance.
  • for each pair in candidates:
    • check whether its cosine distance is below 1 - cos(eps), the cosine distance corresponding to the angle threshold; if so, we consider the pair a duplicate (see the worked value after this list).
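
For eps = 2 degrees the threshold works out to a very small cosine distance, so only near-parallel feature vectors count as duplicates; a quick worked check, not part of the notebook's cells:

print(1 - math.cos(math.radians(2)))   # ~0.00061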

In [43]:
'''
candidates(dict): Dictionary with key=(song_id, song_id), value=cosine_distance of the pair
duplicates(list): List of tuples (song_id, song_id)
buckets(dict)   : Dictionary with key=band_id, value=dict with key=hash_key, value=list of song_id
'''

candidates = {}
duplicates = []
buckets = { i : {} for i in range(b) }

start = time.time()
for i in range(b):
    for j in range(N):
        hash_key = hb[j,i]
        if hash_key not in buckets[i]:
            buckets[i][hash_key] = []
        buckets[i][hash_key].append(j)
    for candidates_list in buckets[i].values():
        if len(candidates_list) > 1:
            for _i in range(len(candidates_list)):
                for _j in range(_i+1,len(candidates_list)):
                    songA = candidates_list[_i]
                    songB = candidates_list[_j]
                    if (songA,songB) not in candidates:
                        candidates[(songA,songB)] = cosine_distance(feature_matrix[songA,:],feature_matrix[songB,:])


cos_eps_dist = 1-math.cos(math.radians(eps))
for key in candidates.keys():
    if candidates[key] < cos_eps_dist:
        songA = key[0]
        songB = key[1]
        duplicates.append((songA,songB))
        

print("LSH Duration:", time.time() - start,"sec")
print("Nr. candidates:", len(candidates.keys()))
print("Nr. duplicates:",len(duplicates))


LSH Duration: 0.3489410877227783 sec
Nr. candidates: 20215
Nr. duplicates: 21
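
A possible follow-up (a sketch, not part of the notebook's output) is to map a few of the reported index pairs back to their track ids for manual inspection:

for songA, songB in duplicates[:5]:
    # track ids of the duplicate pair and their cosine distance
    print(track_ids[songA], track_ids[songB], candidates[(songA, songB)])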

Results

Samples   Duration (sec)   Candidates   Duplicates   Algorithm
10000     0.684            12531        22           LSH Cosine