In [1]:
import math
import os
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from scipy import spatial
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity

plt.rcParams["figure.figsize"] = (20,5)

import doremus_data

Load data.


In [154]:
training_data_folder = '/Users/pasquale/git/recommender/training_data'
emb_folder = '/Users/pasquale/git/music-embeddings'

doremus_data.init(training_data_folder, emb_folder)
vectors, uris, lbs, heads, heads_print = doremus_data.get_embeddings('artist')
pd.DataFrame(heads_print)


Out[154]:
     0           1           2        3         4      5    6
0  mop  birth_date  death_date  casting  function  genre  key
1    3           1           1        3         3      3    3

In [155]:
all_training = doremus_data.all_training('artist')

In [319]:
_l = 3

for t in all_training:
    temp_playlists = []
    for pl in t['playlists']:
        # drop tracks that have no embedding
        missing = [art not in uris for art in pl['data']]
        pl['data'] = np.delete(pl['data'], np.where(missing))
        # skip playlists that are too short
        if len(pl['data']) < 6:
            continue
        # sliding windows of _l consecutive tracks
        for i in np.arange(len(pl['data']) - _l):
            temp_playlists.append(pl['data'][i:i+_l])
    t['groups'] = [{'name': str(index), 'data': pl} for index, pl in enumerate(temp_playlists)]
    print('%s\t\t%d' % (t['name'], len(temp_playlists)))


pp_concerts		414
itema3_concerts		1044
web-radio		2504
spotify_pl		2816
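
To make the windowing concrete: with _l = 3, a six-track playlist yields three overlapping windows. A toy check (note that the loop stops at len - _l, so the last possible window, starting at that index, is not emitted):

In [ ]:
demo = np.array(['t0', 't1', 't2', 't3', 't4', 't5'])
print([demo[i:i + _l].tolist() for i in np.arange(len(demo) - _l)])
# [['t0', 't1', 't2'], ['t1', 't2', 't3'], ['t2', 't3', 't4']]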

In [320]:
def training_stats(t):
    num_playlists = len(t['playlists'])
    num_track = [len(p['data']) for p in t['playlists']]
    distinct = len(np.unique(np.concatenate([p['data'] for p in t['playlists']])))
    return num_playlists, np.sum(num_track), np.mean(num_track), distinct

names = [t['name'] for t in all_training]
# for t in all_training:
#     num_playlists, num_track = training_stats(t)
#     print('%s\t\t%d' % (t['name'], num_playlists))
pd.DataFrame([training_stats(t) for t in all_training], index=names, columns='playlists,tracks,tracks per pl,distinct tracks'.split(','))


Out[320]:
                 playlists  tracks  tracks per pl  distinct tracks
pp_concerts            223    1031          4.623              715
itema3_concerts        414    2324          5.614             1109
web-radio              155    2973         19.181              738
spotify_pl              70    3026         43.229             1012

Data pre-processing


In [321]:
negVector = -2. * np.ones_like(vectors[0], dtype=np.float32)

def get_embs(x, masked=False):
    # uri to embedding
    v = vectors[np.argwhere(uris == x)]
    if v.size == 0:
        print(x)
        result = np.array(negVector)
    else:
        result = np.array(v[0][0])
    if masked:
        result = np.ma.array(result, mask=result < -1.)
    return result

def get_label(x):
    # uri to label
    l = lbs[np.argwhere(uris == x)]
    return l[0][0] if l.size > 0 else 'none'

In [322]:
np.set_printoptions(2)
def compute_playlist_stats(playlist, to_print=False):
    pl = playlist['data']
    embeddings = np.array([get_embs(xi) for xi in pl])
    emb_len = len(embeddings[0])
    
    ma_embeddings = np.ma.array(embeddings, mask=embeddings < -1.)

    # do not average dimensions that have at most one observed value:
    # a single value would make mean and std meaningless there
    mul_values = np.where(np.sum(embeddings >= -1., axis=0) > 1, False, True)
    mul_values = np.repeat([mul_values], len(pl), axis=0)
    ma_embeddings = np.ma.array(ma_embeddings, mask=mul_values)

    _mean = ma_embeddings.mean(axis=0)
    _median = np.ma.median(ma_embeddings, axis=0)
    _min = np.ma.min(ma_embeddings, axis=0)
    _max = np.ma.max(ma_embeddings, axis=0)
    _std = np.ma.std(ma_embeddings, axis=0)
    
    if to_print:
        plt.errorbar(np.arange(len(_mean)), _mean, _std, fmt='ok',
                     ecolor='black', elinewidth=1.5, lw=3, capsize=2)
        plt.errorbar(np.arange(len(_mean)), _mean, [_mean - _min, _max - _mean],
                 fmt='.k', ecolor='gray', elinewidth=0.5, lw=1, capsize=1)

        plt.errorbar(np.arange(len(_mean)), _median, fmt='_g',  lw=1)

        plt.xticks(range(len(heads)), heads, rotation=40)
        plt.show()

    return _mean, _median, _std

Sample playlist


In [323]:
base = all_training[3]
print('Base: ' + base['name'])
pl = base['groups'][1023]
print('Playlist: ' + pl['name'])

m, md, s = compute_playlist_stats(pl, True)

for d in pl['data']:
    print(d.replace('data', 'overture'))
    print(get_label(d)) 
    print(np.ma.array(get_embs(d), mask=get_embs(d) < -1.))


Base: spotify_pl
Playlist: 1023
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/numeric.py:492: UserWarning: Warning: converting a masked element to nan.
  return array(a, dtype, copy=False, order=order)
http://overture.doremus.org/artist/10c7575e-ab13-3912-9919-62c0d3003895
Guillaume Dufay
[-- -- -- 0.3333333432674408 0.403809517621994 -0.03878294304013252
 -0.002056780969724059 0.051901187747716904 0.023812396451830864
 -0.08446286618709564 0.15615615248680115 0.05157061666250229
 0.0004409032699186355 -0.01764855533838272 -- -- --]
http://overture.doremus.org/artist/7c8d4e8d-d824-3360-808b-f72e8c88525a
Michael Praetorius
[-- -- -- 0.49619048833847046 0.5438095331192017 -0.027335748076438904
 0.04221932217478752 0.024901214987039566 0.02396547794342041
 -0.08529201149940491 0.15839993953704834 0.04896107316017151
 0.019888639450073242 -0.017865069210529327 -- -- --]
http://overture.doremus.org/artist/dd6c0b38-db65-3eec-b557-6724c3ff3cdf
Antoine Brumel
[-- -- -- -- -- -- -- -- 0.02396547794342041 -0.08529201149940491
 0.15839993953704834 0.057798922061920166 0.08197244256734848
 0.002718365518376231 -- -- --]

In [ ]:

Standard Deviation among all playlists


In [324]:
population_tot = np.sum([len(pl['data']) for tr in all_training for pl in tr['groups']])
std = {}
population = {}
mean = {}
stdw = {}  # std within
stdb = {}  # std between

for tr in all_training:
    name = tr['name']
    std[name] = []
    population[name] = []
    mean[name] = []
    for pl in tr['groups']:
        _mean, _median, _std = compute_playlist_stats(pl, False)
        pop = len(pl['data'])
        population[name].append(pop)
        mean[name].append(_mean)
        # weight of this playlist in the within-group variance
        ww = (pop - 1) / (population_tot - 1)
        std[name].append((_std**2) * ww)

    stdw[name] = np.ma.sum(std[name], axis=0).filled(0)

In [325]:
weighted_means = [np.ma.mean(mean[name], axis=0)*np.sum(population[name]) for name in mean]

mtot = np.ma.sum(weighted_means, axis=0)
mtot /= population_tot
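
For reference, stdw above and stdb in the next cell implement the usual within/between variance decomposition, applied independently to each embedding dimension (notation mine): with $N$ the total number of tracks, and $n_g$, $\bar{x}_g$, $\sigma_g$ the size, mean and standard deviation of group $g$,

$$\sigma^2_w = \sum_g \frac{n_g - 1}{N - 1}\,\sigma_g^2, \qquad \sigma^2_b = \sum_g \frac{n_g}{N - 1}\,\left(\bar{x}_g - \bar{x}\right)^2, \qquad \bar{x} = \frac{1}{N}\sum_g n_g\,\bar{x}_g.$$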

In [326]:
fig, ax = plt.subplots(figsize=(20,5))
width = 0.2
pos = np.arange(len(vectors[0]))
colors = ['#3668C9', '#DA3B21', '#FD9827', '#1D9424']


for index, tr in enumerate(all_training):
    name = tr['name']
    
    # between-group variance: population-weighted squared deviation
    # of the group means from the overall mean
    ww = np.array([pop / (population_tot - 1) for pop in population[name]])
    mg = np.ma.power(np.ma.array(mean[name]).filled(mtot) - mtot, 2)
    stdb[name] = np.ma.sum(mg * ww.reshape(len(ww), 1), axis=0)

    plt.bar([p + index * width for p in pos],
        stdb[name],
        width,
        alpha=0.5,
        color=colors[index],
        label=name + ' (std between)')

    plt.bar([p + index * width for p in pos],
        stdw[name],
        width,
        edgecolor='#000000',
        alpha=.5,
        color=colors[index],
        label=name + ' (std within)')
    

    
flat_std = [stdw[name] for name in stdw]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)

ax.plot(pos, mstd, '--', label='mean')

plt.xticks(range(len(heads)), heads, rotation=40)
ax.set_ylabel('Standard Deviation')
ax.set_xticks([p + 1.5 * width for p in pos])

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='upper left')
plt.show()

print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))

flat_std_b = [stdb[name] for name in stdb]
mstd_b = np.ma.mean(flat_std_b, axis=0)
print('Standard Deviation between: mean' )
print(np.array(mstd_b))


Standard Deviation: mean
[2.23e-05 4.37e-06 1.27e-05 4.99e-04 4.48e-04 6.77e-05 2.27e-05 3.99e-05
 5.58e-06 5.29e-06 1.36e-05 7.39e-06 4.99e-05 2.54e-05 1.85e-04 2.46e-04
 2.43e-04]
Standard Deviation: minimum
[3.98e-06 1.06e-06 2.62e-06 5.98e-05 5.17e-05 2.14e-05 3.36e-06 4.09e-06
 1.64e-06 3.51e-06 6.93e-06 1.01e-06 7.81e-06 3.64e-06 2.29e-05 4.14e-05
 3.05e-05]
Standard Deviation between: mean
[3.02e-05 6.18e-06 1.88e-05 1.94e-03 1.52e-03 8.70e-05 3.33e-05 5.70e-05
 5.71e-06 4.07e-06 1.31e-05 1.51e-05 1.12e-04 3.64e-05 2.14e-04 2.97e-04
 2.92e-04]

Explanation

When the standard deviation within (black border) is smaller than the standard deviation between (no border), the values of that dimension are more homogeneous inside the group than across groups. If this difference in homogeneity inside/outside is large, I can state that the dimension drives the playlist generation.
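
As a minimal sketch (using the stdw and stdb dictionaries computed above), the driving dimensions of each dataset can be flagged by a direct comparison:

In [ ]:
# dimensions whose values are more homogeneous inside playlists than across them
for name in stdw:
    driving = stdb[name] > stdw[name]
    print(name, [h for h, v in zip(heads, driving) if v])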

Graphs property by property


In [327]:
def display_graph(feat):
    pos = np.where(np.array(heads) == feat)[0]
    pos_slide = np.arange(len(pos))
    fig, ax = plt.subplots(figsize=(20,5))

    for index, tr in enumerate(all_training):
        name = tr['name']
        plt.bar([p + index * width for p in pos_slide],
            stdb[name][pos],
            width,
            alpha=0.5,
            color=colors[index],
            label=name + ' (std between)')

        plt.bar([p + index * width for p in pos_slide],
            stdw[name][pos],
            width,
            alpha=0.5,
            edgecolor='#000000',
            color=colors[index],
            label=name + ' (std within)')

    ax.plot(pos_slide, mstd[pos], '--', label='mean')

    plt.xticks(pos_slide, np.array(heads)[pos], rotation=40)
    ax.set_ylabel('Standard Deviation')
    ax.set_xticks([p + 1.5 * width for p in pos_slide])
#     handles, labels = ax.get_legend_handles_labels()
#     ax.legend(handles, labels, loc='upper left')
    plt.show()
    
    
for _f in heads_print[0]:
    display_graph(_f)


For concerts


In [328]:
flat_std = [s for name in ['pp_concerts','itema3_concerts'] for s in std[name]]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)

print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))
print('Standard Deviation: maximum' )
print(np.array(smax))


Standard Deviation: mean
[7.56e-08 1.69e-08 4.53e-08 2.50e-07 2.46e-07 7.11e-08 1.27e-08 1.73e-08
 8.84e-09 5.24e-09 2.42e-08 5.97e-09 4.89e-08 1.39e-08 2.34e-07 3.16e-07
 2.67e-07]
Standard Deviation: minimum
[0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.49e-18 1.58e-17 2.49e-16
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.01e-12 9.83e-14
 1.87e-14]
Standard Deviation: maximum
[2.46e-07 8.29e-08 2.86e-07 1.17e-05 1.17e-05 6.58e-07 2.30e-07 2.16e-07
 2.30e-07 7.32e-07 7.96e-07 7.02e-08 3.29e-07 2.40e-07 3.29e-06 4.17e-06
 2.74e-06]

For playlists


In [329]:
flat_std = [s for name in ['web-radio','spotify_pl'] for s in std[name]]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)


print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))
print('Standard Deviation: maximum' )
print(np.array(smax))


Standard Deviation: mean
[7.19e-08 1.37e-08 4.08e-08 3.20e-07 3.62e-07 4.15e-08 1.56e-08 2.81e-08
 1.77e-09 2.54e-09 3.61e-09 4.16e-09 2.60e-08 1.59e-08 1.91e-07 2.52e-07
 2.56e-07]
Standard Deviation: minimum
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Standard Deviation: maximum
[5.17e-07 1.32e-07 1.69e-07 9.69e-06 9.73e-06 9.46e-07 2.22e-07 3.84e-07
 5.79e-07 1.65e-06 7.90e-07 1.02e-07 3.63e-07 2.29e-07 4.10e-06 5.30e-06
 5.08e-06]

In [ ]:

Computing gaps


In [330]:
def get_std_gap(_chosen):
    # ratio between/within; the commented variant is a normalized difference
#     return (stdb[_chosen] - stdw[_chosen]) / ((stdw[_chosen] + stdb[_chosen]) / 2 )
    return stdb[_chosen] / stdw[_chosen]

def display_distances(_chosen):
    better = (stdw[_chosen] <= stdb[_chosen]).tolist()
    distance = get_std_gap(_chosen)

    pd.set_option('precision', 3)
    return pd.DataFrame([better, distance.tolist(), stdw[_chosen].tolist(), stdb[_chosen].tolist()],
                        index=['homogeneous', 'ratio', 'std within', 'std between'], columns=heads)

In [331]:
display_distances('spotify_pl')


Out[331]:
             mop       mop       mop       birth_date  death_date  casting   casting   casting   function  function  function  genre     genre     genre     key       key       key
homogeneous  True      True      True      True        True        True      True      True      False     False     False     True      True      True      True      True      True
ratio        1.53      1.58      1.59      3.9         3.63        1.45      1.42      1.35      0.739     0.748     0.755     1.36      1.63      1.37      1.25      1.09      1.17
std within   3.11e-05  6.14e-06  1.81e-05  0.000964    0.000832    0.000109  4.54e-05  7.99e-05  6.86e-06  6.94e-06  1.06e-05  1.09e-05  6.85e-05  4.38e-05  0.000316  0.000446  0.000454
std between  4.77e-05  9.67e-06  2.87e-05  0.00376     0.00302     0.000158  6.46e-05  0.000108  5.07e-06  5.19e-06  8.01e-06  1.48e-05  0.000112  6.02e-05  0.000395  0.000488  0.00053

In [332]:
display_distances('web-radio')


Out[332]:
             mop       mop       mop       birth_date  death_date  casting   casting   casting   function  function  function  genre     genre     genre     key       key       key
homogeneous  True      True      True      True        True        False     True      True      False     False     False     True      True      True      False     True      True
ratio        1.25      1.3       1.4       2.9         2.09        0.958     1.33      1.31      0.816     0.735     0.787     1.47      1.05      1.15      1         1.16      1.05
std within   4.41e-05  8.22e-06  2.45e-05  0.000729    0.000711    0.000106  3.55e-05  6.59e-05  2.58e-06  6.57e-06  8.61e-06  1.1e-05   6.84e-05  3.99e-05  0.000309  0.000383  0.000386
std between  5.51e-05  1.07e-05  3.44e-05  0.00212     0.00149     0.000101  4.74e-05  8.61e-05  2.11e-06  4.83e-06  6.77e-06  1.61e-05  7.17e-05  4.58e-05  0.000309  0.000445  0.000404

In [333]:
display_distances('pp_concerts')


Out[333]:
             mop       mop       mop       birth_date  death_date  casting   casting   casting   function  function  function  genre     genre     genre     key       key       key
homogeneous  True      True      True      True        True        True      True      True      True      False     True      True      True      True      True      True      True
ratio        1.34      1.71      1.74      8.66        7.04        1.95      1.76      1.97      1.2       0.816     1.17      2.01      1.31      2.31      1.55      1.97      2
std within   3.98e-06  1.06e-06  2.62e-06  5.98e-05    5.17e-05    2.14e-05  3.36e-06  4.09e-06  1.64e-06  4.13e-06  6.93e-06  1.01e-06  7.81e-06  3.64e-06  2.29e-05  4.14e-05  3.05e-05
std between  5.34e-06  1.81e-06  4.55e-06  0.000518    0.000363    4.18e-05  5.9e-06   8.04e-06  1.97e-06  3.37e-06  8.11e-06  2.02e-06  1.03e-05  8.43e-06  3.54e-05  8.15e-05  6.09e-05

In [334]:
display_distances('itema3_concerts')


Out[334]:
             mop       mop       mop       birth_date  death_date  casting   casting   casting   function  function  function  genre     genre     genre     key       key       key
homogeneous  True      True      True      True        True        True      True      True      True      False     True      True      True      True      True      True      True
ratio        1.29      1.26      1.32      5.65        6.07        1.35      2.29      2.73      1.22      0.826     1.04      4.15      4.65      2.21      1.25      1.5       1.69
std within   9.92e-06  2.05e-06  5.71e-06  0.000241    0.000197    3.46e-05  6.68e-06  9.57e-06  1.12e-05  3.51e-06  2.84e-05  6.64e-06  5.48e-05  1.41e-05  9.33e-05  0.000116  0.000102
std between  1.28e-05  2.58e-06  7.55e-06  0.00136     0.00119     4.66e-05  1.53e-05  2.62e-05  1.37e-05  2.9e-06   2.96e-05  2.76e-05  0.000255  3.12e-05  0.000116  0.000173  0.000173

Two different tendencies emerge between concerts and playlists.

In concerts, all the dimensions (where present) are more homogeneous. This is true in particular for the casting (which, on the contrary, should not be used for playlists), and this is reasonable.

The 1st dimension of the composer is also one to take into account. In Itema3 this is probably not visible because of the poor interlinking of artists.

For the keys, the values are less relevant, and whether they stay positive is not stable across datasets.

Always positive (some of them largely so; a sketch for deriving these sets follows the lists):

  • composer[0]
  • composer[1,2]
  • genre[2]
  • composition_date
  • key[2]
  • casting[2]

Positive only in concerts:

  • casting[all]
  • genre[all]
  • composition_date
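
These sets can also be derived from the data rather than by eye. A minimal sketch, assuming the stdw/stdb dictionaries computed above and reading "positive" as a between/within ratio above 1 (tightening the threshold would isolate the largely positive dimensions):

In [ ]:
ratios = {name: get_std_gap(name) for name in stdw}
always = np.all([ratios[n] > 1 for n in ratios], axis=0)
concerts = np.all([ratios[n] > 1 for n in ['pp_concerts', 'itema3_concerts']], axis=0)
print('always positive: ', [h for h, v in zip(heads, always) if v])
print('only in concerts:', [h for h, v in zip(heads, concerts & ~always) if v])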

Tuning the recommender system


In [335]:
def compute_weights(threshold=1.4, fallback=.6, datasets=['spotify_pl']):
    # use the between/within ratio as weight where it exceeds the
    # threshold, the fallback weight everywhere else
    dist = [get_std_gap(_chosen) for _chosen in datasets]
    dist = np.ma.mean(dist, axis=0).filled()

    return np.where(dist > threshold, dist, fallback)

w = compute_weights()
pd.DataFrame([w], columns=heads)


Out[335]:
   mop    mop    mop    birth_date  death_date  casting  casting  casting  function  function  function  genre  genre  genre  key  key  key
0  1.535  1.576  1.588  3.899       3.631       1.45     1.422    0.6      0.6       0.6       0.6       0.6    1.629  0.6    0.6  0.6  0.6

In [336]:
def get_pool_from_datasets(datasets=['web-radio','spotify_pl']):
    all_song = []
    for t in all_training:
        if t['name'] in datasets:
            for pl in t['playlists']:
                missing = [art not in uris for art in pl['data']]
                pl['data'] = np.delete(pl['data'], np.where(missing))
                all_song = np.concatenate([all_song, pl['data']])
    all_song = np.unique(all_song)
    print('Pool size: %d' % len(all_song))
    all_song_vec = np.ma.array([get_embs(xi, masked=True) for xi in all_song])
#     all_song_vec = np.ma.array(all_song_vec, mask=all_song_vec < -1.)
    all_song_labels = np.array([get_label(xi) for xi in all_song])
    return all_song, all_song_vec, all_song_labels

In [337]:
def computeSimilarity(seed, target, w):
    # positions masked (missing) in either vector
    b1 = np.where(seed.mask)[0]
    b2 = np.where(target.mask)[0]
    bad_pos = np.unique(np.concatenate([b1, b2]))

    _seed = np.delete(seed, bad_pos, axis=0)
    _target = np.delete(target, bad_pos, axis=0)
    _w = np.delete(w, bad_pos, axis=0)

    if len(_seed) == 0:
        return 0

    # weighted euclidean distance on the shared dimensions
    d = weightedL2(_seed, _target, _w)

    # penalty: fraction of the seed's dimensions missing in the target
    penalty = len([x for x in b2 if x not in b1]) / len(seed)

    # similarity score in [0, 1], discounted by the penalty
    s = (max_distance - d) / max_distance
    return s * (1 - penalty)


def weightedL2(a, b, w=1):
#     return distance.cosine(a,b)
    # weighted L2: sqrt(sum_i w_i * (a_i - b_i)^2)
    # https://stackoverflow.com/a/8861999/1218213
    q = a - b
    return np.sqrt((w * q * q).sum())
#     return (w * q * q).sum()


_ones = np.ones(vectors[0].shape)
max_distance = weightedL2(_ones, -_ones, _ones)
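
A toy check of the scoring on hand-made 4-dimensional vectors (values are assumptions, not data): the target misses one dimension that the seed has, so the score is the normalized weighted-L2 similarity discounted by a 1/4 penalty. Overwriting max_distance here is harmless, since find() recomputes it on every call.

In [ ]:
_w4 = np.ones(4)
max_distance = weightedL2(np.ones(4), -np.ones(4), _w4)  # worst case for 4 dims: 4.0
_seed = np.ma.array([.1, .2, .3, .4], mask=[False, False, False, False])
_target = np.ma.array([.1, .2, -2., .4], mask=[False, False, True, False])
print(computeSimilarity(_seed, _target, _w4))  # 1.0 * (1 - 1/4) = 0.75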

In [338]:
def find(seed, n=4, w=None, _print=True, pool=get_pool_from_datasets()):
    global max_distance

    _uris = pool[0]
    _vectors = pool[1]
    _lbs = pool[2]

    _seed = seed

    if w is None:
        # flat weighting
        w = np.ones(len(_seed))
        w = w / w.sum()
    else:
        w = np.array(w)

    # worst case: every dimension at the opposite extreme
    max_distance = weightedL2(np.ones(len(_seed)), np.ones(len(_seed)) * -1, w)

    if _print: print('computing scores')
    scores = np.array([[computeSimilarity(_seed, x.astype(float), w) for x in _vectors]])
    full = np.concatenate([_uris.reshape(len(_uris), 1), scores.transpose(), _lbs.reshape(len(_uris), 1)], axis=1)

    # note: the seed is not removed, so it comes back as its own best match

    # sort by score, descending
    full_sorted = sorted(full, key=lambda _x: float(_x[1]), reverse=True)
    most_similar = full_sorted[:n]
    if _print: print('\n'.join('%s %s\n%s' % (f[0], f[1], f[2]) for f in most_similar))

    return [{'uri': _a[0], 'score': float(_a[1])} for _a in most_similar]


Pool size: 1396

In [360]:
find(get_embs('http://data.doremus.org/artist/b34f92ab-ad86-361b-a8b8-5c3a4db784d0', masked=True))


computing scores
http://data.doremus.org/artist/b34f92ab-ad86-361b-a8b8-5c3a4db784d0 1.0
Antonio Vivaldi
http://data.doremus.org/artist/a82b0c56-ccdf-31b6-a8b4-ad065f3405e5 0.9964103762560376
Tomaso Albinoni
http://data.doremus.org/artist/aa9564af-3ccd-36dc-8112-c6f0a67dfdbe 0.9951215281279058
Francesco Geminiani
http://data.doremus.org/artist/d07ec69a-da7c-36ab-a098-44c0cdc9741d 0.9946926919629068
Alessandro Scarlatti
Out[360]:
[{'score': 1.0,
  'uri': 'http://data.doremus.org/artist/b34f92ab-ad86-361b-a8b8-5c3a4db784d0'},
 {'score': 0.9964103762560376,
  'uri': 'http://data.doremus.org/artist/a82b0c56-ccdf-31b6-a8b4-ad065f3405e5'},
 {'score': 0.9951215281279058,
  'uri': 'http://data.doremus.org/artist/aa9564af-3ccd-36dc-8112-c6f0a67dfdbe'},
 {'score': 0.9946926919629068,
  'uri': 'http://data.doremus.org/artist/d07ec69a-da7c-36ab-a098-44c0cdc9741d'}]

In [361]:
find(get_embs('http://data.doremus.org/artist/b82c0771-5280-39af-ad2e-8ace2f4ebda3', masked=True))


computing scores
http://data.doremus.org/artist/b82c0771-5280-39af-ad2e-8ace2f4ebda3 1.0
Giuseppe Verdi
http://data.doremus.org/artist/e432aa13-207f-34f8-9792-d1d15fcd4711 0.9911241891983096
Georges Bizet
http://data.doremus.org/artist/08499468-c1b5-35d2-958f-134bc11c9c7a 0.9909344312170096
Ruggero Leoncavallo
http://data.doremus.org/artist/95bb7ab4-648d-3d95-beae-4dfc17530aec 0.9904420982664395
Benjamin Godard
Out[361]:
[{'score': 1.0,
  'uri': 'http://data.doremus.org/artist/b82c0771-5280-39af-ad2e-8ace2f4ebda3'},
 {'score': 0.9911241891983096,
  'uri': 'http://data.doremus.org/artist/e432aa13-207f-34f8-9792-d1d15fcd4711'},
 {'score': 0.9909344312170096,
  'uri': 'http://data.doremus.org/artist/08499468-c1b5-35d2-958f-134bc11c9c7a'},
 {'score': 0.9904420982664395,
  'uri': 'http://data.doremus.org/artist/95bb7ab4-648d-3d95-beae-4dfc17530aec'}]

In [362]:
find(get_embs('http://data.doremus.org/artist/03954109-0253-35d6-a70e-89ab27dea09c', masked=True))


computing scores
http://data.doremus.org/artist/03954109-0253-35d6-a70e-89ab27dea09c 1.0
Franz Schubert
http://data.doremus.org/artist/4802a043-23bb-3b8d-a443-4a3bd22ccc63 0.9873076232261836
Wolfgang Amadeus Mozart
http://data.doremus.org/artist/00f6e16d-7cd8-3d52-ad65-5f5e85e42123 0.9870996483784845
Frédéric Chopin
http://data.doremus.org/artist/6963af5e-b126-3d40-a84b-97e0b78f5452 0.9870906915807572
Ludwig van Beethoven
Out[362]:
[{'score': 1.0,
  'uri': 'http://data.doremus.org/artist/03954109-0253-35d6-a70e-89ab27dea09c'},
 {'score': 0.9873076232261836,
  'uri': 'http://data.doremus.org/artist/4802a043-23bb-3b8d-a443-4a3bd22ccc63'},
 {'score': 0.9870996483784845,
  'uri': 'http://data.doremus.org/artist/00f6e16d-7cd8-3d52-ad65-5f5e85e42123'},
 {'score': 0.9870906915807572,
  'uri': 'http://data.doremus.org/artist/6963af5e-b126-3d40-a84b-97e0b78f5452'}]

In [370]:
find(get_embs('http://data.doremus.org/artist/bcf39e82-e208-3049-b550-1feaae6071a6', masked=True))


computing scores
http://data.doremus.org/artist/bcf39e82-e208-3049-b550-1feaae6071a6 1.0
Claude Debussy
http://data.doremus.org/artist/f9a2ac39-a62d-3be2-8abb-e564de0ec96d 0.9931615991322305
Johannes Brahms
http://data.doremus.org/artist/157157ef-1cb3-3fc7-a4f7-2531989fc942 0.9909424255922206
Richard Strauss
http://data.doremus.org/artist/72b3b303-5c15-319f-9c31-dc73b5b0fe74 0.9908471438205945
Jean Sibelius
Out[370]:
[{'score': 1.0,
  'uri': 'http://data.doremus.org/artist/bcf39e82-e208-3049-b550-1feaae6071a6'},
 {'score': 0.9931615991322305,
  'uri': 'http://data.doremus.org/artist/f9a2ac39-a62d-3be2-8abb-e564de0ec96d'},
 {'score': 0.9909424255922206,
  'uri': 'http://data.doremus.org/artist/157157ef-1cb3-3fc7-a4f7-2531989fc942'},
 {'score': 0.9908471438205945,
  'uri': 'http://data.doremus.org/artist/72b3b303-5c15-319f-9c31-dc73b5b0fe74'}]

In [339]:
def recommend_compare(playlist, w, pos=-1, num_candidates=[100, 200, 500],
                      verbose=True, pool=get_pool_from_datasets(), overture=True):
    pl_data = playlist['data']
    pl_population = len(pl_data)
    if verbose: print('%d items | %s' % (pl_population, playlist['name'].split('/')[-1]))
    
    _replcm = 'overture.' if overture else 'data.'
    
    if pos < 0:
        pos = random.randrange(pl_population)
   
    chosen = pl_data[pos]
    targets = pl_data
#     [max(pos-7,0):min(pos+7, len(pl_data))]
    targets_pop = len(targets)
#     print(max(pos-5,0))
#     print(min(pos+5, len(pl_data)))
#     print(targets_pop)

    if verbose: 
        print('seed: %d) %s' % (pos, get_label(chosen)))
        print('\t '+ chosen.replace('data.', _replcm))

    
    first = get_embs(chosen, masked=True)

    candidates = find(first, n=np.max(num_candidates), _print=False, pool=pool )
    candidates_2 = find(first, n=np.max(num_candidates), _print=False, w = w, pool=pool )
    
    results = np.zeros((len(num_candidates), 3))
    
    for qi, q in enumerate(num_candidates):
        trues_flat = 0
        for index, c in enumerate(candidates[:q]):
            if c['uri'] == chosen: continue
            if c['uri'] in targets: trues_flat+=1
#             if verbose:
#                 _sig = '  X  ' if c['uri'] in targets else '   '
#                 print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
#                 print('\t\t\t\t'+ c['uri'].replace('data.', 'overture.'))
        #     display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))


        trues = 0
        for index, c in enumerate(candidates_2[:q]):
            if c['uri'] == chosen: continue
            if c['uri'] in targets: trues+=1
#             if verbose:
#                 _sig = '  X  ' if c['uri'] in pl_data else '   '
#                 print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
#                 print('\t\t\t\t'+ c['uri'].replace('data.', 'overture.'))
#                 # display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))

        if verbose: print('%d | flat %d | weighted %d | diff %d' % (q, trues_flat, trues, trues - trues_flat))
        # each row: [weighted recall, flat recall, difference] at q candidates
        results[qi] = [trues / targets_pop, trues_flat / targets_pop, (trues - trues_flat) / targets_pop]
    return results


Pool size: 1396

In [340]:
# w = compute_weights(threshold=1.2, fallback=1., datasets=['spotify_pl'])
pl = all_training[3]['playlists'][12]
recommend_compare(pl, w)


92 items | 37i9dQZF1DWWEJlAGA9gs0.Classical Essentials.artist.txt
seed: 88) Giacomo Puccini
	 http://overture.doremus.org/artist/5bcd515f-354f-30f8-ae0a-c42b65536d9f
100 | flat 17 | weighted 21 | diff 4
200 | flat 37 | weighted 38 | diff 1
500 | flat 55 | weighted 54 | diff -1
Out[340]:
array([[ 0.23,  0.18,  0.04],
       [ 0.41,  0.4 ,  0.01],
       [ 0.59,  0.6 , -0.01]])

Computation on all playlists


In [341]:
out_path = './out-artists'

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return file_path

from tqdm import tqdm_notebook as tqdm

In [342]:
def test_recommendation(pool, playlist, pos=-1, mode='random', w=None, name='', verbose=0, overture=False, write=False):
    pl_data = playlist['data']
    pl_population = len(pl_data)
    pl_name = playlist['name'].split('/')[-1].replace('.json', '')
    
    _replcm = 'overture.' if overture else 'data.'

    if pos < 0:
        pos = random.randrange(pl_population)
    chosen = pl_data[pos]
    pl_data = np.delete(pl_data, pos, axis=0)

    if verbose > 0: 
        print('%d items | %s' % (pl_population, pl_name))
        print('seed: %d) %s' % (pos, get_label(chosen)))
        print('\t '+ chosen.replace('data.', _replcm))

    
    first = get_embs(chosen, masked=True)
    num_candidates=[100, 200, 500]
    max_candidates = np.max(num_candidates)

    if mode == 'flat':
        candidates = find(first, n=max_candidates, _print=False, pool=pool)
    elif mode == 'weighted':
        candidates = find(first, n=max_candidates, _print=False, w = w, pool=pool)
    else:  # random baseline
        candidates = list(map(lambda x: {'uri': x, 'score': 0}, random.sample(pool[0].tolist(), 500)))

    results = np.zeros(len(num_candidates))
    candidates_uri = list(map(lambda x: x['uri'], candidates))

    for qi, q in enumerate(num_candidates):
        trues = len(set(candidates_uri[:q]).intersection(set(pl_data)))
        
        if verbose > 0:
            print('%d | positive %d | population %d' % (q, trues, pl_population))
        results[qi] = trues

    if verbose > 1:
        for index, c in enumerate(candidates[:max_candidates]):
            _sig = '  X  ' if c['uri'] in pl_data else '   '
            print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
            print('\t\t\t\t'+ c['uri'].replace('data.', _replcm))
            # display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))

    if write:
        to_write = [';'.join([
                    str(index),
                    str(c['score']),
                    '1' if c['uri'] in pl_data else '0',
                    get_label(c['uri']),
                    re.sub(r'[;\n"]', ' ', c['uri'])  # str.replace would treat this pattern literally
                   ])
            for index, c in enumerate(candidates[:max_candidates])]

        filename = pl_name + '.' + str(pos) + '.csv'
        with open(ensure_dir(os.path.join(out_path, 'detail', name, filename)), 'w') as file:
            file.write('index;score;predicted;label;uri\n')
            file.write('\n'.join(to_write))

    return results

In [348]:
def run_for_dataset(id_dataset, pool, mode='random', w=None, name=''):
    with open(ensure_dir(os.path.join(out_path, 'summary', name + '.csv')), 'w') as file:
        file.write('index;playlist;population;predicted100;predicted200;predicted500\n')
        testset = all_training[id_dataset]['playlists']
        
        pbar = tqdm(total=len(testset))

        for index, pl in enumerate(testset):
            population = len(pl['data'])
            pl_name = pl['name'].split('/')[-1].replace('.json', '').replace('"','')

            results = [test_recommendation(pool=pool, playlist=pl, pos=pos,  
                                    mode=mode, w=w, write=False, name=name)
                       for pos, work in enumerate(pl['data'])]
            results = np.mean(results, axis=0)

            pbar.update(1)
            # np.mean of an empty list returns a scalar nan: skip those playlists
            if not isinstance(results, np.ndarray):
                continue

            file.write(';'.join([str(index), pl_name, str(population),
                                 str(results[0]), str(results[1]), str(results[2])]))
            file.write('\n')
    pbar.close()

In [344]:
# test_recommendation(pp_pool, playlist=all_training[0]['playlists'][4], mode='weighted', name='pp.w5-06', w=_wpp, verbose=2 )

In [345]:
itema3_pool = get_pool_from_datasets(['itema3_concerts'])

In [346]:
run_for_dataset(1, itema3_pool, mode='random', name='itema3.rand')


/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
<class 'numpy.float64'>
<class 'numpy.float64'>

In [347]:
run_for_dataset(1, itema3_pool, mode='flat', name='itema3.flat')


/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
<class 'numpy.float64'>
<class 'numpy.float64'>

In [350]:
_wi3 = compute_weights(threshold=1.4, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w14-06', w= _wi3)


/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

In [ ]:
_wi3 = compute_weights(threshold=8, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w8-06', w= _wi3)

In [ ]:
_wi3 = compute_weights(threshold=10, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w10-06', w= _wi3)

In [ ]:
_wi3 = compute_weights(threshold=8, fallback=1., datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w8-1', w= _wi3)

In [ ]:


In [ ]:


In [ ]:


In [ ]:
_wi3 = compute_weights(threshold=5, datasets=['itema3_concerts', 'pp_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.wp5-06', w= _wi3)

In [353]:
pp_pool = get_pool_from_datasets(['pp_concerts'])


Pool size: 715

In [371]:
run_for_dataset(0, pp_pool, mode='random', name='pp.rand')



In [372]:
run_for_dataset(0, pp_pool, mode='flat', name='pp.flat')



In [373]:
_wpp = compute_weights(threshold=2, datasets=['pp_concerts'])
pd.DataFrame([_wpp], columns=heads)


Out[373]:
   mop  mop  mop  birth_date  death_date  casting  casting  casting  function  function  function  genre  genre  genre  key  key  key
0  0.6  0.6  0.6  8.657       7.036       0.6      0.6      0.6      0.6       0.6       0.6       2.009  0.6    2.315  0.6  0.6  0.6

In [374]:
run_for_dataset(0, pp_pool, mode='weighted', name='pp.w2-06', w=_wpp)



In [376]:
spo_pool = get_pool_from_datasets(['spotify_pl'])


Pool size: 1012

In [377]:
run_for_dataset(3, spo_pool, mode='random', name='spotify.rand')



In [378]:
run_for_dataset(3, spo_pool, mode='flat', name='spotify.flat')



In [379]:
_wspo = compute_weights(threshold=1.2, datasets=['spotify_pl'])
run_for_dataset(3, spo_pool, mode='weighted', name='spotify.w12-06', w=_wspo)



In [399]:
_wspo = compute_weights(threshold=1.4, datasets=['spotify_pl'])
run_for_dataset(3, spo_pool, mode='weighted', name='spotify.w14-06', w=_wspo)



In [400]:
_wspo = compute_weights(threshold=1.5, datasets=['spotify_pl'])
run_for_dataset(3, spo_pool, mode='weighted', name='spotify.w15-06', w=_wspo)



In [380]:
radio_pool = get_pool_from_datasets(['web-radio'])


Pool size: 738

In [381]:
run_for_dataset(2, radio_pool, mode='random', name='web-radio.rand')



In [382]:
run_for_dataset(2, radio_pool, mode='flat', name='web-radio.flat')



In [383]:
_wradio = compute_weights(threshold=1.4, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w14-06', w=_wradio)



In [395]:
_wradio = compute_weights(threshold=1.33, datasets=['web-radio'])
pd.DataFrame([_wradio], columns=heads)


Out[395]:
   mop  mop  mop    birth_date  death_date  casting  casting  casting  function  function  function  genre  genre  genre  key  key  key
0  0.6  0.6  1.401  2.903       2.093       0.6      1.335    0.6      0.6       0.6       0.6       1.465  0.6    0.6    0.6  0.6  0.6

In [396]:
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w133-06', w=_wradio)



In [397]:
_wradio = compute_weights(threshold=1.45, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w145-06', w=_wradio)



In [404]:
_wradio = compute_weights(threshold=1.3, fallback=.8, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w13-07', w=_wradio)



In [405]:
_wradio = compute_weights(threshold=1.2, fallback=.8, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w12-07', w=_wradio)



In [407]:
_wradio = compute_weights(threshold=1.1, fallback=.9, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w11-09', w=_wradio)



In [408]:
summary_path = os.path.join(out_path, 'summary')
columns = ['name', 'r100', 'r200', 'r500']
summary = pd.DataFrame(columns=columns)

for index, filename in enumerate(sorted(os.listdir(summary_path))):
    table = pd.read_csv(os.path.join(summary_path,filename), sep=';')
    table['r100'] = table['predicted100'] / table['population']
    table['r200'] = table['predicted200'] / table['population']
    table['r500'] = table['predicted500'] / table['population']
    r100 = table['r100'].mean()
    r200 = table['r200'].mean()
    r500 = table['r500'].mean()
    summary.loc[index] = [filename, r100, r200, r500]
summary


Out[408]:
    name                   r100   r200   r500
0   itema3.flat.csv        0.294  0.382  0.523
1   itema3.rand.csv        0.064  0.131  0.325
2   itema3.w14-06.csv      0.298  0.384  0.531
3   pp.flat.csv            0.256  0.351  0.523
4   pp.rand.csv            0.091  0.172  0.413
5   pp.w2-06.csv           0.261  0.355  0.527
6   spotify.flat.csv       0.236  0.365  0.589
7   spotify.rand.csv       0.085  0.170  0.423
8   spotify.w12-06.csv     0.234  0.367  0.594
9   spotify.w14-06.csv     0.234  0.366  0.594
10  spotify.w15-06.csv     0.234  0.366  0.596
11  web-radio.flat.csv     0.326  0.497  0.700
12  web-radio.rand.csv     0.110  0.220  0.549
13  web-radio.w11-09.csv   0.321  0.493  0.709
14  web-radio.w12-06.csv   0.321  0.491  0.710
15  web-radio.w12-07.csv   0.323  0.492  0.710
16  web-radio.w13-06.csv   0.321  0.491  0.711
17  web-radio.w13-07.csv   0.323  0.492  0.710
18  web-radio.w133-06.csv  0.320  0.492  0.711
19  web-radio.w14-06.csv   0.320  0.491  0.712
20  web-radio.w145-06.csv  0.320  0.491  0.712

In [ ]: