In [5]:
import math
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from scipy import spatial
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
plt.rcParams["figure.figsize"] = (20,5)
import doremus_data
Load data.
In [6]:
training_data_folder = '/Users/pasquale/git/recommender/training_data'
emb_folder = '/Users/pasquale/git/music-embeddings'
doremus_data.init(training_data_folder, emb_folder)
vectors, uris, lbs, heads, heads_print = doremus_data.get_embeddings('expression')
pd.DataFrame(heads_print)
Out[6]:
In [7]:
all_training = doremus_data.all_training('expression')
def training_stats(t):
num_playlists = len(t['playlists'])
num_track = [len(p['data']) for p in t['playlists']]
distinct = len(np.unique(np.concatenate([p['data'] for p in t['playlists']])))
return num_playlists, np.sum(num_track), np.mean(num_track), distinct
names = [t['name'] for t in all_training]
# for t in all_training:
# num_playlists, num_track = training_stats(t)
# print('%s\t\t%d' % (t['name'], num_playlists))
pd.DataFrame([training_stats(t) for t in all_training], index=names, columns='playlists,tracks,tracks per pl,distinct tracks'.split(','))
Out[7]:
In [8]:
_l = 5
for t in all_training:
temp_playlists = []
for pl in t['playlists']:
for i in np.arange(len(pl['data']) - _l):
temp_playlists.append(pl['data'][i:i+_l])
t['groups'] = [{'name':str(index), 'data': pl} for index, pl in enumerate(temp_playlists)]
print('%s\t\t%d' % (t['name'], len(temp_playlists)))
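A quick illustration of the windowing above (a toy sketch, reusing `_l` and numpy from the cells above): a playlist of n tracks yields n - _l windows, so the last possible window is skipped and playlists with _l or fewer tracks produce none.
In [ ]:
# toy illustration only: 7 pseudo-tracks, _l = 5
toy = ['t1', 't2', 't3', 't4', 't5', 't6', 't7']
windows = [toy[i:i + _l] for i in np.arange(len(toy) - _l)]
print(len(windows))  # 2, not 3: the window starting at 't3' is never produced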
Data pre-processing
In [9]:
negVector = -2. * np.ones_like(vectors[0], dtype=np.float32)
def get_embs(x, masked=False):
# uri to embedding
v = vectors[np.argwhere(uris == x)]
if v.size == 0:
print(x)
result = np.array(negVector)
else:
result = np.array(v[0][0])
if masked:
result = np.ma.array(result, mask=result < -1.)
return result
def get_label(x):
l = lbs[np.argwhere(uris == x)]
return l[0][0] if l.size > 0 else 'none'
In [10]:
np.set_printoptions(2)
def compute_playlist_stats(playlist, to_print=False):
pl = playlist['data']
embeddings = np.array([get_embs(xi) for xi in pl])
emb_len = len(embeddings[0])
ma_embeddings = np.ma.array(embeddings, mask=embeddings < -1.)
    # mask dimensions with at most one valid value: a single observation says nothing about spread
mul_values = np.where(np.sum(embeddings >= -1., axis=0) > 1, False, True)
mul_values = np.repeat([mul_values], len(pl), axis=0)
ma_embeddings = np.ma.array(ma_embeddings, mask=mul_values)
_mean = ma_embeddings.mean(axis=0)
_median = np.ma.median(ma_embeddings, axis=0)
_min = np.ma.min(ma_embeddings, axis=0)
_max = np.ma.max(ma_embeddings, axis=0)
_std = np.ma.std(ma_embeddings, axis=0)
if to_print:
plt.errorbar(np.arange(len(_mean)), _mean, _std, fmt='ok',
ecolor='black', elinewidth=1.5, lw=3, capsize=2)
plt.errorbar(np.arange(len(_mean)), _mean, [_mean - _min, _max - _mean],
fmt='.k', ecolor='gray', elinewidth=0.5, lw=1, capsize=1)
plt.errorbar(np.arange(len(_mean)), _median, fmt='_g', lw=1)
plt.xticks(range(len(heads)), heads, rotation=40)
plt.show()
return _mean, _median, _std
Sample playlist
In [11]:
base = all_training[3]
print('Base: ' + base['name'])
pl = base['groups'][1023]
print('Playlist: ' + pl['name'])
m, md, s = compute_playlist_stats(pl, True)
for d in pl['data']:
print(d.replace('data', 'overture'))
print(get_label(d))
print(np.ma.array(get_embs(d), mask=get_embs(d) < -1.))
Standard Deviation among all playlists
In [12]:
population_tot = np.sum([len(pl['data']) for tr in all_training for pl in tr['groups'] ])
std = {}
population = {}
mean = {}
stdw = {} # std within
stdb = {} # std between
for index, tr in enumerate(all_training):
name = tr['name']
std[name] = []
population[name] = []
mean[name] = []
for index, pl in enumerate(tr['groups']):
_mean, _median, _std = compute_playlist_stats(pl, False)
pop = len(pl['data'])
population[name].append(pop)
mean[name].append(_mean)
ww = (pop - 1) / (population_tot - 1)
std[name].append((_std**2)*ww)
stdw[name] = np.ma.sum(std[name], axis=0).filled(0)
In [13]:
weighted_means = [np.ma.mean(mean[name], axis=0)*np.sum(population[name]) for name in mean]
mtot = np.ma.sum(weighted_means, axis=0)
mtot /= population_tot
In [35]:
fig, ax = plt.subplots(figsize=(20,5))
width = 0.2
pos = np.arange(len(vectors[0]))
colors = ['#3668C9', '#DA3B21', '#FD9827', '#1D9424']
for index, tr in enumerate(all_training):
name = tr['name']
ww = np.array([pop / (population_tot - 1) for pop in population[name]])
mg = np.ma.power(np.ma.array(mean[name]).filled(mtot) - mtot, 2)
stdb[name] = np.ma.sum(mg * ww.reshape(len(ww),1), axis=0)
plt.bar([p + index * width for p in pos],
stdb[name],
width,
alpha=0.5,
color=colors[index],
label=name + ' (between)')
plt.bar([p + index * width for p in pos],
stdw[name] ,
width,
edgecolor='#000000',
alpha=.5,
color=colors[index],
label=name + ' (within)')
flat_std = [stdw[name] for name in stdw]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)
# ax.plot(pos, mstd, '--', label='mean')
heads[-1] = 'date'
plt.xticks(range(len(heads)), heads, rotation=40)
ax.set_ylabel('Variance')
ax.set_xticks([p + 1.5 * width for p in pos])
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='upper left')
plt.show()
print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))
flat_std_b = [stdb[name] for name in stdb]
mstd_b = np.ma.mean(flat_std_b, axis=0)
print('Standard Deviation between: mean' )
print(np.array(mstd_b))
When the standard deviation within (black border) is smaller than the standard deviation between (no border), it means that for that dimension the values are more homogeneous inside the group than outside. If this difference in homogeneity inside/outside is large, I can state that this dimension drives the playlist generation.
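In formulas, with $N$ the total number of tracks, $n_g$ and $\bar{x}_g$ the size and mean of sub-playlist $g$, and $\bar{x}$ the grand mean, the quantities computed above are, per dimension:

$$\sigma^2_{within} = \sum_g \frac{n_g - 1}{N - 1}\,\sigma_g^2, \qquad \sigma^2_{between} = \sum_g \frac{n_g}{N - 1}\,(\bar{x}_g - \bar{x})^2, \qquad \bar{x} = \frac{1}{N}\sum_g n_g\,\bar{x}_g.$$

Despite the "Standard Deviation" printouts, these are variances, consistent with the y-axis label.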
In [15]:
def display_graph(feat):
pos = np.where(np.array(heads) == feat)[0]
pos_slide = np.arange(len(pos))
fig, ax = plt.subplots(figsize=(20,5))
for index, tr in enumerate(all_training):
name = tr['name']
plt.bar([p + index * width for p in pos_slide],
stdb[name][pos],
width,
alpha=0.5,
color=colors[index],
                label=name + ' (std between)')
plt.bar([p + index * width for p in pos_slide],
stdw[name][pos] ,
width,
alpha=0.5,
edgecolor='#000000',
color=colors[index],
                label=name + ' (std within)')
ax.plot(pos_slide, mstd[pos], '--', label='mean')
plt.xticks(pos_slide, np.array(heads)[pos], rotation=40)
ax.set_ylabel('Standard Deviation')
ax.set_xticks([p + 1.5 * width for p in pos_slide])
# handles, labels = ax.get_legend_handles_labels()
# ax.legend(handles, labels, loc='upper left')
plt.show()
for _f in heads_print[0]:
display_graph(_f)
For concerts
In [16]:
flat_std = [s for name in ['pp_concerts','itema3_concerts'] for s in std[name]]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)
print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))
print('Standard Deviation: maximum' )
print(np.array(smax))
For playlists
In [17]:
flat_std = [s for name in ['web-radio','spotify_pl'] for s in std[name]]
mstd = np.ma.mean(flat_std, axis=0)
sstd = np.ma.std(flat_std, axis=0)
smin = np.ma.min(flat_std, axis=0)
smax = np.ma.max(flat_std, axis=0)
print('Standard Deviation: mean' )
print(np.array(mstd))
print('Standard Deviation: minimum' )
print(np.array(smin))
print('Standard Deviation: maximum' )
print(np.array(smax))
In [18]:
def get_std_gap(_chosen):
# return (stdb[_chosen] - stdw[_chosen]) / ((stdw[_chosen] + stdb[_chosen]) / 2 )
return stdb[_chosen] / stdw[_chosen]
def display_distances(_chosen):
    better = (stdw[_chosen] <= stdb[_chosen]).tolist()
    distance = get_std_gap(_chosen)
    pd.set_option('precision', 3)
    return pd.DataFrame([better, distance.tolist(), stdw[_chosen].tolist(), stdb[_chosen].tolist()],
                        index=['homogeneous', 'between/within ratio', 'std within', 'std between'], columns=heads)
In [19]:
display_distances('spotify_pl')
Out[19]:
In [20]:
display_distances('web-radio')
Out[20]:
In [21]:
display_distances('pp_concerts')
Out[21]:
In [22]:
display_distances('itema3_concerts')
Out[22]:
Two different tendencies emerge between concerts and playlists.
In concerts, all the dimensions (where present) are more homogeneous. This is true in particular for the casting (which, conversely, is not to be used for playlists), and this is reasonable.
The first dimension of composer is also one to take into account. In Itema3 this is probably not visible because of the poor interlinking of artists.
For the keys, the values are less relevant and not consistently positive.
Always positive (the largely positive ones in bold):
Positive only in concerts:
In [23]:
def compute_weights(threshold=1.3, fallback=.6, datasets=['spotify_pl']):
dist = [get_std_gap(_chosen) for _chosen in datasets]
dist = np.ma.mean(dist, axis=0).filled()
return np.where(dist > threshold, dist, fallback)
w = compute_weights()
pd.DataFrame([w], columns=heads)
Out[23]:
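The weighting scheme above, written compactly: with $r_i = \sigma^2_{between,i} / \sigma^2_{within,i}$ averaged over the chosen datasets, the weight of dimension $i$ is

$$w_i = \begin{cases} r_i & \text{if } r_i > \tau \\ f & \text{otherwise,} \end{cases}$$

with threshold $\tau = 1.3$ and fallback $f = 0.6$ by default.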
In [24]:
def get_pool_from_datasets(datasets=['web-radio','spotify_pl']):
all_song = []
for t in all_training:
if t['name'] in datasets:
for pl in t['playlists']:
all_song = np.concatenate([all_song, pl['data']])
all_song = np.unique(all_song)
print('Pool size: %d' % len(all_song))
all_song_vec = np.ma.array([get_embs(xi, masked=True) for xi in all_song])
# all_song_vec = np.ma.array(all_song_vec, mask=all_song_vec < -1.)
all_song_labels = np.array([get_label(xi) for xi in all_song])
return all_song, all_song_vec, all_song_labels
In [25]:
def computeSimilarity(seed, target, w):
b1 = np.where(seed.mask==True)[0]
b2 = np.where(target.mask==True)[0]
bad_pos = np.unique(np.concatenate([b1, b2]))
_seed = np.delete(seed, bad_pos, axis=0)
_target = np.delete(target, bad_pos, axis=0)
_w = np.delete(w, bad_pos, axis=0)
if len(_seed) == 0:
return 0
# distance
d = weightedL2(_seed, _target, _w)
    # penalty: fraction of dimensions valid in the seed but missing in the target
    penalty = len([x for x in b2 if x not in b1]) / len(seed)
# score
s = (max_distance - d) / max_distance
return s * (1 - penalty)
def weightedL2(a, b, w=1):
# return distance.cosine(a,b)
# https://stackoverflow.com/a/8861999/1218213
q = a - b
return np.sqrt((w * q * q).sum())
# return (w * q * q).sum()
_ones = np.ones(vectors[0].shape)
max_distance = weightedL2(_ones,-_ones, _ones)
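A minimal sanity check of the score on two toy 4-dimensional masked vectors (illustrative values only; it assumes the cell above has been run). The distance $d = \sqrt{\sum_i w_i (a_i - b_i)^2}$ is computed on the dimensions valid in both vectors, and the penalty is the fraction of seed dimensions missing from the target.
In [ ]:
# toy example: dimension 1 is missing in the seed, dimension 2 in the target
toy_seed = np.ma.array([.5, -2., .1, .3], mask=[False, True, False, False])
toy_target = np.ma.array([.4, .2, -2., .3], mask=[False, False, True, False])
toy_w = np.ones(4) / 4
# rebind the global max_distance for 4 dimensions (find() recomputes it anyway)
max_distance = weightedL2(np.ones(4), -np.ones(4), toy_w)
print(computeSimilarity(toy_seed, toy_target, toy_w))
# d = sqrt(.25 * .1**2) = .05 over the 2 shared dimensions, penalty = 1/4
# score = (2 - .05) / 2 * (1 - .25) ≈ 0.731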
In [26]:
def find(seed, n=4, w=None, _print=True, pool=get_pool_from_datasets()):
    # note: the default pool is built only once, when this cell is executed
global max_distance
_uris = pool[0]
_vectors = pool[1]
_lbs = pool[2]
f_length = len(seed)
_seed = seed
if w is None:
w = np.ones(len(_seed))
w = w / w.sum()
else:
w = np.array(w)
# temp = [np.ones(f_length[k]) * w[k] for k in range(len(w))]
# w = np.array([item for sublist in temp for item in sublist])
max_distance = weightedL2(np.ones(len(_seed)), np.ones(len(_seed)) * -1, w)
    if _print: print('computing scores')
scores = np.array([[computeSimilarity(_seed, x.astype(float), w) for x in _vectors]])
full = np.concatenate([_uris.reshape(len(_uris), 1), scores.transpose(), _lbs.reshape(len(_uris), 1)], axis=1)
# remove the seed from the list
# full = np.delete(full, pos, 0)
# sort
full_sorted = sorted(full, key=lambda _x: float(_x[1]), reverse=True)
most_similar = full_sorted[:n]
    if _print: print('\n'.join('%s %s\n%s' % (f[0], f[1], f[2]) for f in most_similar))
return [{'uri': _a[0], 'score': float(_a[1])} for _a in most_similar]
In [33]:
find(get_embs('http://data.doremus.org/expression/edfbf89b-1464-32d5-b0e0-3b8db4d80ef1', masked=True))
Out[33]:
In [28]:
find(get_embs('http://data.doremus.org/expression/73e749e6-b727-3bfb-bcea-d895c86ec46c', masked=True))
Out[28]:
In [34]:
find(get_embs('http://data.doremus.org/expression/6dd59267-fef0-392d-911f-7abbe676e289', masked=True))
Out[34]:
In [46]:
def recommend_compare(playlist, w, pos=-1, num_candidates=[100, 200, 500],
verbose=True, pool=get_pool_from_datasets(), overture=True):
pl_data = playlist['data']
pl_population = len(pl_data)
if verbose: print('%d items | %s' % (pl_population, playlist['name'].split('/')[-1]))
_replcm = 'overture.' if overture else 'data.'
if pos < 0:
pos = random.randrange(pl_population)
chosen = pl_data[pos]
targets = pl_data
# [max(pos-7,0):min(pos+7, len(pl_data))]
targets_pop = len(targets)
# print(max(pos-5,0))
# print(min(pos+5, len(pl_data)))
# print(targets_pop)
if verbose:
print('seed: %d) %s' % (pos, get_label(chosen)))
print('\t '+ chosen.replace('data.', _replcm))
first = get_embs(chosen, masked=True)
candidates = find(first, n=np.max(num_candidates), _print=False, pool=pool )
candidates_2 = find(first, n=np.max(num_candidates), _print=False, w = w, pool=pool )
results = np.zeros((len(num_candidates), 3))
for qi, q in enumerate(num_candidates):
trues_flat = 0
for index, c in enumerate(candidates[:q]):
if c['uri'] == chosen: continue
if c['uri'] in targets: trues_flat+=1
# if verbose:
# _sig = ' X ' if c['uri'] in targets else ' '
# print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
# print('\t\t\t\t'+ c['uri'].replace('data.', 'overture.'))
# display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))
trues = 0
for index, c in enumerate(candidates_2[:q]):
if c['uri'] == chosen: continue
if c['uri'] in targets: trues+=1
# if verbose:
# _sig = ' X ' if c['uri'] in pl_data else ' '
# print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
# print('\t\t\t\t'+ c['uri'].replace('data.', 'overture.'))
# # display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))
if verbose: print('%d | flat %d | weighted %d | diff %d' % (q, trues_flat, trues, trues-trues_flat))
results[qi] = [trues / targets_pop, trues_flat / targets_pop, (trues-trues_flat) / targets_pop]
return results
In [47]:
pl = all_training[3]['playlists'][55]
recommend_compare(pl, w, 9)
Out[47]:
In [48]:
out_path = './out'
def ensure_dir(file_path):
directory = os.path.dirname(file_path)
if not os.path.exists(directory):
os.makedirs(directory)
return file_path
from tqdm import tqdm_notebook as tqdm
In [49]:
def test_recommendation(pool, playlist, pos=-1, mode='random', w=None, name='', verbose=0, overture=False, write=False):
pl_data = playlist['data']
pl_population = len(pl_data)
pl_name = playlist['name'].split('/')[-1].replace('.json', '')
_replcm = 'overture.' if overture else 'data.'
if pos < 0:
pos = random.randrange(pl_population)
chosen = pl_data[pos]
pl_data = np.delete(pl_data, pos, axis=0)
if verbose > 0:
print('%d items | %s' % (pl_population, pl_name))
print('seed: %d) %s' % (pos, get_label(chosen)))
print('\t '+ chosen.replace('data.', _replcm))
first = get_embs(chosen, masked=True)
num_candidates=[100, 200, 500]
max_candidates = np.max(num_candidates)
if mode == 'flat':
candidates = find(first, n=max_candidates, _print=False, pool=pool)
elif mode == 'weighted':
candidates = find(first, n=max_candidates, _print=False, w = w, pool=pool)
else : # random
        candidates = list(map(lambda x: {'uri': x, 'score': 0}, random.sample(pool[0].tolist(), max_candidates)))
results = np.zeros(len(num_candidates))
candidates_uri = list(map(lambda x: x['uri'], candidates))
for qi, q in enumerate(num_candidates):
trues = len(set(candidates_uri[:q]).intersection(set(pl_data)))
if verbose > 0:
print('%d | positive %d | population %d' % (q, trues, pl_population))
results[qi] = trues
if verbose > 1:
for index, c in enumerate(candidates[:max_candidates]):
_sig = ' X ' if c['uri'] in pl_data else ' '
print('%d \t %.5f'% (index, c['score']) + '\t' + _sig + '\t' + get_label(c['uri']))
print('\t\t\t\t'+ c['uri'].replace('data.', _replcm))
# display(pd.DataFrame(np.ma.array( candidates[c]).reshape(1, 13)))
if write:
to_write =[';'.join([
str(index),
str(c['score']),
'1' if c['uri'] in pl_data else '0',
get_label(c['uri']),
c['uri'].replace('[;\n"]', ' ')
])
for index, c in enumerate(candidates[:max_candidates])]
filename = pl_name + '.' + str(pos) + '.csv'
with open(ensure_dir(os.path.join(out_path, 'detail', name, filename)), 'w') as file:
file.write('index;score;predicted;label;uri\n')
file.write('\n'.join(to_write))
return results
In [50]:
def run_for_dataset(id_dataset, pool, mode='random', w=None, name=''):
with open(ensure_dir(os.path.join(out_path, 'summary', name + '.csv')), 'w') as file:
file.write('index;playlist;population;predicted100;predicted200;predicted500\n')
testset = all_training[id_dataset]['playlists']
pbar = tqdm(total=len(testset))
for index, pl in enumerate(testset):
population = len(pl['data'])
pl_name = pl['name'].split('/')[-1].replace('.json', '').replace('"','')
results = [test_recommendation(pool=pool, playlist=pl, pos=pos,
mode=mode, w=w, write=False, name=name)
for pos, work in enumerate(pl['data'])]
results = np.mean(results,axis=0)
file.write(';'.join([str(index), pl_name, str(population),
str(results[0]), str(results[1]), str(results[2])]))
file.write('\n')
pbar.update(1)
pbar.close()
In [423]:
# test_recommendation(pp_pool, playlist=all_training[0]['playlists'][4], mode='weighted', name='pp.w5-06', w=_wpp, verbose=2 )
In [363]:
itema3_pool = get_pool_from_datasets(['itema3_concerts'])  # needed by the runs below
In [369]:
run_for_dataset(1, itema3_pool, mode='random', name='itema3.rand')
In [371]:
run_for_dataset(1, itema3_pool, mode='flat', name='itema3.flat')
In [378]:
_wi3 = compute_weights(threshold=5, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w5-06', w= _wi3)
In [437]:
_wi3 = compute_weights(threshold=8, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w8-06', w= _wi3)
In [444]:
_wi3 = compute_weights(threshold=10, datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w10-06', w= _wi3)
In [445]:
_wi3 = compute_weights(threshold=8, fallback=1., datasets=['itema3_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.w8-1', w= _wi3)
In [ ]:
_wi3 = compute_weights(threshold=5, datasets=['itema3_concerts', 'pp_concerts'])
run_for_dataset(1, itema3_pool, mode='weighted', name='itema3.wp5-06', w= _wi3)
In [ ]:
pp_pool = get_pool_from_datasets(['pp_concerts'])
In [ ]:
run_for_dataset(0, pp_pool, mode='random', name='pp.rand')
In [ ]:
run_for_dataset(0, pp_pool, mode='flat', name='pp.flat')
In [ ]:
_wpp = compute_weights(threshold=5, datasets=['pp_concerts'])
# pd.DataFrame([_wpp], columns=heads)
In [ ]:
run_for_dataset(0, pp_pool, mode='weighted', name='pp.w5-06', w=_wpp)
In [ ]:
spo_pool = get_pool_from_datasets(['spotify_pl'])
In [ ]:
run_for_dataset(3, spo_pool, mode='random', name='spotify.rand')
In [ ]:
run_for_dataset(3, spo_pool, mode='flat', name='spotify.flat')
In [ ]:
_wspo = compute_weights(threshold=1.3, datasets=['spotify_pl'])
run_for_dataset(3, spo_pool, mode='weighted', name='spotify.w13-06', w=_wspo)
In [54]:
radio_pool = get_pool_from_datasets(['web-radio'])
In [ ]:
run_for_dataset(2, radio_pool, mode='random', name='web-radio.rand')
In [ ]:
run_for_dataset(2, radio_pool, mode='flat', name='web-radio.flat')
In [ ]:
_wradio = compute_weights(threshold=1.4, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w14-06', w=_wradio)
In [59]:
_wradio = compute_weights(threshold=1.4, datasets=['web-radio'])
_wradio[-1] *= 2
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.wd14-06', w=_wradio)
In [56]:
_wradio = compute_weights(threshold=1.5, datasets=['web-radio'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.w15-06', w=_wradio)
In [ ]:
_wradio = compute_weights(threshold=1.4, datasets=['web-radio', 'spotify_pl'])
run_for_dataset(2, radio_pool, mode='weighted', name='web-radio.ws14-06', w=_wradio)
In [61]:
summary_path = os.path.join(out_path, 'summary')
columns = ['name', 'r100', 'r200', 'r500']
summary = pd.DataFrame(columns=columns)
for index, filename in enumerate(sorted(os.listdir(summary_path))):
table = pd.read_csv(os.path.join(summary_path,filename), sep=';')
table['r100'] = table.apply(lambda row: row['predicted100']/row['population'], axis=1)
table['r200'] = table.apply(lambda row: row['predicted200']/row['population'], axis=1)
table['r500'] = table.apply(lambda row: row['predicted500']/row['population'], axis=1)
r100 = table['r100'].mean()
r200 = table['r200'].mean()
r500 = table['r500'].mean()
summary.loc[index] = [filename, r100, r200, r500]
summary
Out[61]:
In [ ]:
rc = {}
quantities = [100, 200, 500]
_headers = np.array([['weighted %d' % q, 'flat %d' % q, 'diff %d' % q] for q in quantities]).flatten()
def compute_for_dataset(dataset, w):
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for pl, idxp in log_progress(playlists, name=dataset['name']):
rcomp = [recommend_compare(pl, w, elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
current[idxp] = np.mean(rcomp, axis=0)
return current
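`log_progress` is used here and below but is never defined in this notebook; a minimal stand-in, assuming it is meant to yield `(item, index)` pairs while showing a progress bar (tqdm, imported above, as the backend):
In [ ]:
def log_progress(sequence, name=''):
    # hypothetical stand-in: yield (item, index) pairs under a progress bar,
    # matching the `for pl, idxp in log_progress(...)` unpacking used here
    for index, item in enumerate(tqdm(sequence, desc=name)):
        yield item, index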
In [ ]:
def evaluate(w, toPrint=False):
for dataset, idxd in log_progress(all_training[2:4], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset, w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
if toPrint:
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
else:
return _data
In [ ]:
evaluate(w, True)
I try with different threshold values.
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold)
for dataset, idxd in log_progress(all_training[2:4], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
from IPython.display import display, HTML
for dataset in all_training[2:4]:
playlists = dataset['playlists']
for pl in playlists:
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
missing = [len(np.where(get_embs(item) < -1.)[0]) for item in pl['data']]
_counts = sorted(pd.Series(missing).value_counts().reset_index().values.tolist())
_pt = ['%d (%d)' % (item[0], item[1]) for item in _counts]
print('\t\t\t'+'\t'.join(_pt))
In [ ]:
try_with(1.4)
In [ ]:
try_with(1.5)
In [ ]:
try_with(2.)
In [ ]:
try_with(1.2)
In [ ]:
try_with(1.1)
In [ ]:
try_with(1.)
In [ ]:
for dataset in all_training[3:4]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, w, elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
0.00030769230769230765 | 0.007507692307692307 | 0.013246153846153847
In [ ]:
for dataset in all_training[2:3]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, w, elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
0.00006349206349206348 | 0.0015492063492063492 | 0.0027333333333333333
In [ ]:
for dataset in all_training[3:4]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, compute_weights(threshold=1.4), elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
for dataset in all_training[2:3]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, compute_weights(threshold=1.4), elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
for dataset in all_training[3:4]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, compute_weights(threshold=1.5), elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
for dataset in all_training[2:3]:
playlists = dataset['playlists']
current = np.zeros((len(playlists), 3*len(quantities)))
for idxp, pl in enumerate(playlists):
rcomp = [recommend_compare(pl, compute_weights(threshold=1.5), elem, num_candidates=quantities, verbose=False).flatten()
for elem in np.arange(len(pl['data']))]
print(pl['name'].split('/')[-1].replace('.expression.txt', ''))
print(len(pl['data']))
current[idxp] = np.mean(rcomp, axis=0)
_c = current[idxp]
print('%.3f | %.3f | %.3f' %(_c[2],_c[5], _c[8]))
print(dataset['name'])
print(np.mean(current, axis=0))
print("==========================")
In [ ]:
def display_playlist(name, virtuoso=True):
for base in all_training:
for index, pl in enumerate(base['playlists']):
_p = pl['name'].split('/')[-1].replace('.expression.txt', '').replace('.json', '')
if _p == name:
print('Playlist %d: %s' % (index, pl['name']))
m, md, s = compute_playlist_stats(pl, True)
for d in pl['data']:
if virtuoso :
print(d)
else :
print(d.replace('data', 'overture'))
print(get_label(d))
print(np.ma.array(get_embs(d), mask=get_embs(d) < -1.))
return
In [ ]:
display_playlist('37i9dQZF1DXaSipEWiHbyL.Classical World: USA')
In [ ]:
display_playlist('FM-401_20171030_00-23')
In [ ]:
pl = all_training[3]['playlists'][63]
recommend_compare(pl, w, 15)
In [ ]:
w = compute_weights(datasets=['spotify_pl', 'web-radio'])
pd.DataFrame([w], columns=heads)
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold, datasets=['spotify_pl', 'web-radio'])
for dataset, idxd in log_progress(all_training[2:4], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
try_with(1.3)
In [ ]:
try_with(1.5)
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold, datasets=['web-radio'])
for dataset, idxd in log_progress(all_training[2:4], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
try_with(1.4)
In [ ]:
try_with(1.5)
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold, datasets=['pp_concerts', 'itema3_concerts'])
for dataset, idxd in log_progress(all_training[2:4], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
try_with(3.)
In [ ]:
try_with(5.)
In [ ]:
try_with(8.)
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold, datasets=['pp_concerts', 'itema3_concerts'])
for dataset, idxd in log_progress(all_training[0:2], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
try_with(3.)
In [ ]:
try_with(5.)
In [ ]:
try_with(8.)
In [ ]:
def try_with(threshold):
w = compute_weights(threshold=threshold, datasets=['itema3_concerts'])
for dataset, idxd in log_progress(all_training[0:2], name='datasets'):
rc[dataset['name']] = compute_for_dataset(dataset,w)
_data = np.array([np.mean(rc[ds], axis=0) for ds in rc])
return pd.DataFrame(
_data,
index = [ds for ds in rc],
columns = _headers
)
In [ ]:
try_with(5.)
In [ ]:
try_with(10.)
In [ ]:
pl = all_training[0]['playlists'][18]
recommend_compare(pl, w, 5, verbose=True, pool=get_pool_from_datasets(['pp_concerts']))
In [ ]:
display_playlist('163468dd-41cc-3818-a874-c867959fe603', virtuoso=False)
In [ ]:
_seed = get_embs('http://data.doremus.org/expression/79456370-2eb3-3abe-91d6-326bc59180a5', masked=True)
_target = get_embs('http://data.doremus.org/expression/2dec45a4-545a-352d-b974-d5b5d3fd6a69', masked=True)
computeSimilarity(_seed, _target, w)
In [ ]:
all_f = find(_seed, _print=False, n=3000)
In [ ]:
for a in all_f:
if a['uri'] == 'http://data.doremus.org/expression/2dec45a4-545a-352d-b974-d5b5d3fd6a69':
print('found')
In [ ]:
# assumes `all_song` is available, e.g. all_song, all_song_vec, all_song_labels = get_pool_from_datasets()
for a in all_song:
    if a == 'http://data.doremus.org/expression/2dec45a4-545a-352d-b974-d5b5d3fd6a69':
        print('found')
In [ ]:
display_playlist('0a2e8bab-a762-3e13-8a1e-236c31976b75', virtuoso=True)
In [ ]:
display_playlist('37i9dQZF1DXaSipEWiHbyL.Classical World: USA')