In [15]:
import numpy as np
from sklearn.decomposition import PCA
import doremus_data

In [2]:
# Absolute, machine-specific locations that are handed to doremus_data.init below.
# NOTE(review): these must be edited before the notebook can run on another machine.
training_data_folder = '/Users/pasquale/git/recommender/training_data'
emb_folder = '/Users/pasquale/git/music-embeddings'

# Presumably registers both folders so that later doremus_data.get_embeddings
# calls know where to load from — TODO confirm against doremus_data.
doremus_data.init(training_data_folder, emb_folder)

In [22]:
def compute_avg_dist(what, seed=None):
    """Measure the per-axis spread of one embedding family after a 3-D PCA.

    Despite the name, no distances are averaged: the embeddings are projected
    onto three principal components and the standard deviation along each
    component is returned for the full set and for two random subsamples.

    Parameters
    ----------
    what
        Identifier forwarded unchanged to ``doremus_data.get_embeddings``
        (this notebook passes 'key', 'genre' and 'mop').
    seed : int or None, optional
        Seed for the row subsampling. ``None`` (the default) draws from
        NumPy's global random state exactly as before; passing an integer
        makes the two subsamples reproducible across runs.

    Returns
    -------
    tuple of three numpy.ndarray
        Standard deviations (one value per principal component) computed over
        all rows, over a random sample of at most 100 rows, and over a random
        sample of at most 1000 rows, in that order.
    """
    # Only the vectors are used here; the remaining returned values are ignored.
    vectors, *_ = doremus_data.get_embeddings(what)
    projected = PCA(n_components=3).fit_transform(vectors)

    population = projected.shape[0]
    # Keep the legacy global state when no seed is requested so that callers
    # relying on np.random.seed(...) see unchanged behaviour.
    rng = np.random if seed is None else np.random.default_rng(seed)

    def sample_rows(limit):
        # Drawn without replacement; the size is capped at the population so
        # families with fewer rows than `limit` do not raise.
        picked = rng.choice(population, min(limit, population), replace=False)
        return projected[picked, :]

    # Order matters for reproducibility: the 100-row draw comes first.
    v100 = sample_rows(100)
    v1000 = sample_rows(1000)

    return np.std(projected, axis=0), np.std(v100, axis=0), np.std(v1000, axis=0)

In [23]:
# Per-component std of the PCA-projected 'key' embeddings: all rows, a sample of
# at most 100 rows, a sample of at most 1000 rows.
# NOTE(review): the three recorded arrays agree, presumably because there are
# fewer than 100 rows so each "sample" is the whole set — confirm.
compute_avg_dist('key')


Out[23]:
(array([0.72756785, 0.69850284, 0.52699083], dtype=float32),
 array([0.72756785, 0.69850284, 0.5269908 ], dtype=float32),
 array([0.72756785, 0.69850284, 0.52699083], dtype=float32))

In [44]:
# Per-component std of the PCA-projected 'genre' embeddings: all rows, a sample of
# at most 100 rows, a sample of at most 1000 rows. The samples are random, so the
# second and third arrays can differ from one run to the next.
compute_avg_dist('genre')


Out[44]:
(array([0.05943912, 0.0501507 , 0.04854014], dtype=float32),
 array([0.0646925 , 0.04076198, 0.05613792], dtype=float32),
 array([0.05909543, 0.0485954 , 0.0476206 ], dtype=float32))

In [41]:
# Per-component std of the PCA-projected 'mop' embeddings: all rows, a sample of
# at most 100 rows, a sample of at most 1000 rows. The samples are random, so the
# second and third arrays can differ from one run to the next.
compute_avg_dist('mop')


Out[41]:
(array([0.03655686, 0.03464378, 0.03358964], dtype=float32),
 array([0.05754951, 0.0335228 , 0.04727567], dtype=float32),
 array([0.03331411, 0.03489405, 0.03467647], dtype=float32))

In [ ]: