Learning artist features using word2vec


In [ ]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os, sys, time, gzip
import pickle as pkl

import gensim
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [ ]:
# Names of the available playlist datasets; the selected name is substituted
# into the 'data/<name>/setting2' directory path further down.
datasets = [
    'aotm2011',
    '30music',
]

In [ ]:
# Index into `datasets` choosing which dataset this notebook run processes.
dix = 1
dataset_name = datasets[dix]
# Bare expression so the cell displays the selected dataset name.
dataset_name

In [ ]:
# Directory holding the "setting2" files of the selected dataset.
data_dir = 'data/%s/setting2' % dataset_name

# Open the gzip files through context managers so the handles are closed as
# soon as unpickling finishes (a bare gzip.open(...) passed to load() is never
# closed explicitly).
# NOTE: pickle can execute arbitrary code while loading; only use files from a
# trusted source here.
with gzip.open(os.path.join(data_dir, 'playlists_train_dev_test_s2_1.pkl.gz'), 'rb') as fd:
    # Indexed below by the keys 'train_playlists' and 'dev_playlists'.
    playlists2 = pkl.load(fd)

with gzip.open('data/msd/song2artist.pkl.gz', 'rb') as fd:
    # Lookup from song id to artist, used to turn playlists into artist sequences.
    song2artist = pkl.load(fd)

In [ ]:
# Combine the 'train_playlists' and 'dev_playlists' entries into the collection
# the artist vectors are learned from; only these two keys are read here.
# (presumably both are lists, so `+` concatenates them — TODO confirm)
playlists4train = playlists2['train_playlists'] + playlists2['dev_playlists']
# Display how many playlists are available for training.
len(playlists4train)

In [ ]:
# Turn every training playlist into its sequence of artists, one per song.
# Each element of `playlists4train` is unpacked as a pair; only the first item
# (the iterable of song ids) is used. Songs that are missing from
# `song2artist` are replaced by the placeholder token '$UNK$'.
# `dict.get` does the membership test and the lookup in a single step.
artist_playlist = [
    [song2artist.get(sid, '$UNK$') for sid in pl]
    for pl, _ in playlists4train
]

In [ ]:
# Number of artist sequences built above (one per entry of `playlists4train`).
len(artist_playlist)

In [ ]:
# Text file that will hold one playlist per line as space-separated artist tokens.
# Derived from `data_dir` (rather than re-spelling the directory) so the output
# always lands next to the playlist file that was loaded.
fartist_playlist = os.path.join(data_dir, 'artist_seq_playlist.txt')

In [ ]:
# Write the training corpus: one playlist per line, artists separated by spaces.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default locale and matches the UTF-8 decoding gensim applies by default when
# the trained vectors are loaded back.
# NOTE(review): tokens are split on whitespace downstream, so this presumably
# relies on artist identifiers containing no spaces — confirm.
with open(fartist_playlist, 'w', encoding='utf-8') as fd:
    fd.writelines(' '.join(pl_artists) + '\n' for pl_artists in artist_playlist)

In [ ]:
# Output path for the trained artist vectors (written by the word2vec tool,
# read back with gensim). Derived from `data_dir` so it stays in the same
# directory as the other files of the selected dataset.
fartist2vec_bin = os.path.join(data_dir, 'artist2vec.bin')

In [ ]:
# Invoke the word2vec binary with no arguments
# (presumably to print its option list and confirm the tool is built — TODO confirm).
!word2vec/bin/word2vec

In [ ]:
# Train artist embeddings on the artist-sequence file and write them to `fartist2vec_bin`.
# IPython expands `$fartist_playlist` and `$fartist2vec_bin` from the Python namespace.
# `-binary 1` pairs with the `binary=True` load later in the notebook; the remaining
# flags (-window 10, -size 50, -iter 10) are presumably the context window, vector
# dimension and number of training passes — check the tool's usage output to confirm.
!word2vec/bin/word2vec -train $fartist_playlist -output $fartist2vec_bin -binary 1 -window 10 -size 50 -iter 10

In [ ]:
# Load the vectors produced by the word2vec command-line tool. The file is in
# the word2vec binary format (it was trained with `-binary 1`), so it is read
# with `load_word2vec_format(..., binary=True)`; `Word2Vec.load` is not used
# because it reads gensim's own save format instead.
# `gensim` is imported in the import cell at the top of the notebook.
artist2vec = gensim.models.KeyedVectors.load_word2vec_format(fartist2vec_bin, binary=True)

In [ ]:
# Spot check: fetch the learned vector for the artist at index 3 of the first playlist.
# NOTE(review): this assumes the first playlist has at least four entries and that
# the artist token is present in the loaded vocabulary (the training tool may drop
# infrequent tokens) — confirm, otherwise this cell raises IndexError/KeyError.
aid = artist_playlist[0][3]
artist2vec.get_vector(aid)

In [ ]: