Map song/artist to tracks


In [1]:
import os, sys, time, gzip
import sqlite3
import pickle as pkl
import numpy as np
from collections import Counter

In [2]:
data_dir = 'data'
fdb = os.path.join(data_dir, 'msd/track_metadata.db')
fsong2track = os.path.join(data_dir, 'msd/song2tracks.pkl.gz')
fsong2artist = os.path.join(data_dir, 'msd/song2artist.pkl.gz')

Connect the SQLite MSD metadata DB.


In [3]:
conn = sqlite3.connect(fdb)

Get table names.


In [4]:
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
data = res.fetchall()
conn.commit()

In [5]:
data


Out[5]:
[('songs',)]

Get column names.


In [6]:
res = conn.execute('PRAGMA table_info(songs)')
data = res.fetchall()
conn.commit()

In [7]:
data


Out[7]:
[(0, 'track_id', 'text', 0, None, 1),
 (1, 'title', 'text', 0, None, 0),
 (2, 'song_id', 'text', 0, None, 0),
 (3, 'release', 'text', 0, None, 0),
 (4, 'artist_id', 'text', 0, None, 0),
 (5, 'artist_mbid', 'text', 0, None, 0),
 (6, 'artist_name', 'text', 0, None, 0),
 (7, 'duration', 'real', 0, None, 0),
 (8, 'artist_familiarity', 'real', 0, None, 0),
 (9, 'artist_hotttnesss', 'real', 0, None, 0),
 (10, 'year', 'int', 0, None, 0)]

Map song_id to artist_mbid


In [ ]:
song2artist = dict()

In [ ]:
res = conn.execute("SELECT song_id, artist_mbid FROM songs")
data = res.fetchall()
conn.commit()

In [ ]:
#type(data)

In [ ]:
#data[0]

In [ ]:
#data[100]

In [ ]:
#type(data[0][1])

In [ ]:
for i in range(len(data)):
    sid, aid = data[i]
    if len(aid.strip()) < 1: continue
    song2artist[sid] = aid
    if (i+1) % 10000 == 0:
        sys.stdout.write('\r%d / %d' % (i+1, len(data)))
        sys.stdout.flush()

In [ ]:
len(song2artist)

In [ ]:
len(set(song2artist.values()))

In [ ]:
pkl.dump(song2artist, gzip.open(fsong2artist, 'wb'))

Map song_id to a list of track_id


In [ ]:
res = conn.execute("SELECT track_id, song_id FROM songs")
data = res.fetchall()
conn.commit()

In [ ]:
type(data)

In [ ]:
len(data)

In [ ]:
data[0]

In [ ]:
song2tracks = dict()

In [ ]:
for i in range(len(data)):
    tid, sid = data[i]
    try:
        song2tracks[sid].append(tid)
    except KeyError:
        song2tracks[sid] = [tid]
    if (i+1) % 10000 == 0:
        sys.stdout.write('\r%d / %d' % (i+1, len(data)))
        sys.stdout.flush()

In [ ]:
len(song2tracks)

In [ ]:
sorted(song2tracks.items())[:10]

In [ ]:
ntids = [len(v) for v in song2tracks.values()]

In [ ]:
len(ntids)

In [ ]:
max(ntids)

In [ ]:
np.sum(ntids)

In [ ]:
counter = Counter(ntids)
counter

Close SQLite DB connection.


In [ ]:
conn.close()

Save mapping.


In [ ]:
pkl.dump(song2tracks, gzip.open(fsong2track, 'wb'))