In [1]:
import os, sys, time, gzip
import sqlite3
import pickle as pkl
import numpy as np
from collections import Counter
In [2]:
data_dir = 'data'
fdb = os.path.join(data_dir, 'msd/track_metadata.db')
fsong2track = os.path.join(data_dir, 'msd/song2tracks.pkl.gz')
fsong2artist = os.path.join(data_dir, 'msd/song2artist.pkl.gz')
Connect the SQLite MSD metadata DB.
In [3]:
conn = sqlite3.connect(fdb)
Get table names.
In [4]:
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
data = res.fetchall()
conn.commit()
In [5]:
data
Out[5]:
Get column names.
In [6]:
res = conn.execute('PRAGMA table_info(songs)')
data = res.fetchall()
conn.commit()
In [7]:
data
Out[7]:
In [ ]:
song2artist = dict()
In [ ]:
res = conn.execute("SELECT song_id, artist_mbid FROM songs")
data = res.fetchall()
conn.commit()
In [ ]:
#type(data)
In [ ]:
#data[0]
In [ ]:
#data[100]
In [ ]:
#type(data[0][1])
In [ ]:
for i in range(len(data)):
sid, aid = data[i]
if len(aid.strip()) < 1: continue
song2artist[sid] = aid
if (i+1) % 10000 == 0:
sys.stdout.write('\r%d / %d' % (i+1, len(data)))
sys.stdout.flush()
In [ ]:
len(song2artist)
In [ ]:
len(set(song2artist.values()))
In [ ]:
pkl.dump(song2artist, gzip.open(fsong2artist, 'wb'))
In [ ]:
res = conn.execute("SELECT track_id, song_id FROM songs")
data = res.fetchall()
conn.commit()
In [ ]:
type(data)
In [ ]:
len(data)
In [ ]:
data[0]
In [ ]:
song2tracks = dict()
In [ ]:
for i in range(len(data)):
tid, sid = data[i]
try:
song2tracks[sid].append(tid)
except KeyError:
song2tracks[sid] = [tid]
if (i+1) % 10000 == 0:
sys.stdout.write('\r%d / %d' % (i+1, len(data)))
sys.stdout.flush()
In [ ]:
len(song2tracks)
In [ ]:
sorted(song2tracks.items())[:10]
In [ ]:
ntids = [len(v) for v in song2tracks.values()]
In [ ]:
len(ntids)
In [ ]:
max(ntids)
In [ ]:
np.sum(ntids)
In [ ]:
counter = Counter(ntids)
counter
Close SQLite DB connection.
In [ ]:
conn.close()
Save mapping.
In [ ]:
pkl.dump(song2tracks, gzip.open(fsong2track, 'wb'))