In [1]:
import os, sys, time
import sqlite3
import pickle as pkl
import numpy as np
In [2]:
data_dir = 'data'
fdb = os.path.join(data_dir, 'msd/track_metadata.db')
fsong2track = os.path.join(data_dir, 'msd/songID2TrackIDs.pkl')
Connect the SQLite MSD metadata DB.
In [3]:
conn = sqlite3.connect(fdb)
Get table names.
In [4]:
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
data = res.fetchall()
conn.commit()
In [5]:
data
Out[5]:
Get column names.
In [6]:
res = conn.execute('PRAGMA table_info(songs)')
data = res.fetchall()
conn.commit()
In [7]:
data
Out[7]:
Map song_id
to track_id
.
In [8]:
res = conn.execute("SELECT track_id, song_id FROM songs")
data = res.fetchall()
conn.commit()
In [9]:
type(data)
Out[9]:
In [10]:
len(data)
Out[10]:
In [11]:
data[0]
Out[11]:
In [12]:
song2tracks = dict()
In [13]:
for i in range(len(data)):
tid, sid = data[i]
try:
song2tracks[sid].append(tid)
except KeyError:
song2tracks[sid] = [tid]
if (i+1) % 10000 == 0:
sys.stdout.write('\r%d / %d' % (i+1, len(data)))
sys.stdout.flush()
In [14]:
len(song2tracks)
Out[14]:
In [15]:
sorted(song2tracks.items())[:10]
Out[15]:
In [16]:
ntids = [len(v) for v in song2tracks.values()]
In [17]:
len(ntids)
Out[17]:
In [18]:
max(ntids)
Out[18]:
In [19]:
np.sum(ntids)
Out[19]:
In [20]:
from collections import Counter
counter = Counter(ntids)
counter
Out[20]:
Close SQLite DB connection.
In [21]:
conn.close()
Save mapping.
In [22]:
pkl.dump(song2tracks, open(fsong2track, 'wb'))