Merge tracks with the same song_id


In [1]:
import os, sys, time
import sqlite3
import pickle as pkl
import numpy as np

In [2]:
data_dir = 'data'
fdb = os.path.join(data_dir, 'msd/track_metadata.db')
fsong2track = os.path.join(data_dir, 'msd/songID2TrackIDs.pkl')

Connect to the SQLite metadata DB of the Million Song Dataset (MSD).


In [3]:
conn = sqlite3.connect(fdb)

Get table names.


In [4]:
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
data = res.fetchall()
conn.commit()

In [5]:
data


Out[5]:
[('songs',)]

Get the column names of the songs table.


In [6]:
res = conn.execute('PRAGMA table_info(songs)')
data = res.fetchall()
conn.commit()

In [7]:
data


Out[7]:
[(0, 'track_id', 'text', 0, None, 1),
 (1, 'title', 'text', 0, None, 0),
 (2, 'song_id', 'text', 0, None, 0),
 (3, 'release', 'text', 0, None, 0),
 (4, 'artist_id', 'text', 0, None, 0),
 (5, 'artist_mbid', 'text', 0, None, 0),
 (6, 'artist_name', 'text', 0, None, 0),
 (7, 'duration', 'real', 0, None, 0),
 (8, 'artist_familiarity', 'real', 0, None, 0),
 (9, 'artist_hotttnesss', 'real', 0, None, 0),
 (10, 'year', 'int', 0, None, 0)]
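
With these column names known, a single track's metadata can be looked up directly. A minimal sketch (not executed in this notebook; the track_id used is the first one returned below in Out[11]):

res = conn.execute(
    "SELECT title, artist_name, year FROM songs WHERE track_id = ?",
    ('TRMMMYQ128F932D901',))
res.fetchone()  # (title, artist_name, year) for that track, or None if absent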

Map each song_id to the list of its track_ids (a song can have more than one track).


In [8]:
res = conn.execute("SELECT track_id, song_id FROM songs")
data = res.fetchall()
conn.commit()

In [9]:
type(data)


Out[9]:
list

In [10]:
len(data)


Out[10]:
1000000

In [11]:
data[0]


Out[11]:
('TRMMMYQ128F932D901', 'SOQMMHC12AB0180CB8')

In [12]:
song2tracks = dict()

In [13]:
for i in range(len(data)):
    tid, sid = data[i]
    try:
        song2tracks[sid].append(tid)   # song_id seen before: append this track_id
    except KeyError:
        song2tracks[sid] = [tid]       # first track_id for this song_id
    if (i+1) % 10000 == 0:
        sys.stdout.write('\r%d / %d' % (i+1, len(data)))
        sys.stdout.flush()


1000000 / 1000000
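
An equivalent way to build the same mapping, sketched with collections.defaultdict instead of the try/except pattern (not executed here; song2tracks_dd is just an illustrative name):

from collections import defaultdict

song2tracks_dd = defaultdict(list)
for tid, sid in data:
    song2tracks_dd[sid].append(tid)  # group all track_ids under their song_id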

In [14]:
len(song2tracks)


Out[14]:
999056

In [15]:
sorted(song2tracks.items())[:10]


Out[15]:
[('SOAAABI12A8C13615F', ['TRBGKMB128F4257851']),
 ('SOAAABT12AC46860F0', ['TRMTUKT12903CEE7C3']),
 ('SOAAABX12A8C13FEB2', ['TRODQFM128F42AAD47']),
 ('SOAAACR12A58A79456', ['TRDUPEH128F423F4C6']),
 ('SOAAACY12A58A79663', ['TROKBXD128F426BA85']),
 ('SOAAADD12AB018A9DD', ['TRNCENP12903C9EF3A']),
 ('SOAAADE12A6D4F80CC', ['TRSKKFK128F148B615']),
 ('SOAAADF12A8C13DF62', ['TRCQMSP128F428A6F7']),
 ('SOAAADP12A8C1413C7', ['TRAZSAD128F42AF173']),
 ('SOAAADZ12A8C1334FB', ['TRMDNZY128F425A532'])]

In [16]:
ntids = [len(v) for v in song2tracks.values()]

In [17]:
len(ntids)


Out[17]:
999056

In [18]:
max(ntids)


Out[18]:
3

In [19]:
np.sum(ntids)


Out[19]:
1000000

In [20]:
from collections import Counter
counter = Counter(ntids)
counter


Out[20]:
Counter({1: 998160, 2: 848, 3: 48})
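
So 848 + 48 = 896 song_ids map to more than one track_id; these are the entries the merge is about. A small sketch for pulling them out (dup_songs is an illustrative name, not used elsewhere):

dup_songs = {sid: tids for sid, tids in song2tracks.items() if len(tids) > 1}
len(dup_songs)  # expected to be 896 given the counts above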

Close the SQLite DB connection.


In [21]:
conn.close()

Save the mapping to a pickle file.


In [22]:
with open(fsong2track, 'wb') as f:
    pkl.dump(song2tracks, f)
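
The mapping can later be reloaded the same way (a minimal sketch, assuming the pickle file written above):

with open(fsong2track, 'rb') as f:
    song2tracks = pkl.load(f)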