In [1]:
%matplotlib inline
In [2]:
import json
import os
import sqlite3
import sys
import pickle
import numpy as np
import pandas as pd
In [3]:
TP_file = 'train_triplets.txt'
md_dbfile = 'track_metadata.db'
In [4]:
# Each line of the triplets file is: user id, song id, play count
tp = pd.read_table(TP_file, header=None, names=['uid', 'sid', 'count'])
In [5]:
MIN_USER_COUNT = 20
MIN_SONG_COUNT = 50
In [7]:
# tid2sid.json contains a mapping between track ID and song ID, which can be obtained from track_metadata.db
with open('tid2sid.json', 'r') as f:
    tid2sid = json.load(f)
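For reference, a minimal sketch of how such a mapping could be built, assuming the songs table in track_metadata.db also carries a track_id column alongside song_id:
with sqlite3.connect(md_dbfile) as conn:
    cur = conn.cursor()
    cur.execute("SELECT track_id, song_id FROM songs")
    tid2sid = dict(cur.fetchall())
with open('tid2sid.json', 'w') as f:
    json.dump(tid2sid, f)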
In [8]:
# tracks_bad_audio.txt lists the track IDs whose audio is unusable
bad_audio = []
with open('tracks_bad_audio.txt', 'r') as f:
    for line in f:
        bad_audio.append(line.strip())
In [9]:
bad_sid = [tid2sid[k] for k in bad_audio]
In [10]:
def filter_usable_tracks(tp, bad_sid):
    # Drop all triplets whose song has unusable audio
    return tp[~tp['sid'].isin(bad_sid)]

tp_good = filter_usable_tracks(tp, bad_sid)
In [11]:
print '%d playcount triplets are kept out of %d' % (len(tp_good), len(tp))
In [12]:
def get_count(tp, id_):
    playcount_groupbyid = tp[[id_, 'count']].groupby(id_, as_index=False)
    count = playcount_groupbyid.size()
    return count

def remove_inactive(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT):
    # Only keep the triplets for songs which were listened to by at least min_sc users
    songcount = get_count(tp, 'sid')
    tp = tp[tp['sid'].isin(songcount.index[songcount >= min_sc])]
    # Only keep the triplets for users who listened to at least min_uc songs.
    # After this, some songs will again have fewer than min_sc listeners, but
    # they should only be a small proportion.
    usercount = get_count(tp, 'uid')
    tp = tp[tp['uid'].isin(usercount.index[usercount >= min_uc])]
    # Update both usercount and songcount after the filtering
    usercount, songcount = get_count(tp, 'uid'), get_count(tp, 'sid')
    return tp, usercount, songcount
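The comment above notes that the user filter can push some songs back below min_sc listeners. If strict thresholds on both sides were required, the same two filters could be iterated to a fixed point; a sketch, not part of the original pipeline:
def remove_inactive_strict(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT):
    while True:
        n_before = len(tp)
        songcount = get_count(tp, 'sid')
        tp = tp[tp['sid'].isin(songcount.index[songcount >= min_sc])]
        usercount = get_count(tp, 'uid')
        tp = tp[tp['uid'].isin(usercount.index[usercount >= min_uc])]
        if len(tp) == n_before:
            # Nothing was removed, so both constraints now hold simultaneously
            return tp, get_count(tp, 'uid'), get_count(tp, 'sid')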
In [13]:
tp, usercount, songcount = remove_inactive(tp_good)
In [14]:
sparsity_level = float(tp.shape[0]) / (usercount.shape[0] * songcount.shape[0])
print "After filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)" % (tp.shape[0],
usercount.shape[0],
songcount.shape[0],
sparsity_level * 100)
In [15]:
usercount.hist(bins=100)  # distribution of the number of songs per user
Out[15]:
[histogram: number of songs per user]
In [16]:
songcount.hist(bins=100)  # distribution of the number of listeners per song
Out[16]:
[histogram: number of listeners per song]
In [17]:
# Sort songs by the number of users who listened to them, in descending order
songcount = songcount.sort_values(ascending=False)
In [18]:
def get_song_info_from_sid(conn, sid):
    cur = conn.cursor()
    # Use a parameterized query rather than string interpolation
    cur.execute("SELECT title, artist_name FROM songs WHERE song_id = ?", (sid,))
    title, artist = cur.fetchone()
    return title, artist
In [19]:
# Take a look at the 50 most-listened-to songs
with sqlite3.connect(md_dbfile) as conn:
    for i in xrange(50):
        sid = songcount.index[i]
        title, artist = get_song_info_from_sid(conn, sid)
        print "%s BY %s -- count: %d" % (title, artist, songcount.iloc[i])
In [20]:
playcount = tp[['sid', 'count']]
In [21]:
playcount_groupbysid = playcount.groupby('sid', as_index=False)
In [22]:
# Total play count per song (as opposed to the number of listeners above)
songcount = playcount_groupbysid.sum().sort_values('count', ascending=False)
print songcount
In [23]:
unique_sid = pd.unique(tp['sid'])
n_songs = len(unique_sid)
# Shuffle the songs so that the held-out split below is a random sample
np.random.seed(98765)
idx = np.random.permutation(np.arange(n_songs))
unique_sid = unique_sid[idx]
In [24]:
print n_songs
unique_uid = pd.unique(tp['uid'])
In [25]:
# Map song/user ID to indices
song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))
In [26]:
with open('unique_uid.txt', 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)

with open('unique_sid.txt', 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

with open('song2id.json', 'w') as f:
    json.dump(song2id, f)

with open('user2id.json', 'w') as f:
    json.dump(user2id, f)
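These integer indices are what downstream matrix-factorization code consumes. A minimal sketch of assembling the triplets into a sparse user-by-song playcount matrix (assumes scipy is installed):
from scipy import sparse

rows = tp['uid'].map(user2id).values
cols = tp['sid'].map(song2id).values
X = sparse.csr_matrix((tp['count'].values, (rows, cols)),
                      shape=(len(unique_uid), len(unique_sid)))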
In [30]:
# Hold out 5% of the songs entirely; their triplets form the out-of-matrix
# test set, while the remaining 95% stay in the matrix
in_sid = unique_sid[:int(0.95 * n_songs)]
out_sid = unique_sid[int(0.95 * n_songs):]
In [31]:
print out_sid.shape
In [32]:
out_tp = tp[tp['sid'].isin(out_sid)]
out_tp
Out[32]:
[out_tp DataFrame preview omitted]
In [33]:
in_tp = tp[~tp['sid'].isin(out_sid)]
in_tp
Out[33]:
[in_tp DataFrame preview omitted]
Pick out 20% of the ratings as the test set for in-matrix prediction
In [34]:
np.random.seed(12345)
n_ratings = in_tp.shape[0]
test = np.random.choice(n_ratings, size=int(0.20 * n_ratings), replace=False)
In [35]:
test_idx = np.zeros(n_ratings, dtype=bool)
test_idx[test] = True
test_tp = in_tp[test_idx]
train_tp = in_tp[~test_idx]
Make sure there is no empty row or column in the training data
In [36]:
print len(pd.unique(train_tp['uid']))
print len(pd.unique(in_tp['uid']))
In [37]:
print len(pd.unique(train_tp['sid']))
print len(pd.unique(in_tp['sid']))
Pick out 10% of the training ratings as a validation set
In [38]:
np.random.seed(13579)
n_ratings = train_tp.shape[0]
vad = np.random.choice(n_ratings, size=int(0.10 * n_ratings), replace=False)
In [39]:
vad_idx = np.zeros(n_ratings, dtype=bool)
vad_idx[vad] = True
vad_tp = train_tp[vad_idx]
train_tp = train_tp[~vad_idx]
In [40]:
print len(pd.unique(train_tp['uid']))
print len(pd.unique(in_tp['uid']))
In [41]:
print len(pd.unique(train_tp['sid']))
print len(pd.unique(in_tp['sid']))
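The pairs of counts printed above should match. As a convenience, the same sanity check can be wrapped into assertions (a sketch):
def assert_no_empty_rows_or_cols(train, full):
    # Every user and song in the full in-matrix data must survive the split
    assert len(pd.unique(train['uid'])) == len(pd.unique(full['uid']))
    assert len(pd.unique(train['sid'])) == len(pd.unique(full['sid']))

assert_no_empty_rows_or_cols(train_tp, in_tp)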
In [42]:
test_tp.to_csv('in.test.csv', index=False)
In [43]:
train_tp.to_csv('in.train.csv', index=False)
In [44]:
vad_tp.to_csv('in.vad.csv', index=False)
In [45]:
out_tp.to_csv('out.test.csv', index=False)
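For completeness, a sketch of how a downstream script might load a split back and convert the string IDs to the saved integer indices (these row/column indices would feed the sparse-matrix construction shown earlier):
train = pd.read_csv('in.train.csv')
with open('user2id.json', 'r') as f:
    user2id = json.load(f)
with open('song2id.json', 'r') as f:
    song2id = json.load(f)
rows = train['uid'].map(user2id)
cols = train['sid'].map(song2id)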