In [1]:
%matplotlib inline

In [2]:
import json
import os
import sqlite3
import sys
import pickle
import numpy as np
import pandas as pd

In [3]:
# Input locations. Bare file names are resolved against the working directory.
TP_file = 'train_triplets.txt'   # tab-separated (uid, sid, count) play-count triplets
md_dbfile = 'track_metadata.db'  # SQLite database queried for song title / artist
# Directory that holds `md_dbfile`. The top-50 listing joins MSD_ADD with
# md_dbfile, but the name was never assigned, so a fresh-kernel run raised NameError.
# NOTE(review): '.' assumes the database sits next to the other inputs - adjust if not.
MSD_ADD = '.'

In [4]:
# Raw play-count triplets: tab-separated, no header row, so the column names are given here.
tp = pd.read_csv(TP_file, sep='\t', header=None, names=['uid', 'sid', 'count'])

In [5]:
# Default thresholds for remove_inactive():
# a user is kept only with at least this many triplets ...
MIN_USER_COUNT = 20
# ... and a song only with at least this many triplets.
MIN_SONG_COUNT = 50

Keep play counts that involve only usable tracks


In [7]:
# tid2sid.json maps each track id to its song id; the mapping can be obtained from track_metadata.db
with open('tid2sid.json', 'r') as f:
    tid2sid = json.loads(f.read())

In [8]:
# One entry per line of tracks_bad_audio.txt, with surrounding whitespace stripped.
with open('tracks_bad_audio.txt', 'r') as f:
    bad_audio = [line.strip() for line in f]

In [9]:
# Translate each bad-audio entry to a song id through tid2sid.
# An entry missing from tid2sid raises KeyError here.
bad_sid = [tid2sid[k] for k in bad_audio]

In [10]:
def filter_usable_tracks(tp, bad_sid):
    """Return the rows of `tp` whose 'sid' value is not listed in `bad_sid`.

    `tp` is a DataFrame with a 'sid' column; `bad_sid` is any collection of
    song ids accepted by Series.isin. The input frame is not modified.
    """
    is_bad = tp['sid'].isin(bad_sid)
    return tp.loc[~is_bad]

# Triplets that remain after dropping every song id listed in bad_sid.
tp_good = filter_usable_tracks(tp, bad_sid)

In [11]:
# print() with one parenthesised argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print('%d playcount triplets are kept out of %d' % (len(tp_good), len(tp)))


46968690 playcount triplets are kept out of 48373586

Further filter out counts involving inactive users & songs


In [12]:
def get_count(tp, id):
    """Count how many rows (triplets) each distinct value of column `id` has.

    Parameters
    ----------
    tp : pandas.DataFrame
        Play-count triplets; must contain the column named by `id`.
    id : str
        Name of the column to group by ('uid' or 'sid' in this notebook).

    Returns
    -------
    pandas.Series
        Number of rows per group, indexed by the values of column `id`.
    """
    # Group with the default as_index=True. With as_index=False, newer pandas
    # releases return a DataFrame from size(), which breaks callers that do
    # `count.index[count >= threshold]`.
    return tp.groupby(id).size()

def remove_inactive(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT):
    """Drop triplets of rarely-listened songs, then of low-activity users.

    The two filters run once each, songs first. Returns a 3-tuple:
    the filtered triplets, the per-user counts and the per-song counts,
    both counts recomputed on the filtered triplets.
    """
    # Step 1: keep songs that occur in at least min_sc triplets.
    per_song = get_count(tp, 'sid')
    kept_sids = per_song.index[per_song >= min_sc]
    tp = tp[tp['sid'].isin(kept_sids)]

    # Step 2: keep users that occur in at least min_uc of the remaining triplets.
    # Removing users can push some songs back below min_sc; the song filter
    # is not re-applied, on the expectation that only a small proportion is affected.
    per_user = get_count(tp, 'uid')
    kept_uids = per_user.index[per_user >= min_uc]
    tp = tp[tp['uid'].isin(kept_uids)]

    # Recompute both counts so they describe the frame that is returned.
    usercount = get_count(tp, 'uid')
    songcount = get_count(tp, 'sid')
    return tp, usercount, songcount

In [13]:
# Rebinds `tp` to the activity-filtered triplets; the input frame stays available as `tp_good`.
tp, usercount, songcount = remove_inactive(tp_good)

In [14]:
# Share of the (users x songs) grid that holds an observed triplet.
sparsity_level = float(tp.shape[0]) / (usercount.shape[0] * songcount.shape[0])
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("After filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)" % (
    tp.shape[0], usercount.shape[0], songcount.shape[0], sparsity_level * 100))


After filtering, there are 38226302 triplets from 613682 users and 97414 songs (sparsity level 0.064%)

In [15]:
# Histogram of the per-user triplet counts returned by remove_inactive.
usercount.hist(bins=100)


Out[15]:
<matplotlib.axes.AxesSubplot at 0x7e2a61d0>

In [16]:
# Histogram of the per-song triplet counts returned by remove_inactive.
songcount.hist(bins=100)


Out[16]:
<matplotlib.axes.AxesSubplot at 0x7e2ab850>

In [17]:
# Order songs from most to fewest triplets. The in-place Series.sort() is no
# longer available in current pandas; sort_values() returns a new Series, so
# the name is rebound.
songcount = songcount.sort_values(ascending=False)

In [18]:
def get_song_info_from_sid(conn, sid):
    """Look up the title and artist name of a song in the `songs` table.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database with a `songs` table that has
        `song_id`, `title` and `artist_name` columns.
    sid : str
        Song id to look up.

    Returns
    -------
    tuple
        (title, artist_name) of the first matching row.

    Raises
    ------
    TypeError
        If no row matches: fetchone() then returns None and the tuple
        unpacking below fails.
    """
    cur = conn.cursor()
    # Bind sid as a query parameter instead of formatting it into the SQL text:
    # ids containing a quote no longer break the statement, and the value can
    # never be interpreted as SQL.
    cur.execute("SELECT title, artist_name FROM songs WHERE song_id = ?", (sid,))
    title, artist = cur.fetchone()
    return title, artist

In [19]:
# take a look at the top 50 most listened songs
# (relies on songcount already being sorted in descending order)
conn = sqlite3.connect(os.path.join(MSD_ADD, md_dbfile))
try:
    # Slicing copes with fewer than 50 songs, and pairing index with values
    # avoids integer lookups on a Series whose index holds song-id strings.
    for sid, n_listens in zip(songcount.index[:50], songcount.values[:50]):
        title, artist = get_song_info_from_sid(conn, sid)
        print("%s BY %s -- count: %d" % (title, artist, n_listens))
finally:
    # `with sqlite3.connect(...)` only wraps a transaction; it never closes
    # the connection, so close it explicitly.
    conn.close()


Sehr kosmisch BY Harmonia -- count: 80277
Dog Days Are Over (Radio Edit) BY Florence + The Machine -- count: 71975
Undo BY Björk -- count: 63486
Secrets BY OneRepublic -- count: 60764
You're The One BY Dwight Yoakam -- count: 60062
Revelry BY Kings Of Leon -- count: 59131
Fireflies BY Charttraxx Karaoke -- count: 50561
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) BY Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner -- count: 50029
Hey_ Soul Sister BY Train -- count: 50013
Tive Sim BY Cartola -- count: 44583
OMG BY Usher featuring will.i.am -- count: 41360
Drop The World BY Lil Wayne / Eminem -- count: 39214
The Scientist BY Coldplay -- count: 38856
Canada BY Five Iron Frenzy -- count: 37666
Clocks BY Coldplay -- count: 36879
Marry Me BY Train -- count: 36732
Catch You Baby (Steve Pitron & Max Sanna Radio Edit) BY Lonnie Gordon -- count: 35097
Pursuit Of Happiness (nightmare) BY Kid Cudi / MGMT / Ratatat -- count: 34032
Lucky (Album Version) BY Jason Mraz & Colbie Caillat -- count: 33137
Bulletproof BY La Roux -- count: 32884
Alejandro BY Lady GaGa -- count: 32345
Creep (Explicit) BY Radiohead -- count: 32231
Just Dance BY Lady GaGa / Colby O'Donis -- count: 31958
Billionaire [feat. Bruno Mars]  (Explicit Album Version) BY Travie McCoy -- count: 31932
Sincerité Et Jalousie BY Alliance Ethnik -- count: 31547
Représente BY Alliance Ethnik -- count: 30885
The Only Exception (Album Version) BY Paramore -- count: 29614
Invalid BY Tub Ring -- count: 28354
Bleed It Out [Live At Milton Keynes] BY Linkin Park -- count: 28197
I Gotta Feeling BY Black Eyed Peas -- count: 28048
Ain't Misbehavin BY Sam Cooke -- count: 27195
Heartbreak Warfare BY John Mayer -- count: 26389
When You Were Young BY The Killers -- count: 26378
Fix You BY Coldplay -- count: 26352
Livin' On A Prayer BY Bon Jovi -- count: 26347
The Gift BY Angels and Airwaves -- count: 25823
Float On BY Modest Mouse -- count: 25196
Cosmic Love BY Florence + The Machine -- count: 25167
Halo BY Beyoncé -- count: 24744
Kryptonite BY 3 Doors Down -- count: 24716
Uprising BY Muse -- count: 24605
Party In The U.S.A. BY Miley Cyrus -- count: 24448
Sample Track 2 BY Simon Harris -- count: 24185
I CAN'T GET STARTED BY Ron Carter -- count: 24104
Bitter Sweet Symphony BY The Verve -- count: 23469
You've Got The Love BY Florence + The Machine -- count: 22879
Home BY Edward Sharpe & The Magnetic Zeros -- count: 22751
Sexy Bitch BY DJ Dizzy -- count: 22623
I Kissed A Girl BY Katy Perry -- count: 22605
Electric Feel BY MGMT -- count: 22398

Generate in- and out-of-matrix split

Get all users & songs in the filtered taste profile, shuffle the songs, and map both to integer indices


In [20]:
# Only the song id and the play count are needed for the per-song totals below.
playcount = tp[['sid', 'count']]

In [21]:
# as_index=False keeps 'sid' as a regular column in the aggregated result.
playcount_groupbysid = playcount.groupby('sid', as_index=False)

In [22]:
# Total play count per song, largest first. Note this rebinds `songcount`
# (previously the per-song triplet counts) to a DataFrame with 'sid' and 'count'.
# DataFrame.sort() is no longer available in current pandas; sort_values() replaces it.
songcount = playcount_groupbysid.sum().sort_values('count', ascending=False)
print(songcount)


                      sid   count
6147   SOBONKR12A58A7A7E0  530291
3174   SOAUWYT12A81C206F1  462084
72395  SOSXLTC12AF72A7F54  381805
22489  SOFRQTD12A81C233C0  318278
3524   SOAXGDH12A8C13F8A1  293453
16885  SOEGIYH12A6D4FC0E3  287323
54300  SONYKOW12AB01849C9  232204
61043  SOPUCYA12A8C13A694  231973
77055  SOUFTBI12AB0183F65  203117
55382  SOOFYTN12A6D4F9B35  186374
30671  SOHTKMO12AB01843B0  183617
80275  SOVDSJC12A58A7A271  177617
6192   SOBOUPA12A6D4F81F1  175260
13465  SODJWHY12A8C142CCE  166712
44144  SOLFXKT12AB017E3E0  157726
73078  SOTCMDJ12A6D4F8528  155002
21564  SOFLJQZ12A6D4FADA6  146176
75813  SOTWNDJ12A8C143984  136096
78205  SOUNZHU12A8AE47481  123716
78843  SOUSMXX12AB0185C24  121482
79277  SOUVTSM12AC468F6A7  119187
41186  SOKLRPJ12A8C13C3FE  110497
66762  SORJICW12A8C13640D  106462
6080   SOBOAFP12A8C131F36  106053
60813  SOPSOHT12A67AE0235  105932
83973  SOWEHOM12A6BD4E09E  103151
76755  SOUDLVN12AAFF43658  103042
60403  SOPPROJ12AB0184E18  101875
11269  SOCVTLJ12A6310F0FD   99453
60919  SOPTLQL12AB018D56F   95937
...                   ...     ...
32101  SOIDATE12A8C131200      53
74223  SOTKYOX12A81C2061A      53
40963  SOKKFHW12A67020AFB      53
21522  SOFLDYI12A8C13C19F      53
10035  SOCNQPD12AB0186432      53
32170  SOIDNEP12A8C13C90B      53
27029  SOGVISR12A8AE48718      53
30890  SOHVCTM12A8C14128B      53
24138  SOGCOVN12A58A79189      52
19454  SOEXHUE12A6D4F9E3C      52
84562  SOWINAL12A8C138F58      52
50777  SOMZYTR12A8C13CC7D      52
70327  SOSIWXC12A8C1372DF      52
60903  SOPTGZW12A6D4F64A0      52
89091  SOXPXBM12AB01820E7      51
6681   SOBRRRB12A6D4F7328      51
70609  SOSKRFC12A6701D619      51
42073  SOKSBES12A58A7E021      51
42806  SOKWZLI12A6310F139      51
73908  SOTINBI12A58A80B04      51
48795  SOMMEYQ12A8C131BC5      50
1001   SOAGPYR12A6701DE67      50
76083  SOTYOJT12AB018718B      50
14085  SODOBPL12A8C142066      49
58343  SOPATVC12A8C14385E      49
48113  SOMHMKS12A81C22AC8      49
70379  SOSJELD12AB018A529      49
17334  SOEJGMO12A6D4F8333      49
27596  SOGZAUQ12AAFF442CE      48
31615  SOIAAWS12A58A7B3A0      47

[97414 rows x 2 columns]

In [23]:
# Distinct song ids present in the filtered triplets.
unique_sid = pd.unique(tp['sid'])
n_songs = len(unique_sid)
# Shuffle songs
# The seed is fixed so the shuffled order, and the in/out split later sliced
# from it, is the same on every run.
np.random.seed(98765)
idx = np.random.permutation(np.arange(n_songs))
unique_sid = unique_sid[idx]

In [24]:
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(n_songs)
# Distinct user ids; unlike the songs, these are not shuffled.
unique_uid = pd.unique(tp['uid'])


97414

In [25]:
# Give every song / user id the integer position it has in unique_sid / unique_uid
song2id = dict(zip(unique_sid, range(len(unique_sid))))
user2id = dict(zip(unique_uid, range(len(unique_uid))))

In [26]:
# Persist the id orderings as text, one id per line ...
with open('unique_uid.txt', 'w') as f:
    f.writelines('%s\n' % uid for uid in unique_uid)

with open('unique_sid.txt', 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

# ... and the id -> integer index maps as JSON.
with open('song2id.json', 'w') as f:
    json.dump(song2id, f)

with open('user2id.json', 'w') as f:
    json.dump(user2id, f)

Select 5% of the songs for out-of-matrix prediction


In [30]:
# Cut the shuffled song list once: the first 95% stay in-matrix,
# the remainder is held out for out-of-matrix prediction.
n_in_songs = int(0.95 * n_songs)
in_sid = unique_sid[:n_in_songs]
out_sid = unique_sid[n_in_songs:]

In [31]:
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(out_sid.shape)


(4871,)

In [32]:
# Triplets whose song was held out for out-of-matrix prediction.
out_tp = tp.loc[tp['sid'].isin(out_sid)]
out_tp


Out[32]:
uid sid count
3 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFNSP12AF72A0E22 1
50 b80344d063b5ccb3212f76538f3d9e43d87dca9e SONRXOY12AB0181E84 1
96 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOXRXDG12A8C131DE5 1
110 85c1f87fea955d09b4bec2e36aee110927aedf9a SOHANDU12A8C13C47F 1
120 85c1f87fea955d09b4bec2e36aee110927aedf9a SOTVFEF12AF729E6CE 3
123 85c1f87fea955d09b4bec2e36aee110927aedf9a SOUSQUG12A8C13616F 2
158 969cc6fb74e076a68e36a04409cb9d3765757508 SOGFUFC12A8C13F1E5 6
266 b64cdd1a0bd907e5e00b39e345194768e330d652 SOCBRSN12AAF3B30A6 2
268 b64cdd1a0bd907e5e00b39e345194768e330d652 SOCSISN12AF72AB1DE 2
275 b64cdd1a0bd907e5e00b39e345194768e330d652 SOEWYLX12A6D4F8E5F 3
276 b64cdd1a0bd907e5e00b39e345194768e330d652 SOFNCRW12A6D4F727B 2
322 b64cdd1a0bd907e5e00b39e345194768e330d652 SOQXKUV12A6D4FB4C9 1
339 b64cdd1a0bd907e5e00b39e345194768e330d652 SOUZRCP12AB0182164 1
342 b64cdd1a0bd907e5e00b39e345194768e330d652 SOVMCAR12AF72A1268 1
348 b64cdd1a0bd907e5e00b39e345194768e330d652 SOWSWTD12A67ADA3D7 1
366 17aa9f6dbdf753831da8f38c71b66b64373de613 SOBDVAK12AC90759A2 1
388 17aa9f6dbdf753831da8f38c71b66b64373de613 SOEOJHS12AB017F3DC 2
393 17aa9f6dbdf753831da8f38c71b66b64373de613 SOFKYDZ12AB017F425 1
423 17aa9f6dbdf753831da8f38c71b66b64373de613 SOJITNW12A8C13D951 2
510 d6589314c0a9bcbca4fee0c93b14bc402363afea SODLSCE12A6D4FBCAC 1
559 5a905f000fc1ff3df7ca807d57edb608863db05d SOAFOBL12AF72A25BA 12
569 5a905f000fc1ff3df7ca807d57edb608863db05d SOAOFBI12A8C143E28 1
587 5a905f000fc1ff3df7ca807d57edb608863db05d SOBOJJB12A58A7D1AD 3
595 5a905f000fc1ff3df7ca807d57edb608863db05d SOCBNIS12AF72AB9D3 2
596 5a905f000fc1ff3df7ca807d57edb608863db05d SOCEWVG12A8C13DCC2 1
669 5a905f000fc1ff3df7ca807d57edb608863db05d SOGIEOU12A8C134815 1
677 5a905f000fc1ff3df7ca807d57edb608863db05d SOGVWGI12A8C13B9D1 1
683 5a905f000fc1ff3df7ca807d57edb608863db05d SOHBURV12A8C13B628 1
687 5a905f000fc1ff3df7ca807d57edb608863db05d SOHIDCT12AB018C98E 1
691 5a905f000fc1ff3df7ca807d57edb608863db05d SOHWBGO12A6D4FA87A 2
... ... ... ...
48372670 3b91968ca65411893d356bb96e7cce1e3fe8f764 SOKPWKZ12AB0182223 1
48372676 3b91968ca65411893d356bb96e7cce1e3fe8f764 SOLWHDY12A6310DFE5 1
48372697 3b91968ca65411893d356bb96e7cce1e3fe8f764 SOPFJGB12A6702166F 3
48372731 3b91968ca65411893d356bb96e7cce1e3fe8f764 SOVCNHZ12AB0180982 1
48372867 67b00a32e0d314eb059016a24864d5e7ee8219b8 SOGOPZL12A8C13CC50 1
48372886 67b00a32e0d314eb059016a24864d5e7ee8219b8 SOQARDA12B0B809080 1
48372921 69e03764ed3bb92a765bd73ef273fcc479f63754 SONGTTS12A6701E59A 13
48373009 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOARLEM12AF729FFB8 1
48373072 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOMCAFM12A58A7B024 11
48373077 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOMUENG12A8C1442F3 2
48373085 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOOAFDW12A8C13325B 2
48373091 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOOVJTE12A8C132892 1
48373096 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOOZFCC12A58A7D783 14
48373115 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOSKFED12A8C1343AB 1
48373123 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOTGZIH12A8C1428A5 2
48373124 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOTKNTF12A8C144A0D 2
48373139 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOVBRCP12A6701D7B5 2
48373144 4d5b26d3f618ce63ba018fe34d57e71f1b8e2184 SOWIGII12A58A7A939 16
48373241 8305c896f42308824da7d4386f4b9ee584281412 SODLAPJ12A8C142002 2
48373254 8305c896f42308824da7d4386f4b9ee584281412 SOEWYLX12A6D4F8E5F 1
48373324 8305c896f42308824da7d4386f4b9ee584281412 SOLRGVL12A8C143BC3 1
48373329 8305c896f42308824da7d4386f4b9ee584281412 SOMCAFM12A58A7B024 6
48373356 8305c896f42308824da7d4386f4b9ee584281412 SOPREHY12AB01815F9 3
48373377 8305c896f42308824da7d4386f4b9ee584281412 SOSCIZP12AB0181D2F 1
48373396 8305c896f42308824da7d4386f4b9ee584281412 SOTKYBW12A8C13C3EA 3
48373428 8305c896f42308824da7d4386f4b9ee584281412 SOVWADY12AB0189C63 5
48373442 8305c896f42308824da7d4386f4b9ee584281412 SOWYYUQ12A6701D68D 1
48373451 8305c896f42308824da7d4386f4b9ee584281412 SOXZSEH12AC468CABB 1
48373548 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SODJQXO12A6D4F697D 2
48373549 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOEISDE12A8AE4632E 1

1922113 rows × 3 columns


In [33]:
# Every triplet whose song was not held out, i.e. the complement of out_tp.
in_tp = tp.loc[~tp['sid'].isin(out_sid)]
in_tp


Out[33]:
uid sid count
0 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAKIMP12A8C130995 1
1 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAPDEY12A81C210A9 1
2 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBBMDR12A8C13253B 2
4 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFOVM12A58A7D494 1
6 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBSUJE12A6D4F8CF5 2
7 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBVFZR12A6D4F8AE3 1
8 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBXALG12A8C13C108 1
10 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBYHAJ12A6701BF1D 1
11 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOCNMUH12A6D4F6E6D 1
12 b80344d063b5ccb3212f76538f3d9e43d87dca9e SODACBL12A8C13C273 1
13 b80344d063b5ccb3212f76538f3d9e43d87dca9e SODDNQT12A6D4F5F7E 5
14 b80344d063b5ccb3212f76538f3d9e43d87dca9e SODXRTY12AB0180F3B 1
15 b80344d063b5ccb3212f76538f3d9e43d87dca9e SODZWFT12A8C13C0E4 1
16 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOEGVZY12A58A7857E 1
18 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOEOBYG12A6D4F8AE2 1
19 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOEWFWM12A8C1308BA 1
20 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOFFJPX12A6D4F7456 1
21 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOFGUAY12AB017B0A8 1
22 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOFRQTD12A81C233C0 1
24 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOFZFQU12A8C13CAB8 1
25 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOGJAOS12A6D4F7459 1
26 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOHQIAG12A8C136F64 1
27 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOHQWYZ12A6D4FA701 1
28 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOHQZCA12A6D4FB317 1
29 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOIAOBY12A8C13BF75 1
30 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOIQOQT12A8C136F96 1
31 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOISWBZ12A8C13C0F7 1
32 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOIYTOA12A6D4F9A23 1
33 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOIZAZL12A6701C53B 5
34 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOJNNUA12A8AE48C7A 1
... ... ... ...
48373552 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOHHKGO12AC3DF57BF 1
48373553 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOHLLRP12A6701F2F4 1
48373554 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOHMACD12A6D4F9582 1
48373555 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOHYSXA12AB0186704 1
48373556 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOIBAQJ12AB0182643 1
48373557 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOJGZXL12A6D4F2980 1
48373558 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOJZZQW12A6702028B 1
48373560 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOMAKIT12A58A7E292 1
48373561 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SONWLIS12A8C140865 2
48373562 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOOAAGD12AB017BCDA 1
48373563 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOOENDM12A6D2281CB 1
48373564 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOOFYTN12A6D4F9B35 4
48373566 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOPCTBB12AF72A1B64 1
48373567 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOPJLFV12A6701C797 1
48373568 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOPUELG12A6701D215 1
48373569 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOQBOWE12A8C13CC2E 1
48373570 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SORFHOZ12A6701E129 1
48373571 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SORPVUD12A67020454 1
48373572 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOSCPOI12A8C139F02 1
48373573 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOSKDTM12A6701C795 1
48373574 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOSQJWM12A6D4F79E0 6
48373576 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOTFOAE12A6D4F4511 2
48373577 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOTIXTZ12AF72A39AC 1
48373579 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOTULEI12A58A7CB72 1
48373580 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOTYMDI12A6D4F867D 1
48373581 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOUHHHH12AF729E4AF 2
48373582 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOUJVIT12A8C1451C1 1
48373583 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOUSMXX12AB0185C24 1
48373584 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOWYSKH12AF72A303A 3
48373585 b7815dbb206eb2831ce0fe040d0aa537e2e800f7 SOYYFLV12A58A7A88F 1

36304189 rows × 3 columns

Generate train/test/vad sets

Pick out 20% of the ratings for in-matrix prediction


In [34]:
# Fixed seed so the same rows are drawn on every run.
np.random.seed(12345)
n_ratings = in_tp.shape[0]
# Row positions (not index labels) of the 20% of in_tp drawn without replacement.
test = np.random.choice(n_ratings, size=int(0.20 * n_ratings), replace=False)

In [35]:
# Turn the sampled row positions into a boolean mask over the rows of in_tp.
test_idx = np.zeros(n_ratings, dtype=bool)
test_idx[test] = True

# Masked rows form the test set; all remaining rows stay in the training set.
test_tp = in_tp[test_idx]
train_tp = in_tp[~test_idx]

Make sure there is no empty row or column in the training data


In [36]:
# The two numbers must match: every user of in_tp still has a training triplet.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(pd.unique(train_tp['uid'])))
print(len(pd.unique(in_tp['uid'])))


613682
613682

In [37]:
# The two numbers must match: every song of in_tp still has a training triplet.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(pd.unique(train_tp['sid'])))
print(len(pd.unique(in_tp['sid'])))


92543
92543

Pick out 10% of the training ratings as the validation set


In [38]:
# Fixed seed so the same rows are drawn on every run.
np.random.seed(13579)
# Rebinds n_ratings: from here on it is the number of rows in train_tp.
n_ratings = train_tp.shape[0]
# Row positions of the 10% of train_tp drawn without replacement.
vad = np.random.choice(n_ratings, size=int(0.10 * n_ratings), replace=False)

In [39]:
# Boolean mask over the rows of train_tp marking the sampled validation rows.
vad_idx = np.zeros(n_ratings, dtype=bool)
vad_idx[vad] = True

vad_tp = train_tp[vad_idx]
# Rebinds train_tp to the remaining rows, so it is now shorter than vad_idx;
# re-run from the test/train split before executing this cell again.
train_tp = train_tp[~vad_idx]

In [40]:
# Repeat the user-coverage check after carving out the validation set.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(pd.unique(train_tp['uid'])))
print(len(pd.unique(in_tp['uid'])))


613682
613682

In [41]:
# Repeat the song-coverage check after carving out the validation set.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(pd.unique(train_tp['sid'])))
print(len(pd.unique(in_tp['sid'])))


92543
92543

In [42]:
# In-matrix test triplets; index=False leaves the DataFrame index out of the file.
test_tp.to_csv('in.test.csv', index=False)

In [43]:
# In-matrix training triplets (what is left after removing test and validation rows).
train_tp.to_csv('in.train.csv', index=False)

In [44]:
# In-matrix validation triplets.
vad_tp.to_csv('in.vad.csv', index=False)

In [45]:
# Triplets of the held-out songs, used for out-of-matrix prediction.
out_tp.to_csv('out.test.csv', index=False)

In [ ]: