Preprocess MovieLens-20M



In [1]:

    
import datetime
import json
import os
import time

import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import scipy.sparse

import seaborn as sns
sns.set(context="paper", font_scale=1.5, rc={"lines.linewidth": 2}, font='DejaVu Serif')



In [2]:

    
DATA_DIR = '/hdd2/dawen/data/ml-20m/'



In [3]:

    
def timestamp_to_date(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')



In [4]:

    
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)



In [5]:

    
# binarize the data (only keep ratings >= 4)
raw_data = raw_data[raw_data['rating'] > 3.5]



In [6]:

    
# sort the raw data accorindg to timestamp
raw_data = raw_data.sort_index(by=['timestamp'])



In [7]:

    
raw_data









    Out[7]:






  
    
      
      userId
      movieId
      rating
      timestamp
    
  
  
    
      4182421
      28507
      1176
      4.0
      789652004
    
    
      18950936
      131160
      47
      5.0
      789652009
    
    
      15688196
      108467
      57
      4.0
      822873600
    
    
      12341186
      85252
      70
      4.0
      822873600
    
    
      14452501
      99851
      1
      4.0
      822873600
    
    
      14452517
      99851
      58
      5.0
      822873600
    
    
      14452516
      99851
      55
      4.0
      822873600
    
    
      14452515
      99851
      52
      4.0
      822873600
    
    
      14452514
      99851
      50
      5.0
      822873600
    
    
      14452513
      99851
      47
      5.0
      822873600
    
    
      14452512
      99851
      45
      4.0
      822873600
    
    
      14452509
      99851
      39
      5.0
      822873600
    
    
      14452507
      99851
      32
      5.0
      822873600
    
    
      14452506
      99851
      31
      5.0
      822873600
    
    
      14452505
      99851
      21
      5.0
      822873600
    
    
      3056639
      20821
      32
      5.0
      822873600
    
    
      15688194
      108467
      11
      4.0
      822873600
    
    
      14452504
      99851
      19
      4.0
      822873600
    
    
      14452503
      99851
      18
      4.0
      822873600
    
    
      12341184
      85252
      60
      4.0
      822873600
    
    
      14452502
      99851
      10
      4.0
      822873600
    
    
      12341181
      85252
      50
      5.0
      822873600
    
    
      19424622
      134445
      11
      4.0
      822873600
    
    
      19424624
      134445
      21
      5.0
      822873600
    
    
      19424626
      134445
      45
      5.0
      822873600
    
    
      19424627
      134445
      58
      5.0
      822873600
    
    
      12341159
      85252
      2
      4.0
      822873600
    
    
      12341161
      85252
      7
      5.0
      822873600
    
    
      12341162
      85252
      10
      5.0
      822873600
    
    
      12341165
      85252
      17
      5.0
      822873600
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      19742824
      136690
      48394
      5.0
      1427775557
    
    
      19742805
      136690
      1136
      5.0
      1427775558
    
    
      19742831
      136690
      104841
      4.5
      1427775561
    
    
      15480802
      107073
      745
      5.0
      1427776814
    
    
      15480822
      107073
      5971
      5.0
      1427776816
    
    
      15480805
      107073
      1148
      4.0
      1427776833
    
    
      15480828
      107073
      92259
      4.0
      1427776892
    
    
      15480823
      107073
      6016
      5.0
      1427777118
    
    
      15480804
      107073
      858
      5.0
      1427777123
    
    
      15480826
      107073
      58559
      4.0
      1427777129
    
    
      15480819
      107073
      4993
      5.0
      1427777155
    
    
      15480821
      107073
      5952
      5.0
      1427777157
    
    
      15480818
      107073
      4306
      5.0
      1427777158
    
    
      15480825
      107073
      7153
      5.0
      1427777166
    
    
      15480816
      107073
      3793
      4.0
      1427777169
    
    
      15480798
      107073
      527
      5.0
      1427777203
    
    
      17877748
      123613
      109243
      4.0
      1427779965
    
    
      8378451
      57814
      7361
      4.0
      1427780465
    
    
      8378452
      57814
      7438
      5.0
      1427780468
    
    
      8378505
      57814
      108979
      5.0
      1427780517
    
    
      8378409
      57814
      1527
      4.0
      1427780519
    
    
      8378407
      57814
      1274
      5.0
      1427780571
    
    
      8378413
      57814
      1748
      4.5
      1427780617
    
    
      8378443
      57814
      6283
      5.0
      1427780623
    
    
      8378393
      57814
      924
      4.5
      1427780631
    
    
      8378423
      57814
      3527
      4.0
      1427780657
    
    
      8378468
      57814
      48774
      4.0
      1427780663
    
    
      8378405
      57814
      1240
      5.0
      1427781001
    
    
      8378415
      57814
      2311
      4.5
      1427781083
    
    
      12898527
      89081
      52458
      4.0
      1427782288
    
  

9995410 rows × 4 columns



In [8]:

    
tstamp = np.array(raw_data['timestamp'])



In [9]:

    
print("Time span of the dataset: From %s to %s" % 
      (timestamp_to_date(np.min(tstamp)), timestamp_to_date(np.max(tstamp))))









    



Time span of the dataset: From 1995-01-09 06:46:44 to 2015-03-31 02:11:28



In [10]:

    
# apparently the timestamps are ordered, check to make sure

for i in xrange(tstamp.size - 1):
    if tstamp[i] > tstamp[i + 1]:
        print("not ordered")

Confirmed the timestamps are ordered



In [11]:

    
plt.hist(tstamp, bins=50)
xticks = np.linspace(tstamp[0], tstamp[-1], 10)
plt.xticks(xticks, map(lambda x: timestamp_to_date(x)[:7], xticks), rotation=90)
pass

Now we select the data from 1995-01-01 to the last day as the dataset (i.e., all the dataset)



In [12]:

    
start_t = time.mktime(datetime.datetime.strptime("1995-01-01", "%Y-%m-%d").timetuple())



In [42]:

    
raw_data = raw_data[raw_data['timestamp'] >= start_t]

Take the first 80% of the data as train and validation set



In [96]:

    
tr_vd_raw_data = raw_data[:int(0.8 * raw_data.shape[0])]



In [97]:

    
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count



In [98]:

    
def filter_triplets(tp, min_uc=5, min_sc=0):
    # Only keep the triplets for songs which were listened to by at least min_sc users. 
    if min_sc > 0:
        songcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(songcount.index[songcount >= min_sc])]
    
    # Only keep the triplets for users who listened to at least min_uc songs
    # After doing this, some of the songs will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
    
    # Update both usercount and songcount after filtering
    usercount, songcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, songcount



In [99]:

    
tr_vd_raw_data, user_activity, item_popularity = filter_triplets(tr_vd_raw_data)



In [100]:

    
sparsity = 1. * tr_vd_raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (tr_vd_raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))









    



After filtering, there are 7992863 watching events from 111148 users and 11711 movies (sparsity: 0.614%)



In [101]:

    
unique_uid = user_activity.index
unique_sid = item_popularity.index



In [102]:

    
song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))



In [103]:

    
with open(os.path.join(DATA_DIR, 'pro', 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)



In [104]:

    
with open(os.path.join(DATA_DIR, 'pro', 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

Split 12.5% (10% of the total ratings) as validation set



In [105]:

    
np.random.seed(13579)
n_ratings = tr_vd_raw_data.shape[0]
vad = np.random.choice(n_ratings, size=int(0.125 * n_ratings), replace=False)



In [106]:

    
vad_idx = np.zeros(n_ratings, dtype=bool)
vad_idx[vad] = True

vad_raw_data = tr_vd_raw_data[vad_idx]
train_raw_data = tr_vd_raw_data[~vad_idx]

Make sure there is no empty users/items



In [107]:

    
print "There are total of %d unique users in the training set and %d unique users in the entire dataset" % \
(len(pd.unique(train_raw_data['userId'])), len(unique_uid))









    



There are total of 111148 unique users in the training set and 111148 unique users in the entire dataset



In [108]:

    
print "There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid))









    



There are total of 11612 unique items in the training set and 11711 unique items in the entire dataset



In [109]:

    
train_sid = set(pd.unique(train_raw_data['movieId']))



In [110]:

    
left_sid = list()
for i, sid in enumerate(unique_sid):
    if sid not in train_sid:
        left_sid.append(sid)



In [111]:

    
move_idx = vad_raw_data['movieId'].isin(left_sid)



In [112]:

    
train_raw_data = train_raw_data.append(vad_raw_data[move_idx])
vad_raw_data = vad_raw_data[~move_idx]



In [113]:

    
print "There are total of %d unique items in the training set and %d unique items in the entire dataset" % \
(len(pd.unique(train_raw_data['movieId'])), len(unique_sid))









    



There are total of 11711 unique items in the training set and 11711 unique items in the entire dataset

For test data, only keep the users and items that appear in the training/validation sets



In [114]:

    
test_raw_data = raw_data[int(0.8 * len(raw_data)):]



In [115]:

    
test_raw_data = test_raw_data[test_raw_data['movieId'].isin(unique_sid)]
test_raw_data = test_raw_data[test_raw_data['userId'].isin(unique_uid)]



In [116]:

    
print len(train_raw_data), len(vad_raw_data), len(test_raw_data)









    



6993860 999003 207161

Basic data information: what's the timespan for train/test?



In [117]:

    
train_timestamp = np.asarray(tr_vd_raw_data['timestamp'])
print("train: from %s to %s" % (timestamp_to_date(train_timestamp[0]), 
                                timestamp_to_date(train_timestamp[-1])))

test_timestamp = np.asarray(test_raw_data['timestamp'])
print("test: from %s to %s" % (timestamp_to_date(test_timestamp[0]), 
                               timestamp_to_date(test_timestamp[-1])))









    



train: from 1995-01-09 06:46:44 to 2009-10-19 06:51:15
test: from 2009-10-19 06:51:53 to 2015-03-31 02:11:28

Numerize the data into (timestamp, user_index, item_index) format



In [118]:

    
def numerize(tp):
    uid = map(lambda x: user2id[x], tp['userId'])
    sid = map(lambda x: song2id[x], tp['movieId'])
    tp['uid'] = uid
    tp['sid'] = sid
    return tp[['timestamp', 'uid', 'sid']]



In [119]:

    
train_data = numerize(train_raw_data)
train_data.to_csv(os.path.join(DATA_DIR, 'pro', 'train.csv'), index=False)



In [120]:

    
vad_data = numerize(vad_raw_data)
vad_data.to_csv(os.path.join(DATA_DIR, 'pro', 'validation.csv'), index=False)



In [121]:

    
test_data = numerize(test_raw_data)
test_data.to_csv(os.path.join(DATA_DIR, 'pro', 'test.csv'), index=False)



In [ ]:

	userId	movieId	rating	timestamp
4182421	28507	1176	4.0	789652004
18950936	131160	47	5.0	789652009
15688196	108467	57	4.0	822873600
12341186	85252	70	4.0	822873600
14452501	99851	1	4.0	822873600
14452517	99851	58	5.0	822873600
14452516	99851	55	4.0	822873600
14452515	99851	52	4.0	822873600
14452514	99851	50	5.0	822873600
14452513	99851	47	5.0	822873600
14452512	99851	45	4.0	822873600
14452509	99851	39	5.0	822873600
14452507	99851	32	5.0	822873600
14452506	99851	31	5.0	822873600
14452505	99851	21	5.0	822873600
3056639	20821	32	5.0	822873600
15688194	108467	11	4.0	822873600
14452504	99851	19	4.0	822873600
14452503	99851	18	4.0	822873600
12341184	85252	60	4.0	822873600
14452502	99851	10	4.0	822873600
12341181	85252	50	5.0	822873600
19424622	134445	11	4.0	822873600
19424624	134445	21	5.0	822873600
19424626	134445	45	5.0	822873600
19424627	134445	58	5.0	822873600
12341159	85252	2	4.0	822873600
12341161	85252	7	5.0	822873600
12341162	85252	10	5.0	822873600
12341165	85252	17	5.0	822873600
...	...	...	...	...
19742824	136690	48394	5.0	1427775557
19742805	136690	1136	5.0	1427775558
19742831	136690	104841	4.5	1427775561
15480802	107073	745	5.0	1427776814
15480822	107073	5971	5.0	1427776816
15480805	107073	1148	4.0	1427776833
15480828	107073	92259	4.0	1427776892
15480823	107073	6016	5.0	1427777118
15480804	107073	858	5.0	1427777123
15480826	107073	58559	4.0	1427777129
15480819	107073	4993	5.0	1427777155
15480821	107073	5952	5.0	1427777157
15480818	107073	4306	5.0	1427777158
15480825	107073	7153	5.0	1427777166
15480816	107073	3793	4.0	1427777169
15480798	107073	527	5.0	1427777203
17877748	123613	109243	4.0	1427779965
8378451	57814	7361	4.0	1427780465
8378452	57814	7438	5.0	1427780468
8378505	57814	108979	5.0	1427780517
8378409	57814	1527	4.0	1427780519
8378407	57814	1274	5.0	1427780571
8378413	57814	1748	4.5	1427780617
8378443	57814	6283	5.0	1427780623
8378393	57814	924	4.5	1427780631
8378423	57814	3527	4.0	1427780657
8378468	57814	48774	4.0	1427780663
8378405	57814	1240	5.0	1427781001
8378415	57814	2311	4.5	1427781083
12898527	89081	52458	4.0	1427782288