In [1]:
%matplotlib inline
In [2]:
import json
import os
import sys
import numpy as np
import scipy.sparse
import pandas as pd
In [3]:
# Load the two JSON lookup tables used below to translate the 'uid' and
# 'sid' columns of each CSV (user2id[...] / song2id[...]).
with open('user2id.json', 'r') as f:
    user2id = json.load(f)
with open('song2id.json', 'r') as f:
    song2id = json.load(f)
In [4]:
# Read the training split ('in.train.csv') into a DataFrame.
train_tp = pd.read_csv('in.train.csv')
In [5]:
# Translate every raw id through its lookup table; an id missing from the
# table raises KeyError. List comprehensions yield concrete lists on both
# Python 2 and Python 3 (map() is a lazy iterator on Python 3).
uid = [user2id[raw_uid] for raw_uid in train_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in train_tp['sid']]
In [6]:
# Overwrite both id columns with their translated values (uid first, then sid).
train_tp['uid'], train_tp['sid'] = uid, sid
In [7]:
# Display the frame to inspect the remapped 'uid' / 'sid' columns.
train_tp
Out[7]:
uid
sid
count
0
0
235110
1
1
0
176423
1
2
0
14039
1
3
0
256592
1
4
0
144595
1
5
1
84597
5
6
1
78712
11
7
1
38088
5
8
2
122359
10
9
3
63360
1
10
3
108146
1
11
3
151870
3
12
3
184120
1
13
4
65724
2
14
4
208355
2
15
4
247144
2
16
4
244040
1
17
4
225622
2
18
4
14142
1
19
4
75307
2
20
4
214683
2
21
4
240609
2
22
4
90723
1
23
4
61866
2
24
4
144329
2
25
4
189095
2
26
4
194541
2
27
4
213741
2
28
5
220387
7
29
7
158813
1
...
...
...
...
1762223
546484
75731
1
1762224
548836
256958
1
1762225
549051
127386
1
1762226
549363
247086
1
1762227
550842
67193
1
1762228
552060
171551
1
1762229
553942
249315
5
1762230
554181
18372
1
1762231
554283
138650
1
1762232
554701
86088
1
1762233
555315
39964
2
1762234
555437
67505
1
1762235
555896
163195
2
1762236
556564
249702
5
1762237
557202
136270
1
1762238
557381
21972
1
1762239
557704
4517
1
1762240
557727
171965
1
1762241
557855
240145
1
1762242
558437
172609
5
1762243
558525
56060
1
1762244
559550
146573
1
1762245
560092
106778
1
1762246
561619
128766
3
1762247
562519
62040
1
1762248
562623
24589
2
1762249
562719
235646
1
1762250
563895
194499
7
1762251
563957
120751
2
1762252
563957
116745
2
1762253 rows × 3 columns
In [8]:
# Save the remapped training split; index=False leaves the row index out of the file.
train_tp.to_csv('in.train.num.csv', index=False)
In [9]:
# Read the test split ('in.test.csv') into a DataFrame.
test_tp = pd.read_csv('in.test.csv')
In [10]:
# Number of distinct 'sid' values in the test split. The parenthesised form
# prints the same text on Python 2 and is valid syntax on Python 3.
print(len(pd.unique(test_tp['sid'])))
155006
In [11]:
# Translate every raw id through its lookup table; an id missing from the
# table raises KeyError. List comprehensions yield concrete lists on both
# Python 2 and Python 3 (map() is a lazy iterator on Python 3).
uid = [user2id[raw_uid] for raw_uid in test_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in test_tp['sid']]
In [12]:
# Overwrite both id columns with their translated values (uid first, then sid).
test_tp['uid'], test_tp['sid'] = uid, sid
In [13]:
# Display the frame to inspect the remapped 'uid' / 'sid' columns.
test_tp
Out[13]:
uid
sid
count
0
2
31830
8
1
2
210172
2
2
3
182784
1
3
3
158231
1
4
3
48021
1
5
3
7825
1
6
3
44082
1
7
4
111631
5
8
4
213191
2
9
4
84101
1
10
4
18108
2
11
6
148557
1
12
9
254013
1
13
9
74303
1
14
17
91550
2
15
17
253348
1
16
20
104556
2
17
20
79111
2
18
24
196803
2
19
24
249725
1
20
24
252702
1
21
26
80626
2
22
26
69553
1
23
26
237640
1
24
28
103111
1
25
28
116234
1
26
28
17439
1
27
28
153150
3
28
29
234694
1
29
32
28644
2
...
...
...
...
410051
564379
1172
2
410052
564379
78343
1
410053
564380
236598
6
410054
564381
76192
1
410055
564383
145288
1
410056
564383
18327
1
410057
564383
253693
6
410058
564383
180983
1
410059
564385
257914
1
410060
564387
207577
2
410061
564388
229721
1
410062
564388
179002
2
410063
564392
141731
1
410064
564392
184982
1
410065
564395
88367
1
410066
564395
85599
1
410067
564395
215315
1
410068
564396
253037
1
410069
564401
87279
7
410070
564407
119263
43
410071
564407
246113
1
410072
564407
69796
1
410073
564407
56500
2
410074
564409
143683
1
410075
564413
152456
1
410076
564413
139461
1
410077
564425
32877
1
410078
564425
47475
1
410079
564432
19157
2
410080
564435
109679
7
410081 rows × 3 columns
In [14]:
# Save the remapped test split; index=False leaves the row index out of the file.
test_tp.to_csv('in.test.num.csv', index=False)
In [15]:
# Read the 'in.vad.csv' split into a DataFrame.
vad_tp = pd.read_csv('in.vad.csv')
In [16]:
# Number of distinct 'sid' values in the vad split. The parenthesised form
# prints the same text on Python 2 and is valid syntax on Python 3.
print(len(pd.unique(vad_tp['sid'])))
98344
In [17]:
# Translate every raw id through its lookup table; an id missing from the
# table raises KeyError. List comprehensions yield concrete lists on both
# Python 2 and Python 3 (map() is a lazy iterator on Python 3).
uid = [user2id[raw_uid] for raw_uid in vad_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in vad_tp['sid']]
In [18]:
# Overwrite both id columns with their translated values (uid first, then sid).
vad_tp['uid'], vad_tp['sid'] = uid, sid
In [19]:
# Display the frame to inspect the remapped 'uid' / 'sid' columns.
vad_tp
Out[19]:
uid
sid
count
0
4
30236
2
1
24
83616
1
2
24
249741
1
3
25
115571
1
4
26
51973
1
5
29
60242
5
6
32
65626
1
7
34
109581
11
8
39
104679
1
9
41
172822
8
10
44
212439
1
11
47
113561
4
12
49
251199
1
13
50
9349
1
14
52
28295
1
15
53
110911
3
16
57
190629
1
17
58
215632
6
18
61
190384
4
19
64
94911
1
20
68
95426
3
21
74
130087
1
22
76
148630
1
23
77
52916
1
24
82
199421
1
25
82
222116
1
26
82
18124
1
27
94
218229
1
28
94
153550
3
29
109
253721
4
...
...
...
...
162497
509349
77426
3
162498
510279
83346
1
162499
513173
30397
2
162500
515502
251888
1
162501
515680
6862
1
162502
517203
200720
1
162503
520692
214912
1
162504
521701
184298
1
162505
523400
152964
1
162506
525144
222545
1
162507
527649
181417
2
162508
528254
77881
1
162509
528482
145415
5
162510
531173
235268
1
162511
534328
201650
1
162512
536518
36494
6
162513
537108
68975
1
162514
540174
99626
1
162515
541867
88201
2
162516
543408
229661
2
162517
548231
131175
1
162518
548788
208027
1
162519
550608
49050
3
162520
553301
163583
1
162521
554209
206248
2
162522
554329
190735
1
162523
555258
163640
28
162524
560257
225023
8
162525
561400
108387
1
162526
561619
126924
4
162527 rows × 3 columns
In [20]:
# Save the remapped vad split; index=False leaves the row index out of the file.
vad_tp.to_csv('in.vad.num.csv', index=False)
In [21]:
# Read the 'out.test.csv' split into a DataFrame.
out_tp = pd.read_csv('out.test.csv')
In [22]:
# Translate every raw id through its lookup table; an id missing from the
# table raises KeyError. List comprehensions yield concrete lists on both
# Python 2 and Python 3 (map() is a lazy iterator on Python 3).
uid = [user2id[raw_uid] for raw_uid in out_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in out_tp['sid']]
In [23]:
# Overwrite both id columns with their translated values (uid first, then sid).
out_tp['uid'], out_tp['sid'] = uid, sid
In [24]:
# Display the frame to inspect the remapped 'uid' / 'sid' columns.
out_tp
Out[24]:
uid
sid
count
0
0
94610
1
1
0
92713
1
2
0
94485
1
3
1
96234
1
4
1
94802
3
5
1
93741
2
6
2
96220
6
7
4
93682
2
8
4
97181
2
9
4
96955
3
10
4
94430
2
11
4
94233
1
12
4
93311
1
13
4
93213
1
14
4
96100
1
15
5
93036
1
16
5
94813
2
17
5
96189
1
18
5
95814
2
19
6
94706
1
20
7
96325
12
21
7
93082
1
22
7
97056
3
23
7
96665
2
24
7
95739
1
25
7
96053
1
26
7
95114
1
27
7
96477
1
28
7
93233
1
29
7
95963
2
...
...
...
...
1922083
613673
97048
1
1922084
613673
97404
1
1922085
613673
94407
3
1922086
613673
97000
1
1922087
613675
93455
1
1922088
613675
95145
1
1922089
613676
96474
13
1922090
613678
93571
1
1922091
613678
96099
11
1922092
613678
94862
2
1922093
613678
94185
2
1922094
613678
93738
1
1922095
613678
96336
14
1922096
613678
92925
1
1922097
613678
95983
2
1922098
613678
93451
2
1922099
613678
96126
2
1922100
613678
95383
16
1922101
613679
92830
2
1922102
613679
96955
1
1922103
613679
96267
1
1922104
613679
96099
6
1922105
613679
94096
3
1922106
613679
95041
1
1922107
613679
96060
3
1922108
613679
94402
5
1922109
613679
97358
1
1922110
613679
96551
1
1922111
613681
93449
2
1922112
613681
96872
1
1922113 rows × 3 columns
In [25]:
# Save the remapped 'out.test' split; index=False leaves the row index out of the file.
out_tp.to_csv('out.test.num.csv', index=False)
In [ ]:
Content source: dawenl/content_wmf
Similar notebooks: