In [1]:
%matplotlib inline

In [2]:
import json
import os
import sys

import numpy as np
import scipy.sparse
import pandas as pd

In [3]:
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookup tables used by the later cells to translate the raw user / song
# identifiers found in the CSV files into integer indices.
user2id = _read_json('user2id.json')
song2id = _read_json('song2id.json')

In [4]:
# Read the 'in.train.csv' interaction table (columns: uid, sid, count) into a DataFrame.
train_tp = pd.read_csv('in.train.csv')

In [5]:
# Translate every raw user / song identifier in the training frame into its
# integer index via the JSON lookup tables. A list comprehension is used
# instead of map() so the result is a concrete list on both Python 2 and
# Python 3 (where map() returns a lazy iterator that has no length).
# An identifier missing from the lookup table raises KeyError.
uid = [user2id[raw_uid] for raw_uid in train_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in train_tp['sid']]

In [6]:
# Swap the raw identifier columns for the integer indices held in `uid` and
# `sid`; existing column positions are kept.
train_tp = train_tp.assign(uid=uid, sid=sid)

In [7]:
# Show the training frame after remapping: uid and sid now hold the integer
# indices looked up in user2id / song2id.
train_tp


Out[7]:
uid sid count
0 0 235110 1
1 0 176423 1
2 0 14039 1
3 0 256592 1
4 0 144595 1
5 1 84597 5
6 1 78712 11
7 1 38088 5
8 2 122359 10
9 3 63360 1
10 3 108146 1
11 3 151870 3
12 3 184120 1
13 4 65724 2
14 4 208355 2
15 4 247144 2
16 4 244040 1
17 4 225622 2
18 4 14142 1
19 4 75307 2
20 4 214683 2
21 4 240609 2
22 4 90723 1
23 4 61866 2
24 4 144329 2
25 4 189095 2
26 4 194541 2
27 4 213741 2
28 5 220387 7
29 7 158813 1
... ... ... ...
1762223 546484 75731 1
1762224 548836 256958 1
1762225 549051 127386 1
1762226 549363 247086 1
1762227 550842 67193 1
1762228 552060 171551 1
1762229 553942 249315 5
1762230 554181 18372 1
1762231 554283 138650 1
1762232 554701 86088 1
1762233 555315 39964 2
1762234 555437 67505 1
1762235 555896 163195 2
1762236 556564 249702 5
1762237 557202 136270 1
1762238 557381 21972 1
1762239 557704 4517 1
1762240 557727 171965 1
1762241 557855 240145 1
1762242 558437 172609 5
1762243 558525 56060 1
1762244 559550 146573 1
1762245 560092 106778 1
1762246 561619 128766 3
1762247 562519 62040 1
1762248 562623 24589 2
1762249 562719 235646 1
1762250 563895 194499 7
1762251 563957 120751 2
1762252 563957 116745 2

1762253 rows × 3 columns


In [8]:
# Save the integer-indexed training frame to a new CSV; index=False keeps the
# DataFrame row index out of the file.
train_tp.to_csv('in.train.num.csv', index=False)

In [9]:
# Read the 'in.test.csv' interaction table (columns: uid, sid, count) into a DataFrame.
test_tp = pd.read_csv('in.test.csv')

In [10]:
# Number of distinct song identifiers present in test_tp.
# print() with a single parenthesised argument behaves the same on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(len(pd.unique(test_tp['sid'])))


155006

In [11]:
# Translate every raw user / song identifier in the test frame into its
# integer index via the JSON lookup tables. A list comprehension is used
# instead of map() so the result is a concrete list on both Python 2 and
# Python 3 (where map() returns a lazy iterator that has no length).
# An identifier missing from the lookup table raises KeyError.
uid = [user2id[raw_uid] for raw_uid in test_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in test_tp['sid']]

In [12]:
# Swap the raw identifier columns for the integer indices held in `uid` and
# `sid`; existing column positions are kept.
test_tp = test_tp.assign(uid=uid, sid=sid)

In [13]:
# Show the test frame after remapping: uid and sid now hold the integer
# indices looked up in user2id / song2id.
test_tp


Out[13]:
uid sid count
0 2 31830 8
1 2 210172 2
2 3 182784 1
3 3 158231 1
4 3 48021 1
5 3 7825 1
6 3 44082 1
7 4 111631 5
8 4 213191 2
9 4 84101 1
10 4 18108 2
11 6 148557 1
12 9 254013 1
13 9 74303 1
14 17 91550 2
15 17 253348 1
16 20 104556 2
17 20 79111 2
18 24 196803 2
19 24 249725 1
20 24 252702 1
21 26 80626 2
22 26 69553 1
23 26 237640 1
24 28 103111 1
25 28 116234 1
26 28 17439 1
27 28 153150 3
28 29 234694 1
29 32 28644 2
... ... ... ...
410051 564379 1172 2
410052 564379 78343 1
410053 564380 236598 6
410054 564381 76192 1
410055 564383 145288 1
410056 564383 18327 1
410057 564383 253693 6
410058 564383 180983 1
410059 564385 257914 1
410060 564387 207577 2
410061 564388 229721 1
410062 564388 179002 2
410063 564392 141731 1
410064 564392 184982 1
410065 564395 88367 1
410066 564395 85599 1
410067 564395 215315 1
410068 564396 253037 1
410069 564401 87279 7
410070 564407 119263 43
410071 564407 246113 1
410072 564407 69796 1
410073 564407 56500 2
410074 564409 143683 1
410075 564413 152456 1
410076 564413 139461 1
410077 564425 32877 1
410078 564425 47475 1
410079 564432 19157 2
410080 564435 109679 7

410081 rows × 3 columns


In [14]:
# Save the integer-indexed test frame to a new CSV; index=False keeps the
# DataFrame row index out of the file.
test_tp.to_csv('in.test.num.csv', index=False)

In [15]:
# Read the 'in.vad.csv' interaction table (columns: uid, sid, count) into a DataFrame.
vad_tp = pd.read_csv('in.vad.csv')

In [16]:
# Number of distinct song identifiers present in vad_tp.
# print() with a single parenthesised argument behaves the same on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(len(pd.unique(vad_tp['sid'])))


98344

In [17]:
# Translate every raw user / song identifier in the validation frame into its
# integer index via the JSON lookup tables. A list comprehension is used
# instead of map() so the result is a concrete list on both Python 2 and
# Python 3 (where map() returns a lazy iterator that has no length).
# An identifier missing from the lookup table raises KeyError.
uid = [user2id[raw_uid] for raw_uid in vad_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in vad_tp['sid']]

In [18]:
# Swap the raw identifier columns for the integer indices held in `uid` and
# `sid`; existing column positions are kept.
vad_tp = vad_tp.assign(uid=uid, sid=sid)

In [19]:
# Show the validation frame after remapping: uid and sid now hold the integer
# indices looked up in user2id / song2id.
vad_tp


Out[19]:
uid sid count
0 4 30236 2
1 24 83616 1
2 24 249741 1
3 25 115571 1
4 26 51973 1
5 29 60242 5
6 32 65626 1
7 34 109581 11
8 39 104679 1
9 41 172822 8
10 44 212439 1
11 47 113561 4
12 49 251199 1
13 50 9349 1
14 52 28295 1
15 53 110911 3
16 57 190629 1
17 58 215632 6
18 61 190384 4
19 64 94911 1
20 68 95426 3
21 74 130087 1
22 76 148630 1
23 77 52916 1
24 82 199421 1
25 82 222116 1
26 82 18124 1
27 94 218229 1
28 94 153550 3
29 109 253721 4
... ... ... ...
162497 509349 77426 3
162498 510279 83346 1
162499 513173 30397 2
162500 515502 251888 1
162501 515680 6862 1
162502 517203 200720 1
162503 520692 214912 1
162504 521701 184298 1
162505 523400 152964 1
162506 525144 222545 1
162507 527649 181417 2
162508 528254 77881 1
162509 528482 145415 5
162510 531173 235268 1
162511 534328 201650 1
162512 536518 36494 6
162513 537108 68975 1
162514 540174 99626 1
162515 541867 88201 2
162516 543408 229661 2
162517 548231 131175 1
162518 548788 208027 1
162519 550608 49050 3
162520 553301 163583 1
162521 554209 206248 2
162522 554329 190735 1
162523 555258 163640 28
162524 560257 225023 8
162525 561400 108387 1
162526 561619 126924 4

162527 rows × 3 columns


In [20]:
# Save the integer-indexed validation frame to a new CSV; index=False keeps
# the DataFrame row index out of the file.
vad_tp.to_csv('in.vad.num.csv', index=False)

In [21]:
# Read the 'out.test.csv' interaction table (columns: uid, sid, count) into a DataFrame.
out_tp = pd.read_csv('out.test.csv')

In [22]:
# Translate every raw user / song identifier in the out.test frame into its
# integer index via the JSON lookup tables. A list comprehension is used
# instead of map() so the result is a concrete list on both Python 2 and
# Python 3 (where map() returns a lazy iterator that has no length).
# An identifier missing from the lookup table raises KeyError.
uid = [user2id[raw_uid] for raw_uid in out_tp['uid']]
sid = [song2id[raw_sid] for raw_sid in out_tp['sid']]

In [23]:
# Swap the raw identifier columns for the integer indices held in `uid` and
# `sid`; existing column positions are kept.
out_tp = out_tp.assign(uid=uid, sid=sid)

In [24]:
# Show the out.test frame after remapping: uid and sid now hold the integer
# indices looked up in user2id / song2id.
out_tp


Out[24]:
uid sid count
0 0 94610 1
1 0 92713 1
2 0 94485 1
3 1 96234 1
4 1 94802 3
5 1 93741 2
6 2 96220 6
7 4 93682 2
8 4 97181 2
9 4 96955 3
10 4 94430 2
11 4 94233 1
12 4 93311 1
13 4 93213 1
14 4 96100 1
15 5 93036 1
16 5 94813 2
17 5 96189 1
18 5 95814 2
19 6 94706 1
20 7 96325 12
21 7 93082 1
22 7 97056 3
23 7 96665 2
24 7 95739 1
25 7 96053 1
26 7 95114 1
27 7 96477 1
28 7 93233 1
29 7 95963 2
... ... ... ...
1922083 613673 97048 1
1922084 613673 97404 1
1922085 613673 94407 3
1922086 613673 97000 1
1922087 613675 93455 1
1922088 613675 95145 1
1922089 613676 96474 13
1922090 613678 93571 1
1922091 613678 96099 11
1922092 613678 94862 2
1922093 613678 94185 2
1922094 613678 93738 1
1922095 613678 96336 14
1922096 613678 92925 1
1922097 613678 95983 2
1922098 613678 93451 2
1922099 613678 96126 2
1922100 613678 95383 16
1922101 613679 92830 2
1922102 613679 96955 1
1922103 613679 96267 1
1922104 613679 96099 6
1922105 613679 94096 3
1922106 613679 95041 1
1922107 613679 96060 3
1922108 613679 94402 5
1922109 613679 97358 1
1922110 613679 96551 1
1922111 613681 93449 2
1922112 613681 96872 1

1922113 rows × 3 columns


In [25]:
# Save the integer-indexed out.test frame to a new CSV; index=False keeps the
# DataFrame row index out of the file.
out_tp.to_csv('out.test.num.csv', index=False)

In [ ]: