In [28]:
from pyspark.mllib.feature import Word2Vec
inp = sc.textFile('./data/new_parsed_no_spam.txt').map(lambda row: row.split(" "))

In [29]:
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import HashingTF

hashingTF = HashingTF()
tf = hashingTF.transform(inp)

# ... continue from the previous example
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [52]:
tfidf.take(100)


Out[52]:
[SparseVector(1048576, {0: 0.0, 83112: 11.114}),
 SparseVector(1048576, {0: 0.0, 6401: 4.6609, 401685: 10.6031, 458193: 8.4514, 516311: 8.0537, 529670: 7.8306, 559704: 5.7748, 560922: 7.3147, 745492: 5.8634, 1002347: 11.5194, 1028853: 5.4259}),
 SparseVector(1048576, {0: 0.0, 873006: 11.5194}),
 SparseVector(1048576, {0: 0.0, 168150: 9.7277, 199052: 8.9937, 208021: 7.9641, 743649: 11.5194, 895496: 7.1953}),
 SparseVector(1048576, {0: 0.0, 614308: 11.5194, 767648: 9.2169, 788738: 4.8525}),
 SparseVector(1048576, {0: 0.0, 33833: 9.5735, 39073: 8.499, 180773: 10.8263, 312962: 5.3915, 433627: 7.7017, 484967: 11.5194, 505370: 9.3222, 540100: 4.9047, 608985: 7.2781, 691743: 11.5194, 725033: 5.9155, 802715: 10.8263}),
 SparseVector(1048576, {0: 0.0, 355610: 11.5194, 877377: 9.1681}),
 SparseVector(1048576, {0: 0.0, 39073: 8.499, 180773: 10.8263, 226269: 11.114, 312962: 5.3915, 433627: 7.7017, 505370: 9.3222, 540100: 4.9047, 608985: 7.2781, 725033: 5.9155}),
 SparseVector(1048576, {0: 0.0, 96912: 11.5194, 316312: 4.9648, 877377: 9.1681}),
 SparseVector(1048576, {0: 0.0, 1016147: 11.114}),
 SparseVector(1048576, {0: 0.0, 402502: 10.6031, 829787: 8.8804, 868560: 11.5194, 881041: 8.0855, 1015648: 8.9167}),
 SparseVector(1048576, {0: 0.0, 917151: 11.114}),
 SparseVector(1048576, {0: 0.0, 510034: 5.689, 838019: 6.7702, 868859: 6.8799, 965722: 10.1331}),
 SparseVector(1048576, {0: 0.0, 1014297: 11.114}),
 SparseVector(1048576, {0: 0.0, 151148: 10.0154, 168150: 9.7277, 458193: 8.4514, 560922: 7.3147, 674252: 11.5194, 964810: 9.91, 1015648: 8.9167}),
 SparseVector(1048576, {0: 0.0, 168150: 9.7277, 401685: 10.6031, 560922: 7.3147, 637615: 5.2242, 722827: 11.5194, 881041: 8.0855, 1015648: 8.9167}),
 SparseVector(1048576, {0: 0.0, 771414: 11.5194}),
 SparseVector(1048576, {0: 0.0, 151148: 10.0154, 458193: 8.4514, 560922: 7.3147, 819989: 11.5194, 983331: 5.7991, 1015648: 8.9167}),
 SparseVector(1048576, {0: 0.0, 480056: 11.5194}),
 SparseVector(1048576, {0: 0.0, 169138: 11.114, 528519: 11.114, 849130: 4.4864, 1039912: 4.8718}),
 SparseVector(1048576, {0: 0.0, 281547: 11.114}),
 SparseVector(1048576, {0: 0.0, 156854: 7.8688, 232972: 11.5194, 235663: 9.0771, 637615: 5.2242, 780091: 9.0771}),
 SparseVector(1048576, {0: 0.0, 378709: 11.5194}),
 SparseVector(1048576, {0: 0.0, 195361: 7.3684, 330134: 11.5194, 637615: 5.2242}),
 SparseVector(1048576, {0: 0.0, 475871: 11.5194}),
 SparseVector(1048576, {0: 0.0, 151148: 10.0154, 427280: 11.5194, 849130: 4.4864}),
 SparseVector(1048576, {0: 0.0, 573017: 11.5194}),
 SparseVector(1048576, {0: 0.0, 524442: 11.5194, 730767: 5.819}),
 SparseVector(1048576, {0: 0.0, 941619: 11.5194}),
 SparseVector(1048576, {0: 0.0, 893044: 11.5194}),
 SparseVector(1048576, {0: 0.0, 743106: 11.5194}),
 SparseVector(1048576, {0: 0.0, 791681: 11.5194}),
 SparseVector(1048576, {0: 0.0, 207327: 10.6031, 429834: 10.1331, 646072: 11.5194, 921712: 11.5194}),
 SparseVector(1048576, {0: 0.0, 694535: 11.5194}),
 SparseVector(1048576, {0: 0.0, 166838: 6.4956, 218043: 9.0345, 229254: 9.1681, 268151: 8.2808, 313739: 11.5194, 468162: 7.5212, 505252: 5.7574, 548926: 11.5194, 652066: 7.0827, 730767: 5.819, 801522: 5.8057, 836730: 10.6031, 851610: 6.1511, 992265: 5.7264, 1045724: 5.6264}),
 SparseVector(1048576, {0: 0.0, 597501: 11.5194}),
 SparseVector(1048576, {0: 0.0, 78088: 7.2638, 95833: 10.8263, 133553: 11.114, 229254: 9.1681, 316312: 4.9648, 451764: 11.5194, 851927: 6.7278}),
 SparseVector(1048576, {0: 0.0, 56041: 10.8263, 242612: 9.7277, 264416: 4.5264, 371199: 11.5194, 458576: 5.0801, 500339: 11.5194, 583379: 10.1331, 851610: 6.1511, 917638: 7.1756}),
 SparseVector(1048576, {0: 0.0, 83162: 11.114, 153147: 8.0855, 203453: 11.114, 255311: 7.1437, 446640: 11.5194, 518952: 11.5194, 579945: 11.5194, 718611: 7.8059, 814860: 11.5194, 1045265: 10.1331}),
 SparseVector(1048576, {0: 0.0, 131737: 11.114}),
 SparseVector(1048576, {0: 0.0, 28759: 7.0768, 156221: 11.5194, 788738: 4.8525, 796075: 4.934, 800095: 9.7277, 975766: 7.7017}),
 SparseVector(1048576, {0: 0.0, 10118: 8.8453, 107646: 11.5194, 153147: 8.0855, 242612: 9.7277, 370866: 7.5975, 637615: 5.2242, 788738: 4.8525, 835927: 7.8688, 1039542: 5.8444}),
 SparseVector(1048576, {0: 0.0, 59059: 11.5194, 423946: 8.5237, 583379: 10.1331, 1012894: 11.114}),
 SparseVector(1048576, {0: 0.0, 10484: 11.5194, 278514: 6.9143, 340223: 8.1695, 529094: 7.0364, 821024: 7.2853, 939653: 9.3794}),
 SparseVector(1048576, {0: 0.0, 59961: 7.3222, 350401: 11.114, 409970: 5.819, 799530: 4.8776}),
 SparseVector(1048576, {0: 0.0, 301826: 11.5194}),
 SparseVector(1048576, {0: 0.0, 10118: 8.8453, 57292: 8.7469, 242612: 9.7277, 253255: 11.5194, 370866: 7.5975, 436802: 11.5194}),
 SparseVector(1048576, {0: 0.0, 117857: 7.0364, 204792: 11.5194, 511216: 11.5194}),
 SparseVector(1048576, {0: 0.0, 544725: 11.5194}),
 SparseVector(1048576, {0: 0.0, 496150: 11.5194, 609159: 4.9381, 896720: 10.2667}),
 SparseVector(1048576, {0: 0.0, 617780: 10.8263}),
 SparseVector(1048576, {0: 0.0, 328977: 6.9294, 429960: 5.4936, 666355: 11.5194, 917176: 11.114}),
 SparseVector(1048576, {0: 0.0, 166838: 6.4956, 242612: 9.7277, 318818: 9.5045, 406578: 11.5194, 714942: 11.114, 788738: 4.8525, 798982: 5.2214, 850889: 7.15}),
 SparseVector(1048576, {0: 0.0, 217944: 6.9815, 370866: 7.5975, 558846: 11.114, 763517: 11.5194}),
 SparseVector(1048576, {0: 0.0, 24845: 11.114, 64477: 11.114, 166779: 11.114, 200017: 10.8263, 233266: 11.5194, 266070: 11.114, 286304: 22.2279, 537446: 10.6031, 563683: 10.8263, 564294: 10.8263, 602543: 10.6031, 760028: 10.8263, 762207: 10.6031, 788204: 11.114, 812088: 11.5194, 934703: 11.5194}),
 SparseVector(1048576, {0: 0.0, 55467: 5.6278, 416048: 5.921, 860551: 11.114}),
 SparseVector(1048576, {0: 0.0, 200017: 10.8263, 602543: 10.6031, 909122: 11.5194}),
 SparseVector(1048576, {0: 0.0, 957697: 11.5194}),
 SparseVector(1048576, {0: 0.0, 266070: 11.114, 286304: 11.114, 760028: 10.8263, 1006284: 10.8263}),
 SparseVector(1048576, {0: 0.0, 6283: 11.5194}),
 SparseVector(1048576, {0: 0.0, 536463: 10.8263, 563683: 10.8263, 762207: 10.6031}),
 SparseVector(1048576, {0: 0.0, 487872: 8.1182}),
 SparseVector(1048576, {0: 0.0, 59961: 7.3222, 633609: 11.5194, 838019: 6.7702, 887467: 6.8419}),
 SparseVector(1048576, {0: 0.0, 585034: 11.5194, 602543: 10.6031}),
 SparseVector(1048576, {0: 0.0, 103239: 11.5194, 227786: 7.4335, 269651: 6.5222, 308286: 5.5393, 342267: 11.5194, 903532: 11.5194}),
 SparseVector(1048576, {0: 0.0, 293692: 11.5194}),
 SparseVector(1048576, {0: 0.0, 439301: 11.5194}),
 SparseVector(1048576, {0: 0.0, 24845: 11.114, 390726: 11.5194}),
 SparseVector(1048576, {0: 0.0, 147959: 11.114, 226012: 9.3794, 299221: 6.8847, 508658: 6.3265, 531014: 6.0243, 539824: 10.0154, 577645: 8.0855, 602543: 10.6031, 611383: 10.8263, 809873: 9.91, 841339: 7.6379, 1012166: 6.856, 1015976: 10.8263}),
 SparseVector(1048576, {0: 0.0, 99368: 11.5194}),
 SparseVector(1048576, {0: 0.0, 760028: 10.8263, 998022: 11.5194}),
 SparseVector(1048576, {0: 0.0, 1046597: 11.114}),
 SparseVector(1048576, {0: 0.0, 166779: 11.114, 900988: 11.114}),
 SparseVector(1048576, {0: 0.0, 949563: 11.114}),
 SparseVector(1048576, {0: 0.0, 143754: 11.114}),
 SparseVector(1048576, {0: 0.0, 192329: 11.114, 762207: 10.6031}),
 SparseVector(1048576, {0: 0.0, 46592: 11.5194}),
 SparseVector(1048576, {0: 0.0, 95183: 11.114, 564294: 10.8263}),
 SparseVector(1048576, {0: 0.0, 56523: 6.4728, 340755: 11.5194, 508658: 6.3265, 587576: 7.7818, 609159: 4.9381, 609518: 11.5194, 762207: 10.6031, 844265: 6.1819, 867703: 7.7818, 885828: 10.2667, 897391: 9.6476}),
 SparseVector(1048576, {0: 0.0, 658093: 11.5194}),
 SparseVector(1048576, {0: 0.0, 51277: 5.3742, 328977: 6.9294, 411121: 11.114, 429960: 5.4936, 798982: 5.2214, 966488: 9.5735}),
 SparseVector(1048576, {0: 0.0, 233853: 4.9, 362546: 11.114, 566017: 7.5212, 788738: 4.8525, 983073: 5.1461}),
 SparseVector(1048576, {0: 0.0, 64477: 11.114, 313975: 10.8263}),
 SparseVector(1048576, {0: 0.0, 265384: 11.114}),
 SparseVector(1048576, {0: 0.0, 216813: 10.8263, 609159: 4.9381}),
 SparseVector(1048576, {0: 0.0, 168238: 11.5194, 193520: 11.114, 224967: 10.6031}),
 SparseVector(1048576, {0: 0.0, 119651: 11.5194}),
 SparseVector(1048576, {0: 0.0, 71076: 11.5194}),
 SparseVector(1048576, {0: 0.0, 37759: 11.114, 83714: 8.8804, 92883: 8.8453, 93419: 11.5194, 185150: 8.8804, 208812: 7.0086, 216280: 8.7469, 241742: 11.114, 362557: 4.6375, 363450: 5.8057, 382807: 7.724, 384862: 10.2667, 390177: 11.5194, 401189: 10.4208, 409082: 10.0154, 435082: 10.8263, 518887: 6.7573, 530270: 11.5194, 531014: 6.0243, 591405: 8.3208, 704386: 11.5194, 724016: 7.3528, 748794: 5.6431, 795801: 9.91, 796465: 6.6481, 799625: 11.5194, 804191: 8.575, 847479: 10.2667, 887142: 11.114, 945478: 8.135, 985194: 8.9545, 1015068: 10.8263, 1032685: 5.7558, 1042478: 5.9549}),
 SparseVector(1048576, {0: 0.0, 84758: 10.4208, 96212: 8.7161, 115376: 6.9922, 219874: 6.46, 249366: 11.114, 259326: 11.5194, 271649: 6.0945, 276408: 10.8263, 315943: 11.5194, 320619: 11.5194, 343424: 10.6031, 389054: 10.0154, 409970: 5.819, 419100: 8.6572, 479477: 11.114, 481016: 5.9454, 506709: 9.5045, 535182: 5.4792, 583379: 10.1331, 647939: 10.8263, 708511: 9.0771, 730767: 5.819, 745050: 11.5194, 751050: 11.5194, 904126: 8.3006, 1009012: 7.7582, 1039893: 11.5194}),
 SparseVector(1048576, {0: 0.0, 208812: 7.0086, 210930: 7.5305, 229254: 9.1681, 269741: 5.6817, 294438: 11.5194, 363450: 5.8057, 415783: 5.2627, 433970: 5.2123, 531014: 6.0243, 588499: 5.7023, 651046: 10.8263, 720745: 9.8147, 763397: 8.0079, 803316: 11.5194, 834300: 7.3684, 849691: 7.0593, 853585: 8.2613, 872680: 11.5194, 902647: 6.5324, 945044: 11.114, 962864: 8.5237, 1020809: 8.4514}),
 SparseVector(1048576, {0: 0.0, 229254: 9.1681, 473577: 7.5975, 481016: 5.9454, 531014: 6.0243, 583379: 20.2663, 609159: 4.9381, 649107: 7.0946, 921271: 11.114, 933670: 5.7249, 951068: 7.3528, 963404: 8.4284, 992409: 5.5267}),
 SparseVector(1048576, {0: 0.0, 579173: 9.7277, 755147: 10.4208, 969842: 11.5194, 1020048: 11.5194}),
 SparseVector(1048576, {0: 0.0, 1018417: 9.7277}),
 SparseVector(1048576, {0: 0.0, 10118: 8.8453, 10461: 7.2999, 51209: 11.5194, 59961: 7.3222, 84758: 10.4208, 164135: 7.0768, 167916: 7.1374, 207422: 7.5031, 242612: 9.7277, 254574: 5.7829, 295400: 5.5143, 301832: 8.4284, 308286: 5.5393, 384862: 10.2667, 399804: 11.5194, 409970: 5.819, 441386: 7.3451, 498733: 6.6674, 531014: 6.0243, 583379: 10.1331, 608417: 9.3222, 660645: 11.5194, 678372: 11.5194, 690904: 11.5194, 709694: 10.6031, 802720: 11.114, 953467: 5.625, 989997: 9.7277, 997897: 4.6704, 1003328: 7.7467, 1009012: 7.7582, 1014966: 8.7786}),
 SparseVector(1048576, {0: 0.0, 266984: 11.114, 573015: 10.4208, 579173: 9.7277, 726947: 11.5194}),
 SparseVector(1048576, {0: 0.0, 92546: 8.8114, 113600: 11.5194, 187871: 11.5194, 233853: 4.9, 362557: 4.6375, 370866: 7.5975, 373872: 11.114, 384862: 10.2667, 415087: 6.2542, 419100: 8.6572, 498733: 6.6674, 531014: 6.0243, 583379: 10.1331, 775534: 11.5194, 885028: 10.4208, 885092: 6.46, 983073: 5.1461}),
 SparseVector(1048576, {0: 0.0, 824109: 11.114}),
 SparseVector(1048576, {0: 0.0, 212608: 11.5194}),
 SparseVector(1048576, {0: 0.0, 89216: 11.5194, 135491: 11.5194, 200374: 11.114, 261199: 9.6476, 488324: 11.5194, 534529: 11.114, 916313: 8.135})]

In [48]:
# a = inp.take(100)[88][1:]
# for i in a :
#     print '##'+i+'##'
wordlist = inp.map(lambda x : x[1:]).flatMap(lambda x : x).filter(lambda x : len(x)>1).distinct()
wordKeyMap = wordlist.map(lambda x : (hashingTF.indexOf(x),x))
wordKeyMap.take(10)


Out[48]:
[(0, u'360.07'),
 (131074, u'\u58c1\u5e03'),
 (677206, u'\u5713\u982d'),
 (786438, u'\u81bd\u77f3'),
 (917514, u'\u7b2c\u4e8c'),
 (131084, u'\u934a\u5b50'),
 (917518, u'\u966a\u8ab2'),
 (262160, u'\u5c0f\u817f'),
 (742744, u'\u5f8c\u76fe'),
 (131090, u'\u5e73\u5b89')]

In [ ]:
word2vec = Word2Vec()
model = word2vec.fit(inp)

In [64]:
be =   (model.transform('店家')+ model.transform('店名'))*2 - model.transform('難吃')*2 + model.transform('好吃')*1
synonyms = model.findSynonyms(be, 40)
for word, cosine_distance in synonyms:
    print("{}: {}".format(word.encode('utf-8'), cosine_distance))


店名: 0.797977149487
法豆鮮: 0.664511144161
洋朵: 0.655015051365
法樂米: 0.638847589493
簡閱: 0.638360321522
Enalley: 0.634707808495
Pasta: 0.630737066269
咬蛋: 0.628726840019
青鳥: 0.622638106346
輕食: 0.618459939957
溫德德式: 0.61318975687
百珍: 0.604635715485
一中店: 0.604245662689
燒鳥: 0.604139566422
阿財: 0.603386342525
Famonn: 0.603010118008
鹿點: 0.602878332138
PELLEO: 0.601222574711
nybc: 0.600034415722
五莖: 0.598786234856
Caf: 0.598756730556
錦津澤: 0.598662495613
簡餐: 0.598346114159
福記: 0.597760081291
CAFE: 0.597102999687
江豪記: 0.595591723919
凱蘿琳: 0.59482383728
樂祈: 0.594637215137
Corrine: 0.594626069069
Coucou: 0.592837154865
千陽: 0.591681301594
甘泉: 0.59039837122
添喜: 0.59003084898
甜子: 0.589581847191
激推: 0.589004635811
雪綿冰: 0.588553547859
Nooice: 0.587925672531
芳鄰: 0.587363898754
筑馨: 0.587215006351
李冰: 0.586992442608

In [29]:
keyword='美白'
be =  model.transform(keyword) 
print '與 ' + keyword + " 相關的詞如下:" 
synonyms = model.findSynonyms(be,40)
for word, cosine_distance in synonyms:
    print("{}: {}".format(word.encode('utf-8'), cosine_distance))


與 美白 相關的詞如下:
保濕: 0.823526799679
具長: 0.80883038044
抗老: 0.790402948856
淨白組: 0.790108561516
安瓶: 0.789486169815
淡斑: 0.78782171011
水凝賦: 0.780795931816
精華: 0.780294597149
水潤亮: 0.773762404919
清爽型: 0.77225291729
活膚: 0.771992623806
柳晶凍: 0.768778383732
嫩白: 0.765475511551
純萃: 0.762607932091
乳液: 0.761080265045
傳明酸: 0.760533332825
愛斯德瑪: 0.758291423321
雙效: 0.757012546062
淨白: 0.756984233856
透白雙: 0.754908919334
液是: 0.752150833607
全能型: 0.75085657835
肌能露: 0.750433027744
LOVEISDERMA: 0.746375143528
水凝霜: 0.74311631918
水潤煥顏: 0.74299120903
修護: 0.742734849453
玻尿酸: 0.742720723152
淨斑: 0.742118656635
透白: 0.741148293018
白潤: 0.739855110645
水妍煥白: 0.737419307232
晶鑽桂馥: 0.736793279648
光透: 0.73637598753
送水潤亮: 0.734945476055
多效: 0.734880805016
熊果素: 0.734564125538
廣第: 0.732760488987
雪晶靈: 0.732185900211
面霜: 0.732169210911

In [65]:
be


Out[65]:
DenseVector([-0.5831, -0.0254, -0.6506, -0.1149, -0.0559, -0.6841, 0.1396, 0.1775, 0.3189, -0.2631, 0.0938, -0.236, 0.3044, 0.8797, -0.509, 0.0442, 0.4863, -0.1886, -0.3797, 0.0238, 1.1025, -0.4459, 0.8146, 0.6528, -0.6891, 0.9531, 0.0162, -1.2586, -0.3959, -0.0841, -0.7814, -0.1908, 0.5102, 0.7478, 0.0797, 0.7211, 0.7922, -0.0515, -0.0501, -0.1959, 1.643, -0.3111, 0.1522, -0.2409, 1.114, -1.1185, -0.5039, -0.638, -0.7649, 0.2792, 1.1702, -1.2677, 0.7007, 0.0455, -1.3227, -1.04, -1.035, -0.7153, 0.393, -0.0203, -0.2007, 0.5502, 0.7178, 0.1892, 0.0204, -0.6733, 0.361, 0.5874, -0.2428, -0.5127, 0.226, 0.6347, 0.6623, 1.3486, -0.4423, 0.2986, 0.2983, 0.5466, 0.3545, -1.5598, -0.0183, 1.3901, -0.4882, 0.6302, -0.0693, -0.1519, 0.5629, 0.2237, 1.0279, -0.6714, -0.8228, -0.0837, 0.0153, 0.6642, -1.5185, 0.5264, 0.5412, -0.3727, -0.4923, -0.2429])

In [ ]: