In [1]:
from pyspark.ml.clustering import LDA

In [2]:
# Read the libsvm-formatted sample corpus into a DataFrame and preview it.
# NOTE(review): `spark` is not created anywhere in this notebook; it is
# presumably the SparkSession injected by the kernel/shell -- confirm.
dataset = (
    spark.read
    .format("libsvm")
    .load("datasets/sample_lda_libsvm_data.txt")
)
dataset.show()


+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(11,[0,1,2,4,5,6,...|
|  1.0|(11,[0,1,3,4,7,10...|
|  2.0|(11,[0,1,2,5,6,8,...|
|  3.0|(11,[0,1,3,6,8,9,...|
|  4.0|(11,[0,1,2,3,4,6,...|
|  5.0|(11,[0,1,3,4,5,6,...|
|  6.0|(11,[0,1,3,6,8,9,...|
|  7.0|(11,[0,1,2,3,4,5,...|
|  8.0|(11,[0,1,3,4,5,6,...|
|  9.0|(11,[0,1,2,4,6,8,...|
| 10.0|(11,[0,1,2,3,5,6,...|
| 11.0|(11,[0,1,4,5,6,7,...|
+-----+--------------------+


In [3]:
# Train an LDA model with 10 topics, limited to 10 iterations.
# The random seed is pinned so that a fresh "Restart & Run All" fits the
# same model every time; without it the topics and metrics may vary
# between runs. Outputs saved in the notebook before the seed was added
# may therefore differ from a fresh run.
lda = LDA(k=10, maxIter=10, seed=1)
model = lda.fit(dataset)

In [4]:
# Score the fitted model against the same corpus it was trained on and
# report both metrics.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: {}".format(ll))
print("The upper bound on perplexity: {}".format(lp))


The lower bound on the log likelihood of the entire corpus: -830.8200362218348
The upper bound on perplexity: 3.195461678640353

In [5]:
# Summarise each topic by its three highest-weighted terms and print the
# table without truncating the weight arrays.
topics = model.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)


The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[10, 6, 1] |[0.17716277365583435, 0.175093826940891, 0.14398144339477317]  |
|1    |[0, 5, 9]  |[0.10767384081908464, 0.09803424340533426, 0.09707083774679913]|
|2    |[5, 10, 9] |[0.09819703267561146, 0.09813706321638012, 0.09566066687701354]|
|3    |[5, 10, 2] |[0.1043336472136259, 0.10204514734224286, 0.09789654769297573] |
|4    |[5, 6, 8]  |[0.17117267209890458, 0.10008771147187673, 0.09380215424402512]|
|5    |[2, 1, 5]  |[0.10181812241305552, 0.09675765527782697, 0.09604418553503413]|
|6    |[6, 4, 9]  |[0.10646588514827376, 0.10135478933291643, 0.099179157965757]  |
|7    |[8, 3, 5]  |[0.10453789038581693, 0.09705020776286659, 0.09687785234996922]|
|8    |[2, 1, 5]  |[0.1120434496672337, 0.09725984515892873, 0.09706870440450541] |
|9    |[9, 1, 8]  |[0.10446944547350019, 0.09727480238748507, 0.09680748146108896]|
+-----+-----------+---------------------------------------------------------------+


In [6]:
# Run the corpus through the fitted model; the result carries an extra
# `topicDistribution` column. Print the rows at full width.
transformed = model.transform(dataset)
transformed.show(n=20, truncate=False)


+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                     |
+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.3718280662219035,0.004731039286517012,0.004731037198720411,0.004731064541657108,0.5903235632630186,0.004731099086829484,0.004730992538051158,0.00473099334717997,0.004731160704278749,0.00473098381184402]         |
|1.0  |(11,[0,1,3,4,7,10],[1.0,3.0,1.0,3.0,2.0,1.0])                  |[0.9286861998712865,0.007893531749948047,0.007893646863281253,0.00789355237659673,0.008164677068801076,0.007893662543738824,0.007893710102898665,0.007893664692561678,0.007893771452907154,0.00789358327798002]       |
|2.0  |(11,[0,1,2,5,6,8,9],[1.0,4.0,1.0,4.0,9.0,1.0,2.0])             |[0.7561399289149608,0.004112941535886246,0.004112891094110662,0.004112905720725229,0.21095683808003063,0.004112898333358668,0.004112889536434559,0.004112897679643027,0.004112949588325215,0.00411285951652493]       |
|3.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,3.0,9.0])            |[0.9671365050987096,0.0036376341897825077,0.003637616238556454,0.0036376266237594873,0.003762536128212053,0.003637609931794757,0.0036376582861276387,0.003637608027904693,0.003637588820890415,0.003637616654262329]  |
|4.0  |(11,[0,1,2,3,4,6,9,10],[3.0,1.0,1.0,9.0,3.0,2.0,1.0,3.0])      |[0.9643924346542688,0.003941292441598135,0.003941306894015766,0.0039413411349832434,0.004076685236006074,0.0039413467199025266,0.003941637259064026,0.003941321804680829,0.003941344792698896,0.003941289062781617]   |
|5.0  |(11,[0,1,3,4,5,6,7,8,9],[4.0,2.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0]) |[0.6227614435446506,0.0036378775187673675,0.003637861946497583,0.003637848767858997,0.34813566665078777,0.003637863641471235,0.00363787017839175,0.0036378615734538426,0.0036378706040802494,0.0036378355740406937]   |
|6.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,2.0,9.0])            |[0.9658201959314034,0.0037833222540161284,0.0037833110712154725,0.003783321222017469,0.0039132515202502886,0.003783304991699418,0.0037833439276022344,0.003783320791498692,0.003783280840746704,0.0037833474495500077]|
|7.0  |(11,[0,1,2,3,4,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,1.0,2.0,1.0,3.0])|[0.9611493187771615,0.004300192636019096,0.004300222491834816,0.0043002858627943525,0.004448743318060316,0.004300273683639019,0.004300249522777038,0.004300248763095088,0.004300275785197903,0.004300189159420707]    |
|8.0  |(11,[0,1,3,4,5,6,7],[4.0,4.0,3.0,4.0,2.0,1.0,3.0])             |[0.9611476216448558,0.004300441886885103,0.00430046294470608,0.004300435430590479,0.004448763856355035,0.004300468102895199,0.00430042726632079,0.0043004701795092744,0.00430051511806285,0.004300393569819682]       |
|9.0  |(11,[0,1,2,4,6,8,9,10],[2.0,8.0,2.0,3.0,2.0,2.0,7.0,2.0])      |[0.9705392416099374,0.003260982898783804,0.003260957832603433,0.0032609562082985305,0.003372987856205201,0.003260977063994761,0.003260977081013557,0.003260955579471199,0.0032609947767366543,0.003260969092955616]   |
|10.0 |(11,[0,1,2,3,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,2.0,3.0,3.0])      |[0.9628406434355251,0.004113033595220575,0.004113049069443663,0.00411310417732073,0.004254883056396133,0.004113076996472518,0.0041130305651535925,0.004113070953360243,0.004113095440641269,0.004113012710466222]     |
|11.0 |(11,[0,1,4,5,6,7,9],[4.0,1.0,4.0,5.0,1.0,3.0,1.0])             |[0.30052330492266666,0.00473114111051668,0.004731120783736787,0.0047310373869180345,0.6616278804456178,0.004731109497263788,0.004731124226328385,0.004731090245898748,0.004731138965891898,0.004731052415161245]      |
+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


In [ ]: