In [8]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
# sc = SparkContext()  # not needed here: sc is already provided by the notebook/pyspark shell

# Load and parse the data
data = sc.textFile("../../spark/data/mllib/sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each topic is a column of per-word weights (matching word count vectors);
# note the weights are expected counts, not normalized to sum to 1
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))
		
# Save and load the model (the trained model is `ldaModel`)
# ldaModel.save(sc, "myModelPath")
# sameModel = LDAModel.load(sc, "myModelPath")


Learned topics (as distributions over vocab of 11 words):
Topic 0:
 8.2921404395
 11.9736546024
 2.33726928792
 8.15114161014
 8.17343541426
 6.24544307177
 11.9208304555
 2.41886400743
 3.96823064233
 9.53410668217
 12.4030216644
Topic 1:
 6.84150849558
 4.29742401585
 5.418799435
 26.9808508797
 4.60641542847
 3.32889797518
 10.6684822769
 1.39870102754
 1.94314639999
 4.66503073113
 17.298177637
Topic 2:
 10.8663510649
 12.7289213818
 4.24393127708
 4.86800751015
 12.2201491573
 12.4256589531
 8.41068726762
 6.18243496503
 2.08862295767
 9.8008625867
 3.29880069856
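The raw weights above are hard to scan. From Spark 1.5 on, pyspark.mllib's LDAModel also exposes describeTopics, which returns each topic's top-weighted term indices and weights directly; a minimal sketch (maxTermsPerTopic=5 is an arbitrary choice):

# Sketch: summarize each topic by its 5 highest-weighted terms (Spark >= 1.5)
for i, (terms, weights) in enumerate(ldaModel.describeTopics(maxTermsPerTopic=5)):
    print("Topic " + str(i) + ": terms=" + str(terms) + ", weights=" + str(weights))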

In [14]:
topics


Out[14]:
array([[  8.29214044,   6.8415085 ,  10.86635106],
       [ 11.9736546 ,   4.29742402,  12.72892138],
       [  2.33726929,   5.41879943,   4.24393128],
       [  8.15114161,  26.98085088,   4.86800751],
       [  8.17343541,   4.60641543,  12.22014916],
       [  6.24544307,   3.32889798,  12.42565895],
       [ 11.92083046,  10.66848228,   8.41068727],
       [  2.41886401,   1.39870103,   6.18243497],
       [  3.96823064,   1.9431464 ,   2.08862296],
       [  9.53410668,   4.66503073,   9.80086259],
       [ 12.40302166,  17.29817764,   3.2988007 ]])
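Note the shape: topicsMatrix() returns a vocabSize x k numpy array (here 11 x 3) whose columns are unnormalized expected word counts per topic, not probabilities. To compare topics as proper distributions, normalize each column; a quick sketch on the array shown above:

# Sketch: convert the expected counts into per-topic word distributions
topic_dists = topics / topics.sum(axis=0)   # divide each column by its total
print(topic_dists.sum(axis=0))              # sanity check: each column sums to ~1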

In [9]:
data.take(3)


Out[9]:
[u'1 2 6 0 2 3 1 1 0 0 3', u'1 3 0 1 3 0 0 2 0 0 1', u'1 4 1 0 0 4 9 0 1 2 0']
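Each line of sample_lda_data.txt is already a space-separated term-count vector over an 11-word vocabulary. For raw text you would build such vectors yourself; a hypothetical sketch, assuming a fixed toy vocabulary (the vocab list and input path below are made up):

# Hypothetical preprocessing: raw text -> term-count vectors over a fixed vocab
vocab = ["spark", "lda", "topic", "model"]           # assumed toy vocabulary
word_index = {w: i for i, w in enumerate(vocab)}

def to_count_vector(doc):
    counts = [0.0] * len(vocab)
    for token in doc.lower().split():
        if token in word_index:                      # out-of-vocab tokens dropped
            counts[word_index[token]] += 1.0
    return Vectors.dense(counts)

# rawDocs = sc.textFile("raw_docs.txt")              # hypothetical input
# parsedData = rawDocs.map(to_count_vector)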

In [10]:
parsedData.take(3)


Out[10]:
[DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0]),
 DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0]),
 DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0])]

In [11]:
corpus.take(3)


Out[11]:
[[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])],
 [1, DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0])],
 [2, DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0])]]
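This is exactly the shape LDA.train expects: an RDD of [document id, term-count vector] pairs, which is why zipWithIndex was needed. The call above left everything but k at its default; a sketch with the main hyperparameters spelled out (the default values here are stated from memory of the pyspark.mllib API, so treat them as assumptions):

# Sketch: the same training call with hyperparameters made explicit
# (-1.0 lets Spark choose the Dirichlet priors automatically)
ldaModel = LDA.train(corpus,
                     k=3,                     # number of topics
                     maxIterations=20,        # EM iterations (assumed default)
                     docConcentration=-1.0,   # prior on per-doc topic mixture
                     topicConcentration=-1.0, # prior on per-topic word mixture
                     seed=42)                 # fix the seed for reproducibility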

In [12]:
ldaModel


Out[12]:
<pyspark.mllib.clustering.LDAModel at 0x7fefb331a850>