notebook.community

Edit and run



In [3]:

    
from pyspark import SparkContext, SparkConf, SQLContext, HiveContext, StorageLevel
from pyspark.sql.functions import *
sc = SparkContext()
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

		
# # Save and load model
# model.save(sc, "myModelPath")
# sameModel = LDAModel.load(sc, "myModelPath")









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-968a8d35229a> in <module>()
      1 from pyspark import SparkContext, SparkConf, SQLContext, HiveContext, StorageLevel
      2 from pyspark.sql.functions import *
----> 3 sc = SparkContext()
      4 from pyspark.mllib.clustering import LDA, LDAModel
      5 from pyspark.mllib.linalg import Vectors

/home/cs598rk/spark/python/pyspark/context.pyc in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    108         """
    109         self._callsite = first_spark_call() or CallSite(None, None, None)
--> 110         SparkContext._ensure_initialized(self, gateway=gateway)
    111         try:
    112             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

/home/cs598rk/spark/python/pyspark/context.pyc in _ensure_initialized(cls, instance, gateway)
    248                         " created by %s at %s:%s "
    249                         % (currentAppName, currentMaster,
--> 250                             callsite.function, callsite.file, callsite.linenum))
    251                 else:
    252                     SparkContext._active_spark_context = instance

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-1-968a8d35229a>:3



In [4]:

    
# Load and parse the data
data = sc.textFile("../../spark/data/mllib/sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))









    



Learned topics (as distributions over vocab of 11 words):
Topic 0:
 7.35254110036
 9.28016041683
 4.65280663595
 7.22676116134
 8.17626996187
 4.37199631072
 8.14477545339
 2.99763345248
 3.67211100451
 13.4292992994
 16.820468316
Topic 1:
 11.3984782036
 14.9260731769
 2.68158493025
 1.77198278252
 10.8681628516
 12.5549821443
 12.9924113112
 3.89711417818
 3.15726394821
 6.9966801128
 6.40833491169
Topic 2:
 7.24898069599
 4.79376640623
 4.6656084338
 31.0012560561
 5.95556718649
 5.07302154494
 9.86281323536
 3.10525236934
 1.17062504728
 3.57402058777
 9.77119677226



In [2]:

    
parsedData.collect()









    Out[2]:





[DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0]),
 DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0]),
 DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0]),
 DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 3.0, 9.0]),
 DenseVector([3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, 1.0, 3.0]),
 DenseVector([4.0, 2.0, 0.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, 4.0, 0.0]),
 DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 2.0, 9.0]),
 DenseVector([1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0, 3.0]),
 DenseVector([4.0, 4.0, 0.0, 3.0, 4.0, 2.0, 1.0, 3.0, 0.0, 0.0, 0.0]),
 DenseVector([2.0, 8.0, 2.0, 0.0, 3.0, 0.0, 2.0, 0.0, 2.0, 7.0, 2.0]),
 DenseVector([1.0, 1.0, 1.0, 9.0, 0.0, 2.0, 2.0, 0.0, 0.0, 3.0, 3.0]),
 DenseVector([4.0, 1.0, 0.0, 0.0, 4.0, 5.0, 1.0, 3.0, 0.0, 1.0, 0.0])]



In [ ]: