You're going to want to install the requirements to get going. I've included a requirements.txt file; install everything with pip install -r requirements.txt.
First, the imports:
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext # needed for toDF()
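Nothing below will run against a plain SparkContext; you need a CassandraSparkContext that knows where your cluster lives. A minimal sketch, assuming you're building the context yourself (rather than using the one the pyspark shell hands you) and that a Cassandra node is on localhost; the app name and host are placeholders:

conf = SparkConf() \
    .setAppName("pyspark-cassandra-demo") \
    .set("spark.cassandra.connection.host", "127.0.0.1")  # assumption: local Cassandra node
sc = CassandraSparkContext(conf=conf)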
Let's count users by their favorite food, starting with the classic RDD way:
# RDD counting example: emit a (food, 1) pair per user,
# then sum the pairs per food.
u = sc.cassandraTable("demo", "user")
u.map(lambda x: (x['favorite_food'], 1)).\
    reduceByKey(lambda x, y: x + y).collect()
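If all you want is the tally, countByValue collapses the map/reduce into one call; a sketch against the same table:

sc.cassandraTable("demo", "user").\
    map(lambda x: x['favorite_food']).\
    countByValue()  # returns a food -> count dict on the driver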
The same aggregation as a DataFrame:
users = sc.cassandraTable("demo", "user").toDF()
food_count = users.select("favorite_food").\
groupBy("favorite_food").count()
food_count.collect()
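groupBy(...).count() names its aggregate column count (in Spark 1.3+, at least), so ranking the foods by popularity is one more call; a sketch:

food_count.sort(food_count["count"].desc()).collect()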
Or with plain SQL. I alias the count so the column comes back with a predictable name, which pays off when plotting below:
sql = SQLContext(sc)
users = sc.cassandraTable("demo", "user").toDF()
users.registerTempTable("users")
sql.sql("""select favorite_food, count(favorite_food) as food_count
           from users group by favorite_food""").collect()
toPandas() pulls the result down to the driver as a pandas DataFrame, which makes charting painless:
result = sql.sql("""select favorite_food, count(favorite_food) as food_count
                    from users group by favorite_food""").toPandas()
Tell matplotlib to render plots inline:
%matplotlib inline
A quick look at the table, indexed by favorite_food (set_index returns a new DataFrame; it doesn't modify result in place):
result.set_index("favorite_food")
And the obligatory pie chart:
result.set_index("favorite_food").plot(kind="pie", y="food_count", legend=False)
(pie chart of favorite food counts)
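If the pie gets crowded, the same data works as a bar chart by swapping the kind:

result.set_index("favorite_food").plot(kind="bar", y="food_count", legend=False)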
PySpark talks to the JVM over a py4j gateway, and nothing stops you from using it yourself. Here's a real java.util.HashMap, constructed from Python:
sc._gateway.jvm.java.util.HashMap()
That HashMap can carry the options for the Spark SQL Cassandra source. Note that the connector version used here spells the table option c_table:
sql = SQLContext(sc)
options = sc._gateway.jvm.java.util.HashMap()
options["keyspace"] = "labor"
options["c_table"] = "average_price_data"  # "c_table", not "table", in this connector version
sql.load(source="org.apache.spark.sql.cassandra",
         options=options)
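For what it's worth, on Spark 1.4+ you can skip the gateway entirely and go through the DataFrame reader; a sketch, assuming a connector version that spells the option table:

sql.read.format("org.apache.spark.sql.cassandra").\
    options(keyspace="labor", table="average_price_data").\
    load()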
The gateway itself hangs off the SparkContext; grab a reference and you can poke at the JVM directly:
gw = sc._gateway
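gw.jvm exposes any class on the driver's JVM classpath, so you can call straight into Java; a toy example:

gw.jvm.java.lang.System.currentTimeMillis()  # static Java method, called from Python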