In [7]:
# Initialize Spark in Python
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
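In [ ]:
# A minimal sanity-check sketch: getOrCreate() returns the already-running
# session if one exists, and spark.version reports the Spark version.
spark2 = SparkSession.builder.getOrCreate()
print(spark2 is spark)  # True -- the existing session is reused
print(spark.version)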
In [9]:
# spark is an existing SparkSession
df = spark.read.json("/usr/apache/spark-2.0.2-bin-hadoop2.7/examples/src/main/resources/people.json")
# Displays the content of the DataFrame to stdout
df.show()
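In [ ]:
# Optional show() parameters (a sketch -- both are part of DataFrame.show):
df.show(n=2)             # print only the first 2 rows
df.show(truncate=False)  # don't truncate long cell values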
In [10]:
# spark, df are from the previous example
# Print the schema in a tree format
df.printSchema()
# Select only the "name" column
df.select("name").show()
# Select everybody, but increment the age by 1
df.select(df['name'], df['age'] + 1).show()
# Select people older than 21
df.filter(df['age'] > 21).show()
# Count people by age
df.groupBy("age").count().show()
In [11]:
# Register the DataFrame as a SQL temporary view
df.createOrReplaceTempView("people")
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()
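In [ ]:
# spark.sql() returns an ordinary DataFrame, so SQL over the temp view
# and the DataFrame API can be mixed freely (a sketch):
spark.sql("SELECT name, age FROM people WHERE age IS NOT NULL") \
    .orderBy("age") \
    .show()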
In [13]:
from pyspark.sql import Row
# Load a text file and convert each line to a Row.
sc = spark.sparkContext  # no SparkContext named sc was created above
lines = sc.textFile("/usr/apache/spark-2.0.2-bin-hadoop2.7/examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")
# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
# Name: Justin
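In [ ]:
# Alternative to inferring the schema from Row objects: declare it
# programmatically with StructType. A minimal sketch assuming the same
# two-column people.txt; the view name "people_explicit" is made up here.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
rowRDD = parts.map(lambda p: (p[0], int(p[1])))
peopleDF = spark.createDataFrame(rowRDD, schema)
peopleDF.createOrReplaceTempView("people_explicit")
spark.sql("SELECT name, age FROM people_explicit").show()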