References:
In [1]:
from pyspark.sql import SparkSession
In [2]:
# Build (or reuse) the notebook's SparkSession: named for this demo,
# attached to the HELK standalone master, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .master("spark://helk-spark-master:7077")
    .enableHiveSupport()
    .getOrCreate()
)
In [3]:
# Bare expression: Jupyter displays the SparkSession object's rich repr (see Out[3]).
spark
Out[3]:
Let's create our first DataFrame by using the range and toDF functions.
range(start, end=None, step=1, numPartitions=None)
In [4]:
# Build a one-column DataFrame of the integers 0..9, then rename
# the column to "numbers" via toDF.
number_range = spark.range(10)
first_df = number_range.toDF("numbers")
In [5]:
# Render the DataFrame's rows as a text table in the cell output.
first_df.show()
createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True)
In [6]:
# Sample data: one [name, breed, age] record per dog.
# Fix: corrected the misspelled breed 'Austrailian' -> 'Australian'.
dog_data = [
    ['Pedro', 'Doberman', 3],
    ['Clementine', 'Golden Retriever', 8],
    ['Norah', 'Great Dane', 6],
    ['Mabel', 'Australian Shepherd', 1],
    ['Bear', 'Maltese', 4],
    ['Bill', 'Great Dane', 10],
]
# Turn the records into a DataFrame, naming the three columns explicitly.
dog_columns = ['name', 'breed', 'age']
dog_df = spark.createDataFrame(dog_data, dog_columns)
In [7]:
# Render the dog records as a text table in the cell output.
dog_df.show()
In [8]:
# Bare expression: the DataFrame's schema object, displayed by Jupyter (see Out[8]).
dog_df.schema
Out[8]:
In [9]:
# Print the schema in tree form to stdout (human-readable alternative to .schema).
dog_df.printSchema()
Access the DataFrame's columns by attribute (df.name):
In [10]:
# Attribute-style column access (dog_df.name), matching the markdown above;
# the original cell used the string form "name", which contradicted the text.
# Output is identical either way: a one-column table of dog names.
dog_df.select(dog_df.name).show()
Access the DataFrame's columns by indexing (df['name']).
In [11]:
# Index-style column access: grab the Column object first, then select it.
name_column = dog_df["name"]
dog_df.select(name_column).show()
Select dogs that are older than 4 years
In [12]:
# where() is an alias of filter(): keep only dogs older than 4 years.
older_dogs = dog_df.where(dog_df["age"] > 4)
older_dogs.show()
Group dogs and count them by their age.
In [13]:
# Count how many dogs share each age value.
dogs_per_age = dog_df.groupBy(dog_df["age"]).count()
dogs_per_age.show()
Register the current Dataframe as a SQL temporary view
In [14]:
# Expose dog_df to Spark SQL under the view name "dogs",
# then read every row back with a plain SQL statement.
dog_df.createOrReplaceTempView("dogs")
all_dogs_query = "SELECT * FROM dogs"
sql_dog_df = spark.sql(all_dogs_query)
sql_dog_df.show()
In [15]:
# Same filter as the DataFrame API example, expressed in SQL: Pedro's row only.
pedro_query = "SELECT * FROM dogs WHERE name='Pedro'"
sql_dog_df = spark.sql(pedro_query)
sql_dog_df.show()