In [1]:
# create entry points to Spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext=sc)
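Note that in recent PySpark versions the recommended entry point is SparkSession.builder; a minimal equivalent sketch, binding the same names:
In [ ]:
# alternative setup: build a SparkSession directly and take the
# SparkContext from it (the usual pattern in recent PySpark versions)
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext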
pyspark.sql.functions
pyspark.sql.functions is a collection of built-in functions for creating column expressions. These functions greatly extend the methods we can use to manipulate DataFrames and DataFrame columns. The module contains many SQL functions; here I choose only a few to show how they extend the ability to create column expressions.
In [2]:
from pyspark.sql import functions as F
In [3]:
from pyspark.sql import Row
df = sc.parallelize([Row(x=1), Row(x=-1), Row(x=-2)]).toDF()
df.show()
In [4]:
x_abs = F.abs(df.x)
x_abs
Out[4]:
In [5]:
df.select(df.x, x_abs).show()
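F.abs returns a Column expression; nothing is computed until the expression is used in a DataFrame method such as select(). The derived column can be renamed with the Column method alias(); a small sketch using the same df:
In [ ]:
# give the derived column a friendlier name with alias()
df.select(df.x, F.abs(df.x).alias('x_abs')).show()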
In [6]:
df = sc.parallelize([Row(a='apple', b='tree'), Row(a='orange', b='flowers')]).toDF()
df.show()
In [7]:
ab_concat = F.concat(df.a, df.b)
ab_concat
Out[7]:
In [8]:
df.select(df.a, df.b, ab_concat).show()
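F.concat joins the strings with no separator. When a separator is needed, the module also provides concat_ws(sep, *cols); a brief sketch with the same df:
In [ ]:
# concat_ws() inserts a separator between the concatenated columns
df.select(F.concat_ws('-', df.a, df.b).alias('ab')).show()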
In [9]:
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
mtcars.show(5)
In [10]:
drat_wt_corr = F.corr(mtcars.drat, mtcars.wt)
drat_wt_corr
Out[10]:
In [11]:
mtcars.select(drat_wt_corr).show()
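F.corr is an aggregate function (it computes the Pearson correlation over all rows), so it can be combined with other aggregates in the same select(); a minimal sketch, pairing it with F.count (alias names are mine):
In [ ]:
# aggregate expressions can be combined in one select()
mtcars.select(F.corr(mtcars.drat, mtcars.wt).alias('corr'),
              F.count(mtcars.wt).alias('n')).show()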
In [12]:
# build Column expressions for every column except the first (the model name);
# bracket indexing avoids the need for eval()
cols = [mtcars[col] for col in mtcars.columns[1:]]
cols
Out[12]:
In [13]:
cols_array = F.array(cols)
cols_array
Out[13]:
In [14]:
mtcars.select(cols_array).show(truncate=False)
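The resulting array column works with the array functions in the same module; for example, F.size returns the number of elements. A quick sketch (the alias names are mine):
In [ ]:
# array columns can be passed to array functions such as F.size()
mtcars.select(cols_array.alias('values'),
              F.size(cols_array).alias('n')).show(5, truncate=False)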