In [1]:
# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

pyspark.sql.functions functions

pyspark.sql.functions is collection of built-in functions for creating column expressions. These functions largely increase methods that we can use to manipulate DataFrame and DataFrame columns.

There are many sql functions from the pyspark.sql.functions module. Here I only choose a few to show how these functions extend the ability to create column expressions.


In [2]:
from pyspark.sql import functions as F

abs(): create column expression that returns absolute values of a column


In [3]:
from pyspark.sql import Row
df = sc.parallelize([Row(x=1), Row(x=-1), Row(x=-2)]).toDF()
df.show()


+---+
|  x|
+---+
|  1|
| -1|
| -2|
+---+


In [4]:
x_abs = F.abs(df.x)
x_abs


Out[4]:
Column<b'abs(x)'>

In [5]:
df.select(df.x, x_abs).show()


+---+------+
|  x|abs(x)|
+---+------+
|  1|     1|
| -1|     1|
| -2|     2|
+---+------+

concat(): create column expression that concatenates multiple column values into one


In [6]:
df = sc.parallelize([Row(a='apple', b='tree'), Row(a='orange', b='flowers')]).toDF()
df.show()


+------+-------+
|     a|      b|
+------+-------+
| apple|   tree|
|orange|flowers|
+------+-------+


In [7]:
ab_concat = F.concat(df.a, df.b)
ab_concat


Out[7]:
Column<b'concat(a, b)'>

In [8]:
df.select(df.a, df.b, ab_concat).show()


+------+-------+-------------+
|     a|      b| concat(a, b)|
+------+-------+-------------+
| apple|   tree|    appletree|
|orange|flowers|orangeflowers|
+------+-------+-------------+

corr(): create column expression that returns pearson correlation coefficient between two columns


In [9]:
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
mtcars.show(5)


+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              _c0| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows


In [10]:
drat_wt_corr = F.corr(mtcars.drat, mtcars.wt)
drat_wt_corr


Out[10]:
Column<b'corr(drat, wt)'>

In [11]:
mtcars.select(drat_wt_corr).show()


+-------------------+
|     corr(drat, wt)|
+-------------------+
|-0.7124406466973717|
+-------------------+

array(): create column expression that merge multiple column values into an array

This function can be used to build feature column in machine learning models.


In [12]:
cols = [eval('mtcars.' + col) for col in mtcars.columns[1:]]
cols


Out[12]:
[Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>]

In [13]:
cols_array = F.array(cols)
cols_array


Out[13]:
Column<b'array(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb)'>

In [14]:
mtcars.select(cols_array).show(truncate=False)


+-----------------------------------------------------------------+
|array(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb)    |
+-----------------------------------------------------------------+
|[21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]  |
|[21.0, 6.0, 160.0, 110.0, 3.9, 2.875, 17.02, 0.0, 1.0, 4.0, 4.0] |
|[22.8, 4.0, 108.0, 93.0, 3.85, 2.32, 18.61, 1.0, 1.0, 4.0, 1.0]  |
|[21.4, 6.0, 258.0, 110.0, 3.08, 3.215, 19.44, 1.0, 0.0, 3.0, 1.0]|
|[18.7, 8.0, 360.0, 175.0, 3.15, 3.44, 17.02, 0.0, 0.0, 3.0, 2.0] |
|[18.1, 6.0, 225.0, 105.0, 2.76, 3.46, 20.22, 1.0, 0.0, 3.0, 1.0] |
|[14.3, 8.0, 360.0, 245.0, 3.21, 3.57, 15.84, 0.0, 0.0, 3.0, 4.0] |
|[24.4, 4.0, 146.7, 62.0, 3.69, 3.19, 20.0, 1.0, 0.0, 4.0, 2.0]   |
|[22.8, 4.0, 140.8, 95.0, 3.92, 3.15, 22.9, 1.0, 0.0, 4.0, 2.0]   |
|[19.2, 6.0, 167.6, 123.0, 3.92, 3.44, 18.3, 1.0, 0.0, 4.0, 4.0]  |
|[17.8, 6.0, 167.6, 123.0, 3.92, 3.44, 18.9, 1.0, 0.0, 4.0, 4.0]  |
|[16.4, 8.0, 275.8, 180.0, 3.07, 4.07, 17.4, 0.0, 0.0, 3.0, 3.0]  |
|[17.3, 8.0, 275.8, 180.0, 3.07, 3.73, 17.6, 0.0, 0.0, 3.0, 3.0]  |
|[15.2, 8.0, 275.8, 180.0, 3.07, 3.78, 18.0, 0.0, 0.0, 3.0, 3.0]  |
|[10.4, 8.0, 472.0, 205.0, 2.93, 5.25, 17.98, 0.0, 0.0, 3.0, 4.0] |
|[10.4, 8.0, 460.0, 215.0, 3.0, 5.424, 17.82, 0.0, 0.0, 3.0, 4.0] |
|[14.7, 8.0, 440.0, 230.0, 3.23, 5.345, 17.42, 0.0, 0.0, 3.0, 4.0]|
|[32.4, 4.0, 78.7, 66.0, 4.08, 2.2, 19.47, 1.0, 1.0, 4.0, 1.0]    |
|[30.4, 4.0, 75.7, 52.0, 4.93, 1.615, 18.52, 1.0, 1.0, 4.0, 2.0]  |
|[33.9, 4.0, 71.1, 65.0, 4.22, 1.835, 19.9, 1.0, 1.0, 4.0, 1.0]   |
+-----------------------------------------------------------------+
only showing top 20 rows


In [ ]: