In [1]:
# create entry points to Spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext=sc)
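Note that in recent PySpark versions the recommended entry point is SparkSession.builder; a minimal equivalent sketch, binding the same names:
In [ ]:
# alternative setup: build a SparkSession directly and take the
# SparkContext from it (the usual pattern in recent PySpark versions)
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext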
pyspark.sql.functions
pyspark.sql.functions is a collection of built-in functions for creating column expressions. These functions greatly extend the methods we can use to manipulate DataFrames and DataFrame columns. The module contains many SQL functions; here I choose only a few to show how they extend the ability to create column expressions.
In [2]:
from pyspark.sql import functions as F
In [3]:
from pyspark.sql import Row
df = sc.parallelize([Row(x=1), Row(x=-1), Row(x=-2)]).toDF()
df.show()
In [4]:
x_abs = F.abs(df.x)
x_abs
Out[4]:
In [5]:
df.select(df.x, x_abs).show()
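F.abs returns a Column expression; nothing is computed until the expression is used in a DataFrame method such as select(). The derived column can be renamed with the Column method alias(); a small sketch using the same df:
In [ ]:
# give the derived column a friendlier name with alias()
df.select(df.x, F.abs(df.x).alias('x_abs')).show()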
In [6]:
df = sc.parallelize([Row(a='apple', b='tree'), Row(a='orange', b='flowers')]).toDF()
df.show()
In [7]:
ab_concat = F.concat(df.a, df.b)
ab_concat
Out[7]:
In [8]:
df.select(df.a, df.b, ab_concat).show()
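F.concat joins the strings with no separator. When a separator is needed, the module also provides concat_ws(sep, *cols); a brief sketch with the same df:
In [ ]:
# concat_ws() inserts a separator between the concatenated columns
df.select(F.concat_ws('-', df.a, df.b).alias('ab')).show()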
In [9]:
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
mtcars.show(5)
In [10]:
drat_wt_corr = F.corr(mtcars.drat, mtcars.wt)
drat_wt_corr
Out[10]:
In [11]:
mtcars.select(drat_wt_corr).show()
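F.corr is an aggregate function (it computes the Pearson correlation over all rows), so it can be combined with other aggregates in the same select(); a minimal sketch, pairing it with F.count (alias names are mine):
In [ ]:
# aggregate expressions can be combined in one select()
mtcars.select(F.corr(mtcars.drat, mtcars.wt).alias('corr'),
              F.count(mtcars.wt).alias('n')).show()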
In [12]:
# build Column expressions for every column except the first (the model name);
# bracket indexing avoids the need for eval()
cols = [mtcars[col] for col in mtcars.columns[1:]]
cols
Out[12]:
In [13]:
cols_array = F.array(cols)
cols_array
Out[13]:
In [14]:
mtcars.select(cols_array).show(truncate=False)
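The resulting array column works with the array functions in the same module; for example, F.size returns the number of elements. A quick sketch (the alias names are mine):
In [ ]:
# array columns can be passed to array functions such as F.size()
mtcars.select(cols_array.alias('values'),
              F.size(cols_array).alias('n')).show(5, truncate=False)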