In [1]:
# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

Boolean column expression

Create a column expression that will return boolean values.

Example data


In [3]:
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
mtcars = mtcars.withColumnRenamed('_c0', 'model')
mtcars.show()


+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|           Merc 280|19.2|  6|167.6|123|3.92| 3.44| 18.3|  1|  0|   4|   4|
|          Merc 280C|17.8|  6|167.6|123|3.92| 3.44| 18.9|  1|  0|   4|   4|
|         Merc 450SE|16.4|  8|275.8|180|3.07| 4.07| 17.4|  0|  0|   3|   3|
|         Merc 450SL|17.3|  8|275.8|180|3.07| 3.73| 17.6|  0|  0|   3|   3|
|        Merc 450SLC|15.2|  8|275.8|180|3.07| 3.78| 18.0|  0|  0|   3|   3|
| Cadillac Fleetwood|10.4|  8|472.0|205|2.93| 5.25|17.98|  0|  0|   3|   4|
|Lincoln Continental|10.4|  8|460.0|215| 3.0|5.424|17.82|  0|  0|   3|   4|
|  Chrysler Imperial|14.7|  8|440.0|230|3.23|5.345|17.42|  0|  0|   3|   4|
|           Fiat 128|32.4|  4| 78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|
|        Honda Civic|30.4|  4| 75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|
|     Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 20 rows

between(): true/false if the column value is between a given range


In [4]:
mpg_between = mtcars.cyl.between(4,6)
mpg_between


Out[4]:
Column<b'((cyl >= 4) AND (cyl <= 6))'>

In [5]:
mtcars.select(mtcars.cyl, mpg_between).show(5)


+---+---------------------------+
|cyl|((cyl >= 4) AND (cyl <= 6))|
+---+---------------------------+
|  6|                       true|
|  6|                       true|
|  4|                       true|
|  6|                       true|
|  8|                      false|
+---+---------------------------+
only showing top 5 rows

contains(): true/false if the column value contains a string


In [6]:
model_contains = mtcars.model.contains('Ho')
model_contains


Out[6]:
Column<b'contains(model, Ho)'>

In [7]:
mtcars.select(mtcars.model, model_contains).show(5)


+-----------------+-------------------+
|            model|contains(model, Ho)|
+-----------------+-------------------+
|        Mazda RX4|              false|
|    Mazda RX4 Wag|              false|
|       Datsun 710|              false|
|   Hornet 4 Drive|               true|
|Hornet Sportabout|               true|
+-----------------+-------------------+
only showing top 5 rows

endswith(): true/false if the column value ends with a string


In [8]:
model_endswith = mtcars.model.endswith('t')
model_endswith


Out[8]:
Column<b'endswith(model, t)'>

In [9]:
mtcars.select(mtcars.model, model_endswith).show(6)


+-----------------+------------------+
|            model|endswith(model, t)|
+-----------------+------------------+
|        Mazda RX4|             false|
|    Mazda RX4 Wag|             false|
|       Datsun 710|             false|
|   Hornet 4 Drive|             false|
|Hornet Sportabout|              true|
|          Valiant|              true|
+-----------------+------------------+
only showing top 6 rows

isNotNull(): true/false if the column value is not Null


In [10]:
from pyspark.sql import Row
df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
df.show()


+------+-----+
|height| name|
+------+-----+
|    80|  Tom|
|  null|Alice|
+------+-----+


In [11]:
height_isNotNull = df.height.isNotNull()
height_isNotNull


Out[11]:
Column<b'(height IS NOT NULL)'>

In [12]:
df.select(df.height, height_isNotNull).show()


+------+--------------------+
|height|(height IS NOT NULL)|
+------+--------------------+
|    80|                true|
|  null|               false|
+------+--------------------+

isNull(): true/false if the column value is Null


In [13]:
height_isNull = df.height.isNull()
height_isNull


Out[13]:
Column<b'(height IS NULL)'>

In [14]:
df.select(df.height, height_isNull).show()


+------+----------------+
|height|(height IS NULL)|
+------+----------------+
|    80|           false|
|  null|            true|
+------+----------------+

isin(): true/false if the column value is contained by the evaluated argument


In [15]:
carb_isin = mtcars.carb.isin([2, 3])
carb_isin


Out[15]:
Column<b'(carb IN (2, 3))'>

In [16]:
mtcars.select(mtcars.carb, carb_isin).show(10)


+----+----------------+
|carb|(carb IN (2, 3))|
+----+----------------+
|   4|           false|
|   4|           false|
|   1|           false|
|   1|           false|
|   2|            true|
|   1|           false|
|   4|           false|
|   2|            true|
|   2|            true|
|   4|           false|
+----+----------------+
only showing top 10 rows

like(): true/false if the column value matches a pattern based on a SQL LIKE


In [17]:
model_like = mtcars.model.like('Ho%')
model_like


Out[17]:
Column<b'model LIKE Ho%'>

In [18]:
mtcars.select(mtcars.model, model_like).show(10)


+-----------------+--------------+
|            model|model LIKE Ho%|
+-----------------+--------------+
|        Mazda RX4|         false|
|    Mazda RX4 Wag|         false|
|       Datsun 710|         false|
|   Hornet 4 Drive|          true|
|Hornet Sportabout|          true|
|          Valiant|         false|
|       Duster 360|         false|
|        Merc 240D|         false|
|         Merc 230|         false|
|         Merc 280|         false|
+-----------------+--------------+
only showing top 10 rows

rlike(): true/false if the column value matches a pattern based on a SQL RLIKE (LIKE with Regex)


In [19]:
model_rlike = mtcars.model.rlike('t$')
model_rlike


Out[19]:
Column<b'model RLIKE t$'>

In [20]:
mtcars.select(mtcars.model, model_rlike).show()


+-------------------+--------------+
|              model|model RLIKE t$|
+-------------------+--------------+
|          Mazda RX4|         false|
|      Mazda RX4 Wag|         false|
|         Datsun 710|         false|
|     Hornet 4 Drive|         false|
|  Hornet Sportabout|          true|
|            Valiant|          true|
|         Duster 360|         false|
|          Merc 240D|         false|
|           Merc 230|         false|
|           Merc 280|         false|
|          Merc 280C|         false|
|         Merc 450SE|         false|
|         Merc 450SL|         false|
|        Merc 450SLC|         false|
| Cadillac Fleetwood|         false|
|Lincoln Continental|         false|
|  Chrysler Imperial|         false|
|           Fiat 128|         false|
|        Honda Civic|         false|
|     Toyota Corolla|         false|
+-------------------+--------------+
only showing top 20 rows

startswith(): true/false if the column value starts with a string


In [21]:
model_startswith = mtcars.model.startswith('Merc')
model_startswith


Out[21]:
Column<b'startswith(model, Merc)'>

In [22]:
mtcars.select(mtcars.model, model_startswith).show()


+-------------------+-----------------------+
|              model|startswith(model, Merc)|
+-------------------+-----------------------+
|          Mazda RX4|                  false|
|      Mazda RX4 Wag|                  false|
|         Datsun 710|                  false|
|     Hornet 4 Drive|                  false|
|  Hornet Sportabout|                  false|
|            Valiant|                  false|
|         Duster 360|                  false|
|          Merc 240D|                   true|
|           Merc 230|                   true|
|           Merc 280|                   true|
|          Merc 280C|                   true|
|         Merc 450SE|                   true|
|         Merc 450SL|                   true|
|        Merc 450SLC|                   true|
| Cadillac Fleetwood|                  false|
|Lincoln Continental|                  false|
|  Chrysler Imperial|                  false|
|           Fiat 128|                  false|
|        Honda Civic|                  false|
|     Toyota Corolla|                  false|
+-------------------+-----------------------+
only showing top 20 rows


In [ ]: