In [1]:

    
# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

Boolean column expression

Create a column expression that will return boolean values.

Example data



In [3]:

    
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
mtcars = mtcars.withColumnRenamed('_c0', 'model')
mtcars.show()









    



+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|           Merc 280|19.2|  6|167.6|123|3.92| 3.44| 18.3|  1|  0|   4|   4|
|          Merc 280C|17.8|  6|167.6|123|3.92| 3.44| 18.9|  1|  0|   4|   4|
|         Merc 450SE|16.4|  8|275.8|180|3.07| 4.07| 17.4|  0|  0|   3|   3|
|         Merc 450SL|17.3|  8|275.8|180|3.07| 3.73| 17.6|  0|  0|   3|   3|
|        Merc 450SLC|15.2|  8|275.8|180|3.07| 3.78| 18.0|  0|  0|   3|   3|
| Cadillac Fleetwood|10.4|  8|472.0|205|2.93| 5.25|17.98|  0|  0|   3|   4|
|Lincoln Continental|10.4|  8|460.0|215| 3.0|5.424|17.82|  0|  0|   3|   4|
|  Chrysler Imperial|14.7|  8|440.0|230|3.23|5.345|17.42|  0|  0|   3|   4|
|           Fiat 128|32.4|  4| 78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|
|        Honda Civic|30.4|  4| 75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|
|     Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 20 rows

`between()`: true/false if the column value is between a given range



In [4]:

    
mpg_between = mtcars.cyl.between(4,6)
mpg_between









    Out[4]:





Column<b'((cyl >= 4) AND (cyl <= 6))'>



In [5]:

    
mtcars.select(mtcars.cyl, mpg_between).show(5)









    



+---+---------------------------+
|cyl|((cyl >= 4) AND (cyl <= 6))|
+---+---------------------------+
|  6|                       true|
|  6|                       true|
|  4|                       true|
|  6|                       true|
|  8|                      false|
+---+---------------------------+
only showing top 5 rows

`contains()`: true/false if the column value contains a string



In [6]:

    
model_contains = mtcars.model.contains('Ho')
model_contains









    Out[6]:





Column<b'contains(model, Ho)'>



In [7]:

    
mtcars.select(mtcars.model, model_contains).show(5)









    



+-----------------+-------------------+
|            model|contains(model, Ho)|
+-----------------+-------------------+
|        Mazda RX4|              false|
|    Mazda RX4 Wag|              false|
|       Datsun 710|              false|
|   Hornet 4 Drive|               true|
|Hornet Sportabout|               true|
+-----------------+-------------------+
only showing top 5 rows

`endswith()`: true/false if the column value ends with a string



In [8]:

    
model_endswith = mtcars.model.endswith('t')
model_endswith









    Out[8]:





Column<b'endswith(model, t)'>



In [9]:

    
mtcars.select(mtcars.model, model_endswith).show(6)









    



+-----------------+------------------+
|            model|endswith(model, t)|
+-----------------+------------------+
|        Mazda RX4|             false|
|    Mazda RX4 Wag|             false|
|       Datsun 710|             false|
|   Hornet 4 Drive|             false|
|Hornet Sportabout|              true|
|          Valiant|              true|
+-----------------+------------------+
only showing top 6 rows

`isNotNull()`: true/false if the column value is not Null



In [10]:

    
from pyspark.sql import Row
df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
df.show()









    



+------+-----+
|height| name|
+------+-----+
|    80|  Tom|
|  null|Alice|
+------+-----+



In [11]:

    
height_isNotNull = df.height.isNotNull()
height_isNotNull









    Out[11]:





Column<b'(height IS NOT NULL)'>



In [12]:

    
df.select(df.height, height_isNotNull).show()









    



+------+--------------------+
|height|(height IS NOT NULL)|
+------+--------------------+
|    80|                true|
|  null|               false|
+------+--------------------+

`isNull()`: true/false if the column value is Null



In [13]:

    
height_isNull = df.height.isNull()
height_isNull









    Out[13]:





Column<b'(height IS NULL)'>



In [14]:

    
df.select(df.height, height_isNull).show()









    



+------+----------------+
|height|(height IS NULL)|
+------+----------------+
|    80|           false|
|  null|            true|
+------+----------------+

`isin()`: true/false if the column value is contained by the evaluated argument



In [15]:

    
carb_isin = mtcars.carb.isin([2, 3])
carb_isin









    Out[15]:





Column<b'(carb IN (2, 3))'>



In [16]:

    
mtcars.select(mtcars.carb, carb_isin).show(10)









    



+----+----------------+
|carb|(carb IN (2, 3))|
+----+----------------+
|   4|           false|
|   4|           false|
|   1|           false|
|   1|           false|
|   2|            true|
|   1|           false|
|   4|           false|
|   2|            true|
|   2|            true|
|   4|           false|
+----+----------------+
only showing top 10 rows

`like()`: true/false if the column value matches a pattern based on a SQL LIKE



In [17]:

    
model_like = mtcars.model.like('Ho%')
model_like









    Out[17]:





Column<b'model LIKE Ho%'>



In [18]:

    
mtcars.select(mtcars.model, model_like).show(10)









    



+-----------------+--------------+
|            model|model LIKE Ho%|
+-----------------+--------------+
|        Mazda RX4|         false|
|    Mazda RX4 Wag|         false|
|       Datsun 710|         false|
|   Hornet 4 Drive|          true|
|Hornet Sportabout|          true|
|          Valiant|         false|
|       Duster 360|         false|
|        Merc 240D|         false|
|         Merc 230|         false|
|         Merc 280|         false|
+-----------------+--------------+
only showing top 10 rows

`rlike()`: true/false if the column value matches a pattern based on a SQL RLIKE (LIKE with Regex)



In [19]:

    
model_rlike = mtcars.model.rlike('t$')
model_rlike









    Out[19]:





Column<b'model RLIKE t$'>



In [20]:

    
mtcars.select(mtcars.model, model_rlike).show()









    



+-------------------+--------------+
|              model|model RLIKE t$|
+-------------------+--------------+
|          Mazda RX4|         false|
|      Mazda RX4 Wag|         false|
|         Datsun 710|         false|
|     Hornet 4 Drive|         false|
|  Hornet Sportabout|          true|
|            Valiant|          true|
|         Duster 360|         false|
|          Merc 240D|         false|
|           Merc 230|         false|
|           Merc 280|         false|
|          Merc 280C|         false|
|         Merc 450SE|         false|
|         Merc 450SL|         false|
|        Merc 450SLC|         false|
| Cadillac Fleetwood|         false|
|Lincoln Continental|         false|
|  Chrysler Imperial|         false|
|           Fiat 128|         false|
|        Honda Civic|         false|
|     Toyota Corolla|         false|
+-------------------+--------------+
only showing top 20 rows

`startswith()`: true/false if the column value starts with a string



In [21]:

    
model_startswith = mtcars.model.startswith('Merc')
model_startswith









    Out[21]:





Column<b'startswith(model, Merc)'>



In [22]:

    
mtcars.select(mtcars.model, model_startswith).show()









    



+-------------------+-----------------------+
|              model|startswith(model, Merc)|
+-------------------+-----------------------+
|          Mazda RX4|                  false|
|      Mazda RX4 Wag|                  false|
|         Datsun 710|                  false|
|     Hornet 4 Drive|                  false|
|  Hornet Sportabout|                  false|
|            Valiant|                  false|
|         Duster 360|                  false|
|          Merc 240D|                   true|
|           Merc 230|                   true|
|           Merc 280|                   true|
|          Merc 280C|                   true|
|         Merc 450SE|                   true|
|         Merc 450SL|                   true|
|        Merc 450SLC|                   true|
| Cadillac Fleetwood|                  false|
|Lincoln Continental|                  false|
|  Chrysler Imperial|                  false|
|           Fiat 128|                  false|
|        Honda Civic|                  false|
|     Toyota Corolla|                  false|
+-------------------+-----------------------+
only showing top 20 rows



In [ ]:

Boolean column expression

Example data

between(): true/false if the column value is between a given range

contains(): true/false if the column value contains a string

endswith(): true/false if the column value ends with a string

isNotNull(): true/false if the column value is not Null

isNull(): true/false if the column value is Null

isin(): true/false if the column value is contained by the evaluated argument

like(): true/false if the column value matches a pattern based on a SQL LIKE

rlike(): true/false if the column value matches a pattern based on a SQL RLIKE (LIKE with Regex)

startswith(): true/false if the column value starts with a string

`between()`: true/false if the column value is between a given range

`contains()`: true/false if the column value contains a string

`endswith()`: true/false if the column value ends with a string

`isNotNull()`: true/false if the column value is not Null

`isNull()`: true/false if the column value is Null

`isin()`: true/false if the column value is contained by the evaluated argument

`like()`: true/false if the column value matches a pattern based on a SQL LIKE

`rlike()`: true/false if the column value matches a pattern based on a SQL RLIKE (LIKE with Regex)

`startswith()`: true/false if the column value starts with a string