In [1]:
# Create the two Spark entry points used by the rest of this notebook:
# `sc` (SparkContext) and `spark` (SparkSession built on top of `sc`).
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# When this cell is re-run, stop the context left over from the previous run
# before building a new one.
try:
    sc.stop()
except NameError:
    # Fresh kernel: `sc` has not been defined yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Stay best-effort (do not abort the cell), but report the problem instead
    # of hiding it. Unlike a bare `except:`, this does not intercept
    # KeyboardInterrupt or SystemExit.
    print('Could not stop the previous SparkContext: %r' % (stop_error,))

sc = SparkContext()
spark = SparkSession(sparkContext=sc)

Example data


In [2]:
# Read the CSV with its header row and let Spark infer the column types, then
# rename the first column (which shows up as '_c0') to 'model'.
mtcars = (
    spark.read
    .csv('../../data/mtcars.csv', header=True, inferSchema=True)
    .withColumnRenamed('_c0', 'model')
)
# Preview the first five rows.
mtcars.show(5)


+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows

Select columns by name

We can simply use the select() function to select columns by name.


In [3]:
# Column names can be passed to select() as separate arguments.
mtcars.select('hp', 'disp').show(5)


+---+-----+
| hp| disp|
+---+-----+
|110|160.0|
|110|160.0|
| 93|108.0|
|110|258.0|
|175|360.0|
+---+-----+
only showing top 5 rows

Select columns by index

We can convert indices to the corresponding column names and then select the columns by name.


In [5]:
# Zero-based positions of the wanted columns within mtcars.columns.
indices = [0, 3, 4, 7]
# Translate each position into its column name, keeping the order of `indices`.
selected_columns = [mtcars.columns[position] for position in indices]
selected_columns


Out[5]:
['model', 'disp', 'hp', 'qsec']

In [6]:
# Unpack the list so each column name is its own argument to select().
mtcars.select(*selected_columns).show(5)


+-----------------+-----+---+-----+
|            model| disp| hp| qsec|
+-----------------+-----+---+-----+
|        Mazda RX4|160.0|110|16.46|
|    Mazda RX4 Wag|160.0|110|17.02|
|       Datsun 710|108.0| 93|18.61|
|   Hornet 4 Drive|258.0|110|19.44|
|Hornet Sportabout|360.0|175|17.02|
+-----------------+-----+---+-----+
only showing top 5 rows

Select columns by pattern

Example: select the columns whose names start with 'd'.


In [8]:
import re

# Keep every column whose name matches the pattern '^d', i.e. starts with 'd'.
# re.match() returns a match object (truthy) on success and None otherwise.
selected_columns = [name for name in mtcars.columns if re.match('^d', name)]
selected_columns


Out[8]:
['disp', 'drat']

In [9]:
# Show the first five rows of the columns picked by the pattern above.
mtcars.select(*selected_columns).show(5)


+-----+----+
| disp|drat|
+-----+----+
|160.0| 3.9|
|160.0| 3.9|
|108.0|3.85|
|258.0|3.08|
|360.0|3.15|
+-----+----+
only showing top 5 rows


In [ ]: