In [1]:
# create entry points to Spark; stop any existing context first
try:
    sc.stop()
except NameError:
    pass

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sparkContext=sc)
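Note that in Spark 2.x the usual entry point is the builder pattern, which reuses a running session instead of raising an error when a SparkContext already exists. A minimal sketch; the application name is illustrative:

from pyspark.sql import SparkSession

# getOrCreate() returns the existing session if one is running
spark = SparkSession.builder \
    .appName('column-selection-demo') \
    .getOrCreate()
sc = spark.sparkContext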
In [2]:
mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)
# rename the auto-generated first column '_c0' to 'model'
mtcars = mtcars.withColumnRenamed('_c0', 'model')
mtcars.show(5)
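Because inferSchema=True makes Spark guess each column's type from the data, it is worth checking what was actually inferred before selecting columns:

# print the inferred column names and types
mtcars.printSchema()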
In [3]:
mtcars.select(['hp', 'disp']).show(5)
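select() accepts Column expressions as well as plain name strings, so columns can be transformed while they are selected. A minimal sketch; the derived column and its alias hp_per_litre are illustrative (1 cubic inch ≈ 0.0164 litres):

from pyspark.sql import functions as F

mtcars.select(
    F.col('hp'),
    F.col('disp'),
    # illustrative derived column: horsepower per litre of displacement
    (F.col('hp') / (F.col('disp') * 0.0163871)).alias('hp_per_litre')
).show(5)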
In [5]:
indices = [0,3,4,7]
selected_columns = [mtcars.columns[index] for index in indices]
selected_columns
Out[5]:
['model', 'disp', 'hp', 'qsec']
In [6]:
mtcars.select(selected_columns).show(5)
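Since mtcars.columns is an ordinary Python list, list slicing works just as well as explicit indices, e.g. for a contiguous block of columns:

# select the 2nd through 5th columns (mpg, cyl, disp, hp)
mtcars.select(mtcars.columns[1:5]).show(5)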
In [8]:
import re

# keep only columns whose names start with 'd' (disp, drat)
pattern = re.compile(r'^d')
selected_columns = [x for x in mtcars.columns if pattern.match(x)]
selected_columns
Out[8]:
['disp', 'drat']
In [9]:
mtcars.select(selected_columns).show(5)
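Spark 2.3+ also provides DataFrame.colRegex, which does the same name matching inside Spark instead of filtering the column list in Python. A sketch assuming the same starts-with-d pattern; note that colRegex expects the regex wrapped in backticks:

# select every column whose name matches the regex d.*
mtcars.select(mtcars.colRegex('`d.*`')).show(5)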